1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines a DAG pattern matching instruction selector for X86,
10 // converting from a legalized dag to an X86 dag.
12 //===----------------------------------------------------------------------===//
14 #include "X86.h"
15 #include "X86MachineFunctionInfo.h"
16 #include "X86RegisterInfo.h"
17 #include "X86Subtarget.h"
18 #include "X86TargetMachine.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/CodeGen/MachineFrameInfo.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/SelectionDAGISel.h"
23 #include "llvm/Config/llvm-config.h"
24 #include "llvm/IR/ConstantRange.h"
25 #include "llvm/IR/Function.h"
26 #include "llvm/IR/Instructions.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/ErrorHandling.h"
31 #include "llvm/Support/KnownBits.h"
32 #include "llvm/Support/MathExtras.h"
33 #include "llvm/Support/raw_ostream.h"
34 #include "llvm/Target/TargetMachine.h"
35 #include "llvm/Target/TargetOptions.h"
36 #include <stdint.h>
37 using namespace llvm;
39 #define DEBUG_TYPE "x86-isel"
41 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
43 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45 cl::Hidden);
47 //===----------------------------------------------------------------------===//
48 // Pattern Matcher Implementation
49 //===----------------------------------------------------------------------===//
51 namespace {
52 /// This corresponds to X86AddressMode, but uses SDValues instead of register
53 /// numbers for the leaves of the matched tree.
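/// The matched form corresponds to the general x86 memory operand
/// Segment:[Base + Index*Scale + Disp], where any of the pieces may be
/// absent; e.g. a plain (add %reg, 8) match fills in only Base_Reg and Disp.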
54 struct X86ISelAddressMode {
55 enum {
56 RegBase,
57 FrameIndexBase
58 } BaseType;
60 // This is really a union, discriminated by BaseType!
61 SDValue Base_Reg;
62 int Base_FrameIndex;
64 unsigned Scale;
65 SDValue IndexReg;
66 int32_t Disp;
67 SDValue Segment;
68 const GlobalValue *GV;
69 const Constant *CP;
70 const BlockAddress *BlockAddr;
71 const char *ES;
72 MCSymbol *MCSym;
73 int JT;
74 unsigned Align; // CP alignment.
75 unsigned char SymbolFlags; // X86II::MO_*
76 bool NegateIndex = false;
78 X86ISelAddressMode()
79 : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
80 Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
81 MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
83 bool hasSymbolicDisplacement() const {
84 return GV != nullptr || CP != nullptr || ES != nullptr ||
85 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
88 bool hasBaseOrIndexReg() const {
89 return BaseType == FrameIndexBase ||
90 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
93 /// Return true if this addressing mode is already RIP-relative.
94 bool isRIPRelative() const {
95 if (BaseType != RegBase) return false;
96 if (RegisterSDNode *RegNode =
97 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
98 return RegNode->getReg() == X86::RIP;
99 return false;
102 void setBaseReg(SDValue Reg) {
103 BaseType = RegBase;
104 Base_Reg = Reg;
107 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
108 void dump(SelectionDAG *DAG = nullptr) {
109 dbgs() << "X86ISelAddressMode " << this << '\n';
110 dbgs() << "Base_Reg ";
111 if (Base_Reg.getNode())
112 Base_Reg.getNode()->dump(DAG);
113 else
114 dbgs() << "nul\n";
115 if (BaseType == FrameIndexBase)
116 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
117 dbgs() << " Scale " << Scale << '\n'
118 << "IndexReg ";
119 if (NegateIndex)
120 dbgs() << "negate ";
121 if (IndexReg.getNode())
122 IndexReg.getNode()->dump(DAG);
123 else
124 dbgs() << "nul\n";
125 dbgs() << " Disp " << Disp << '\n'
126 << "GV ";
127 if (GV)
128 GV->dump();
129 else
130 dbgs() << "nul";
131 dbgs() << " CP ";
132 if (CP)
133 CP->dump();
134 else
135 dbgs() << "nul";
136 dbgs() << '\n'
137 << "ES ";
138 if (ES)
139 dbgs() << ES;
140 else
141 dbgs() << "nul";
142 dbgs() << " MCSym ";
143 if (MCSym)
144 dbgs() << MCSym;
145 else
146 dbgs() << "nul";
147 dbgs() << " JT" << JT << " Align" << Align << '\n';
149 #endif
153 namespace {
154 //===--------------------------------------------------------------------===//
155 /// ISel - X86-specific code to select X86 machine instructions for
156 /// SelectionDAG operations.
158 class X86DAGToDAGISel final : public SelectionDAGISel {
159 /// Keep a pointer to the X86Subtarget around so that we can
160 /// make the right decision when generating code for different targets.
161 const X86Subtarget *Subtarget;
163 /// If true, selector should try to optimize for code size instead of
164 /// performance.
165 bool OptForSize;
167 /// If true, selector should try to optimize for minimum code size.
168 bool OptForMinSize;
170 /// Disable direct TLS access through segment registers.
171 bool IndirectTlsSegRefs;
173 public:
174 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
175 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
176 OptForMinSize(false), IndirectTlsSegRefs(false) {}
178 StringRef getPassName() const override {
179 return "X86 DAG->DAG Instruction Selection";
182 bool runOnMachineFunction(MachineFunction &MF) override {
183 // Reset the subtarget each time through.
184 Subtarget = &MF.getSubtarget<X86Subtarget>();
185 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
186 "indirect-tls-seg-refs");
188 // OptFor[Min]Size are used in pattern predicates that isel is matching.
189 OptForSize = MF.getFunction().hasOptSize();
190 OptForMinSize = MF.getFunction().hasMinSize();
191 assert((!OptForMinSize || OptForSize) &&
192 "OptForMinSize implies OptForSize");
194 SelectionDAGISel::runOnMachineFunction(MF);
195 return true;
198 void EmitFunctionEntryCode() override;
200 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
202 void PreprocessISelDAG() override;
203 void PostprocessISelDAG() override;
205 // Include the pieces autogenerated from the target description.
206 #include "X86GenDAGISel.inc"
208 private:
209 void Select(SDNode *N) override;
211 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
212 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
213 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
214 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
215 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
216 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
217 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
218 unsigned Depth);
219 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
220 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
221 SDValue &Scale, SDValue &Index, SDValue &Disp,
222 SDValue &Segment);
223 bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
224 SDValue &Scale, SDValue &Index, SDValue &Disp,
225 SDValue &Segment);
226 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
227 bool selectLEAAddr(SDValue N, SDValue &Base,
228 SDValue &Scale, SDValue &Index, SDValue &Disp,
229 SDValue &Segment);
230 bool selectLEA64_32Addr(SDValue N, SDValue &Base,
231 SDValue &Scale, SDValue &Index, SDValue &Disp,
232 SDValue &Segment);
233 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
234 SDValue &Scale, SDValue &Index, SDValue &Disp,
235 SDValue &Segment);
236 bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
237 SDValue &Base, SDValue &Scale,
238 SDValue &Index, SDValue &Disp,
239 SDValue &Segment,
240 SDValue &NodeWithChain);
241 bool selectRelocImm(SDValue N, SDValue &Op);
243 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
244 SDValue &Base, SDValue &Scale,
245 SDValue &Index, SDValue &Disp,
246 SDValue &Segment);
248 // Convenience method where P is also root.
249 bool tryFoldLoad(SDNode *P, SDValue N,
250 SDValue &Base, SDValue &Scale,
251 SDValue &Index, SDValue &Disp,
252 SDValue &Segment) {
253 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
256 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
257 SDValue &Base, SDValue &Scale,
258 SDValue &Index, SDValue &Disp,
259 SDValue &Segment);
261 /// Implement addressing mode selection for inline asm expressions.
262 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
263 unsigned ConstraintID,
264 std::vector<SDValue> &OutOps) override;
266 void emitSpecialCodeForMain();
268 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
269 MVT VT, SDValue &Base, SDValue &Scale,
270 SDValue &Index, SDValue &Disp,
271 SDValue &Segment) {
272 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
273 Base = CurDAG->getTargetFrameIndex(
274 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
275 else if (AM.Base_Reg.getNode())
276 Base = AM.Base_Reg;
277 else
278 Base = CurDAG->getRegister(0, VT);
280 Scale = getI8Imm(AM.Scale, DL);
282 // Negate the index if needed.
283 if (AM.NegateIndex) {
284 unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
285 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
286 AM.IndexReg), 0);
287 AM.IndexReg = Neg;
290 if (AM.IndexReg.getNode())
291 Index = AM.IndexReg;
292 else
293 Index = CurDAG->getRegister(0, VT);
295 // These are 32-bit even in 64-bit mode since RIP-relative offset
296 // is 32-bit.
297 if (AM.GV)
298 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
299 MVT::i32, AM.Disp,
300 AM.SymbolFlags);
301 else if (AM.CP)
302 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
303 AM.Align, AM.Disp, AM.SymbolFlags);
304 else if (AM.ES) {
305 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
306 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
307 } else if (AM.MCSym) {
308 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
309 assert(AM.SymbolFlags == 0 && "oo");
310 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
311 } else if (AM.JT != -1) {
312 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
313 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
314 } else if (AM.BlockAddr)
315 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
316 AM.SymbolFlags);
317 else
318 Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
320 if (AM.Segment.getNode())
321 Segment = AM.Segment;
322 else
323 Segment = CurDAG->getRegister(0, MVT::i16);
326 // Utility function to determine whether we should avoid selecting
327 // immediate forms of instructions for better code size or not.
328 // At a high level, we'd like to avoid such instructions when
329 // we have similar constants used within the same basic block
330 // that can be kept in a register.
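// For example, with OptForSize, two uses of the same 32-bit constant such as
//   andl $0x12345678, %edx
//   andl $0x12345678, %esi
// are smaller if the constant is hoisted into a register once and reused,
// rather than repeating the 4-byte immediate in every instruction:
//   movl $0x12345678, %ecx
//   andl %ecx, %edx
//   andl %ecx, %esi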
332 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
333 uint32_t UseCount = 0;
335 // Do not want to hoist if we're not optimizing for size.
336 // TODO: We'd like to remove this restriction.
337 // See the comment in X86InstrInfo.td for more info.
338 if (!OptForSize)
339 return false;
341 // Walk all the users of the immediate.
342 for (SDNode::use_iterator UI = N->use_begin(),
343 UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
345 SDNode *User = *UI;
347 // This user is already selected. Count it as a legitimate use and
348 // move on.
349 if (User->isMachineOpcode()) {
350 UseCount++;
351 continue;
354 // We want to count stores of immediates as real uses.
355 if (User->getOpcode() == ISD::STORE &&
356 User->getOperand(1).getNode() == N) {
357 UseCount++;
358 continue;
361 // We don't currently match users that have > 2 operands (except
362 // for stores, which are handled above)
363 // Those instructions won't match in ISEL, for now, and would
364 // be counted incorrectly.
365 // This may change in the future as we add additional instruction
366 // types.
367 if (User->getNumOperands() != 2)
368 continue;
370 // If this can match to INC/DEC, don't count it as a use.
371 if (User->getOpcode() == ISD::ADD &&
372 (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0))))
373 continue;
375 // Immediates that are used for offsets as part of stack
376 // manipulation should be left alone. These are typically
377 // used to indicate SP offsets for argument passing and
378 // will get pulled into stores/pushes (implicitly).
379 if (User->getOpcode() == X86ISD::ADD ||
380 User->getOpcode() == ISD::ADD ||
381 User->getOpcode() == X86ISD::SUB ||
382 User->getOpcode() == ISD::SUB) {
384 // Find the other operand of the add/sub.
385 SDValue OtherOp = User->getOperand(0);
386 if (OtherOp.getNode() == N)
387 OtherOp = User->getOperand(1);
389 // Don't count if the other operand is SP.
390 RegisterSDNode *RegNode;
391 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
392 (RegNode = dyn_cast_or_null<RegisterSDNode>(
393 OtherOp->getOperand(1).getNode())))
394 if ((RegNode->getReg() == X86::ESP) ||
395 (RegNode->getReg() == X86::RSP))
396 continue;
399 // ... otherwise, count this and move on.
400 UseCount++;
403 // If we have more than 1 use, then recommend for hoisting.
404 return (UseCount > 1);
407 /// Return a target constant with the specified value of type i8.
408 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
409 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
412 /// Return a target constant with the specified value, of type i32.
413 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
414 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
417 /// Return a target constant with the specified value, of type i64.
418 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
419 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
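// The VEXTRACT*/VINSERT* immediates computed below count in units of the
// sub-vector width rather than in elements; e.g. extracting element index 4
// of a v8i32 with VecWidth == 128 yields imm8 = 4 * 32 / 128 = 1 (the upper
// 128-bit half).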
422 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
423 const SDLoc &DL) {
424 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
425 uint64_t Index = N->getConstantOperandVal(1);
426 MVT VecVT = N->getOperand(0).getSimpleValueType();
427 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
430 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
431 const SDLoc &DL) {
432 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
433 uint64_t Index = N->getConstantOperandVal(2);
434 MVT VecVT = N->getSimpleValueType(0);
435 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
438 // Helper to detect unneeded AND instructions on shift amounts. Called
439 // from PatFrags in tablegen.
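// e.g. with Width == 5 (a 32-bit shift), (and Y, 31) or (and Y, 255) is
// unneeded because hardware only consumes the low 5 bits of the amount; the
// second check below also catches masks whose missing low bits are already
// known to be zero in the shift amount.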
440 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
441 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
442 const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
444 if (Val.countTrailingOnes() >= Width)
445 return true;
447 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
448 return Mask.countTrailingOnes() >= Width;
451 /// Return an SDNode that returns the value of the global base register.
452 /// Output instructions required to initialize the global base register,
453 /// if necessary.
454 SDNode *getGlobalBaseReg();
456 /// Return a reference to the TargetMachine, casted to the target-specific
457 /// type.
458 const X86TargetMachine &getTargetMachine() const {
459 return static_cast<const X86TargetMachine &>(TM);
462 /// Return a reference to the TargetInstrInfo, casted to the target-specific
463 /// type.
464 const X86InstrInfo *getInstrInfo() const {
465 return Subtarget->getInstrInfo();
468 /// Address-mode matching performs shift-of-and to and-of-shift
469 /// reassociation in order to expose more scaled addressing
470 /// opportunities.
471 bool ComplexPatternFuncMutatesDAG() const override {
472 return true;
475 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
477 /// Returns whether this is a relocatable immediate in the range
478 /// [-2^Width .. 2^Width-1].
479 template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
480 if (auto *CN = dyn_cast<ConstantSDNode>(N))
481 return isInt<Width>(CN->getSExtValue());
482 return isSExtAbsoluteSymbolRef(Width, N);
485 // Indicates we should prefer to use a non-temporal load for this load.
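// (Non-temporal vector loads map to MOVNTDQA/VMOVNTDQA, which require an
// aligned 16/32/64-byte access and SSE4.1/AVX2/AVX512 respectively; there is
// no non-temporal form for plain 4/8-byte scalar loads.)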
486 bool useNonTemporalLoad(LoadSDNode *N) const {
487 if (!N->isNonTemporal())
488 return false;
490 unsigned StoreSize = N->getMemoryVT().getStoreSize();
492 if (N->getAlignment() < StoreSize)
493 return false;
495 switch (StoreSize) {
496 default: llvm_unreachable("Unsupported store size");
497 case 4:
498 case 8:
499 return false;
500 case 16:
501 return Subtarget->hasSSE41();
502 case 32:
503 return Subtarget->hasAVX2();
504 case 64:
505 return Subtarget->hasAVX512();
509 bool foldLoadStoreIntoMemOperand(SDNode *Node);
510 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
511 bool matchBitExtract(SDNode *Node);
512 bool shrinkAndImmediate(SDNode *N);
513 bool isMaskZeroExtended(SDNode *N) const;
514 bool tryShiftAmountMod(SDNode *N);
515 bool combineIncDecVector(SDNode *Node);
516 bool tryShrinkShlLogicImm(SDNode *N);
517 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
518 bool tryMatchBitSelect(SDNode *N);
520 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
521 const SDLoc &dl, MVT VT, SDNode *Node);
522 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
523 const SDLoc &dl, MVT VT, SDNode *Node,
524 SDValue &InFlag);
526 bool tryOptimizeRem8Extend(SDNode *N);
528 bool onlyUsesZeroFlag(SDValue Flags) const;
529 bool hasNoSignFlagUses(SDValue Flags) const;
530 bool hasNoCarryFlagUses(SDValue Flags) const;
535 // Returns true if this masked compare can be implemented legally with this
536 // type.
537 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
538 unsigned Opcode = N->getOpcode();
539 if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
540 Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) {
541 // We can get 256-bit 8 element types here without VLX being enabled. When
542 // this happens we will use 512-bit operations and the mask will not be
543 // zero extended.
544 EVT OpVT = N->getOperand(0).getValueType();
545 if (OpVT.is256BitVector() || OpVT.is128BitVector())
546 return Subtarget->hasVLX();
548 return true;
550 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
551 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
552 Opcode == X86ISD::FSETCCM_SAE)
553 return true;
555 return false;
558 // Returns true if we can assume the writer of the mask has zero extended it
559 // for us.
560 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
561 // If this is an AND, check if we have a compare on either side. As long as
562 // one side guarantees the mask is zero extended, the AND will preserve those
563 // zeros.
564 if (N->getOpcode() == ISD::AND)
565 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
566 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
568 return isLegalMaskCompare(N, Subtarget);
571 bool
572 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
573 if (OptLevel == CodeGenOpt::None) return false;
575 if (!N.hasOneUse())
576 return false;
578 if (N.getOpcode() != ISD::LOAD)
579 return true;
581 // Don't fold non-temporal loads if we have an instruction for them.
582 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
583 return false;
585 // If N is a load, do additional profitability checks.
586 if (U == Root) {
587 switch (U->getOpcode()) {
588 default: break;
589 case X86ISD::ADD:
590 case X86ISD::ADC:
591 case X86ISD::SUB:
592 case X86ISD::SBB:
593 case X86ISD::AND:
594 case X86ISD::XOR:
595 case X86ISD::OR:
596 case ISD::ADD:
597 case ISD::ADDCARRY:
598 case ISD::AND:
599 case ISD::OR:
600 case ISD::XOR: {
601 SDValue Op1 = U->getOperand(1);
603 // If the other operand is an 8-bit immediate we should fold the immediate
604 // instead. This reduces code size.
605 // e.g.
606 // movl 4(%esp), %eax
607 // addl $4, %eax
608 // vs.
609 // movl $4, %eax
610 // addl 4(%esp), %eax
611 // The former is 2 bytes shorter. In the case where the increment is 1, the
612 // saving can be 4 bytes (by using incl %eax).
613 if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
614 if (Imm->getAPIntValue().isSignedIntN(8))
615 return false;
617 // If this is a 64-bit AND with an immediate that fits in 32-bits,
618 // prefer using the smaller and over folding the load. This is needed to
619 // make sure immediates created by shrinkAndImmediate are always folded.
620 // Ideally we would narrow the load during DAG combine and get the
621 // best of both worlds.
622 if (U->getOpcode() == ISD::AND &&
623 Imm->getAPIntValue().getBitWidth() == 64 &&
624 Imm->getAPIntValue().isIntN(32))
625 return false;
627 // If this is really a zext_inreg that can be represented with a movzx
628 // instruction, prefer that.
629 // TODO: We could shrink the load and fold if it is non-volatile.
630 if (U->getOpcode() == ISD::AND &&
631 (Imm->getAPIntValue() == UINT8_MAX ||
632 Imm->getAPIntValue() == UINT16_MAX ||
633 Imm->getAPIntValue() == UINT32_MAX))
634 return false;
636 // ADD/SUB can negate the immediate and use the opposite operation
637 // to fit 128 into a sign-extended 8-bit immediate.
638 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
639 (-Imm->getAPIntValue()).isSignedIntN(8))
640 return false;
643 // If the other operand is a TLS address, we should fold it instead.
644 // This produces
645 // movl %gs:0, %eax
646 // leal i@NTPOFF(%eax), %eax
647 // instead of
648 // movl $i@NTPOFF, %eax
649 // addl %gs:0, %eax
650 // if the block also has an access to a second TLS address this will save
651 // a load.
652 // FIXME: This is probably also true for non-TLS addresses.
653 if (Op1.getOpcode() == X86ISD::Wrapper) {
654 SDValue Val = Op1.getOperand(0);
655 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
656 return false;
659 // Don't fold load if this matches the BTS/BTR/BTC patterns.
660 // BTS: (or X, (shl 1, n))
661 // BTR: (and X, (rotl -2, n))
662 // BTC: (xor X, (shl 1, n))
663 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
664 if (U->getOperand(0).getOpcode() == ISD::SHL &&
665 isOneConstant(U->getOperand(0).getOperand(0)))
666 return false;
668 if (U->getOperand(1).getOpcode() == ISD::SHL &&
669 isOneConstant(U->getOperand(1).getOperand(0)))
670 return false;
672 if (U->getOpcode() == ISD::AND) {
673 SDValue U0 = U->getOperand(0);
674 SDValue U1 = U->getOperand(1);
675 if (U0.getOpcode() == ISD::ROTL) {
676 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
677 if (C && C->getSExtValue() == -2)
678 return false;
681 if (U1.getOpcode() == ISD::ROTL) {
682 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
683 if (C && C->getSExtValue() == -2)
684 return false;
688 break;
690 case ISD::SHL:
691 case ISD::SRA:
692 case ISD::SRL:
693 // Don't fold a load into a shift by immediate. The BMI2 instructions
694 // support folding a load, but not an immediate. The legacy instructions
695 // support folding an immediate, but can't fold a load. Folding an
696 // immediate is preferable to folding a load.
697 if (isa<ConstantSDNode>(U->getOperand(1)))
698 return false;
700 break;
704 // Prevent folding a load if this can be implemented with an insert_subreg or
705 // a move that implicitly zeroes.
706 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
707 isNullConstant(Root->getOperand(2)) &&
708 (Root->getOperand(0).isUndef() ||
709 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
710 return false;
712 return true;
715 /// Replace the original chain operand of the call with
716 /// load's chain operand and move load below the call's chain operand.
717 static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
718 SDValue Call, SDValue OrigChain) {
719 SmallVector<SDValue, 8> Ops;
720 SDValue Chain = OrigChain.getOperand(0);
721 if (Chain.getNode() == Load.getNode())
722 Ops.push_back(Load.getOperand(0));
723 else {
724 assert(Chain.getOpcode() == ISD::TokenFactor &&
725 "Unexpected chain operand");
726 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
727 if (Chain.getOperand(i).getNode() == Load.getNode())
728 Ops.push_back(Load.getOperand(0));
729 else
730 Ops.push_back(Chain.getOperand(i));
731 SDValue NewChain =
732 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
733 Ops.clear();
734 Ops.push_back(NewChain);
736 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
737 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
738 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
739 Load.getOperand(1), Load.getOperand(2));
741 Ops.clear();
742 Ops.push_back(SDValue(Load.getNode(), 1));
743 Ops.append(Call->op_begin() + 1, Call->op_end());
744 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
747 /// Return true if call address is a load and it can be
748 /// moved below CALLSEQ_START and the chains leading up to the call.
749 /// Return the CALLSEQ_START by reference as a second output.
750 /// In the case of a tail call, there isn't a callseq node between the call
751 /// chain and the load.
752 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
753 // The transformation is somewhat dangerous if the call's chain was glued to
754 // the call. After MoveBelowOrigChain the load is moved between the call and
755 // the chain, this can create a cycle if the load is not folded. So it is
756 // *really* important that we are sure the load will be folded.
757 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
758 return false;
759 LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
760 if (!LD ||
761 !LD->isSimple() ||
762 LD->getAddressingMode() != ISD::UNINDEXED ||
763 LD->getExtensionType() != ISD::NON_EXTLOAD)
764 return false;
766 // Now let's find the callseq_start.
767 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
768 if (!Chain.hasOneUse())
769 return false;
770 Chain = Chain.getOperand(0);
773 if (!Chain.getNumOperands())
774 return false;
775 // Since we are not checking for AA here, conservatively abort if the chain
776 // writes to memory. It's not safe to move the callee (a load) across a store.
777 if (isa<MemSDNode>(Chain.getNode()) &&
778 cast<MemSDNode>(Chain.getNode())->writeMem())
779 return false;
780 if (Chain.getOperand(0).getNode() == Callee.getNode())
781 return true;
782 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
783 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
784 Callee.getValue(1).hasOneUse())
785 return true;
786 return false;
789 void X86DAGToDAGISel::PreprocessISelDAG() {
790 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
791 E = CurDAG->allnodes_end(); I != E; ) {
792 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
794 // If this is a target specific AND node with no flag usages, turn it back
795 // into ISD::AND to enable test instruction matching.
796 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
797 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
798 N->getOperand(0), N->getOperand(1));
799 --I;
800 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
801 ++I;
802 CurDAG->DeleteNode(N);
803 continue;
806 switch (N->getOpcode()) {
807 case ISD::FP_TO_SINT:
808 case ISD::FP_TO_UINT: {
809 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
810 // don't need 2 sets of patterns.
811 if (!N->getSimpleValueType(0).isVector())
812 break;
814 unsigned NewOpc;
815 switch (N->getOpcode()) {
816 default: llvm_unreachable("Unexpected opcode!");
817 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
818 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
820 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
821 N->getOperand(0));
822 --I;
823 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
824 ++I;
825 CurDAG->DeleteNode(N);
826 continue;
828 case ISD::SHL:
829 case ISD::SRA:
830 case ISD::SRL: {
831 // Replace vector shifts with their X86 specific equivalent so we don't
832 // need 2 sets of patterns.
833 if (!N->getValueType(0).isVector())
834 break;
836 unsigned NewOpc;
837 switch (N->getOpcode()) {
838 default: llvm_unreachable("Unexpected opcode!");
839 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
840 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
841 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
843 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
844 N->getOperand(0), N->getOperand(1));
845 --I;
846 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
847 ++I;
848 CurDAG->DeleteNode(N);
849 continue;
851 case ISD::ANY_EXTEND:
852 case ISD::ANY_EXTEND_VECTOR_INREG: {
853 // Replace vector any extend with the zero extend equivalents so we don't
854 // need 2 sets of patterns. Ignore vXi1 extensions.
855 if (!N->getValueType(0).isVector() ||
856 N->getOperand(0).getScalarValueSizeInBits() == 1)
857 break;
859 unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
860 ? ISD::ZERO_EXTEND
861 : ISD::ZERO_EXTEND_VECTOR_INREG;
863 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
864 N->getOperand(0));
865 --I;
866 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
867 ++I;
868 CurDAG->DeleteNode(N);
869 continue;
871 case ISD::FCEIL:
872 case ISD::FFLOOR:
873 case ISD::FTRUNC:
874 case ISD::FNEARBYINT:
875 case ISD::FRINT: {
876 // Replace fp rounding with their X86 specific equivalent so we don't
877 // need 2 sets of patterns.
878 unsigned Imm;
879 switch (N->getOpcode()) {
880 default: llvm_unreachable("Unexpected opcode!");
881 case ISD::FCEIL: Imm = 0xA; break;
882 case ISD::FFLOOR: Imm = 0x9; break;
883 case ISD::FTRUNC: Imm = 0xB; break;
884 case ISD::FNEARBYINT: Imm = 0xC; break;
885 case ISD::FRINT: Imm = 0x4; break;
887 SDLoc dl(N);
888 SDValue Res = CurDAG->getNode(
889 X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0),
890 CurDAG->getTargetConstant(Imm, dl, MVT::i8));
891 --I;
892 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
893 ++I;
894 CurDAG->DeleteNode(N);
895 continue;
897 case X86ISD::FANDN:
898 case X86ISD::FAND:
899 case X86ISD::FOR:
900 case X86ISD::FXOR: {
901 // Widen scalar fp logic ops to vector to reduce isel patterns.
902 // FIXME: Can we do this during lowering/combine?
903 MVT VT = N->getSimpleValueType(0);
904 if (VT.isVector() || VT == MVT::f128)
905 break;
907 MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
908 SDLoc dl(N);
909 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
910 N->getOperand(0));
911 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
912 N->getOperand(1));
914 SDValue Res;
915 if (Subtarget->hasSSE2()) {
916 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
917 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
918 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
919 unsigned Opc;
920 switch (N->getOpcode()) {
921 default: llvm_unreachable("Unexpected opcode!");
922 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
923 case X86ISD::FAND: Opc = ISD::AND; break;
924 case X86ISD::FOR: Opc = ISD::OR; break;
925 case X86ISD::FXOR: Opc = ISD::XOR; break;
927 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
928 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
929 } else {
930 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
932 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
933 CurDAG->getIntPtrConstant(0, dl));
934 --I;
935 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
936 ++I;
937 CurDAG->DeleteNode(N);
938 continue;
942 if (OptLevel != CodeGenOpt::None &&
943 // Only do this when the target can fold the load into the call or
944 // jmp.
945 !Subtarget->useRetpolineIndirectCalls() &&
946 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
947 (N->getOpcode() == X86ISD::TC_RETURN &&
948 (Subtarget->is64Bit() ||
949 !getTargetMachine().isPositionIndependent())))) {
950 /// Also try moving call address load from outside callseq_start to just
951 /// before the call to allow it to be folded.
953 /// [Load chain]
954 /// ^
955 /// |
956 /// [Load]
957 /// ^ ^
958 /// | |
959 /// / \--
960 /// / |
961 ///[CALLSEQ_START] |
962 /// ^ |
963 /// | |
964 /// [LOAD/C2Reg] |
965 /// | |
966 /// \ /
967 /// \ /
968 /// [CALL]
969 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
970 SDValue Chain = N->getOperand(0);
971 SDValue Load = N->getOperand(1);
972 if (!isCalleeLoad(Load, Chain, HasCallSeq))
973 continue;
974 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
975 ++NumLoadMoved;
976 continue;
979 // Lower fpround and fpextend nodes that target the FP stack to be a store and
980 // load to the stack. This is a gross hack. We would like to simply mark
981 // these as being illegal, but when we do that, legalize produces these when
982 // it expands calls, then expands these in the same legalize pass. We would
983 // like dag combine to be able to hack on these between the call expansion
984 // and the node legalization. As such this pass basically does "really
985 // late" legalization of these inline with the X86 isel pass.
986 // FIXME: This should only happen when not compiled with -O0.
987 switch (N->getOpcode()) {
988 default: continue;
989 case ISD::FP_ROUND:
990 case ISD::FP_EXTEND:
992 MVT SrcVT = N->getOperand(0).getSimpleValueType();
993 MVT DstVT = N->getSimpleValueType(0);
995 // If any of the sources are vectors, no fp stack involved.
996 if (SrcVT.isVector() || DstVT.isVector())
997 continue;
999 // If the source and destination are SSE registers, then this is a legal
1000 // conversion that should not be lowered.
1001 const X86TargetLowering *X86Lowering =
1002 static_cast<const X86TargetLowering *>(TLI);
1003 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1004 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1005 if (SrcIsSSE && DstIsSSE)
1006 continue;
1008 if (!SrcIsSSE && !DstIsSSE) {
1009 // If this is an FPStack extension, it is a noop.
1010 if (N->getOpcode() == ISD::FP_EXTEND)
1011 continue;
1012 // If this is a value-preserving FPStack truncation, it is a noop.
1013 if (N->getConstantOperandVal(1))
1014 continue;
1017 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1018 // FPStack has extload and truncstore. SSE can fold direct loads into other
1019 // operations. Based on this, decide what we want to do.
1020 MVT MemVT;
1021 if (N->getOpcode() == ISD::FP_ROUND)
1022 MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
1023 else
1024 MemVT = SrcIsSSE ? SrcVT : DstVT;
1026 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1027 SDLoc dl(N);
1029 // FIXME: optimize the case where the src/dest is a load or store?
1031 SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
1032 MemTmp, MachinePointerInfo(), MemVT);
1033 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
1034 MachinePointerInfo(), MemVT);
1036 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1037 // extload we created. This will cause general havoc on the dag because
1038 // anything below the conversion could be folded into other existing nodes.
1039 // To avoid invalidating 'I', back it up to the convert node.
1040 --I;
1041 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1042 break;
1045 // The sequence of events for lowering STRICT_FP versions of these nodes requires
1046 // dealing with the chain differently, as there is already a preexisting chain.
1047 case ISD::STRICT_FP_ROUND:
1048 case ISD::STRICT_FP_EXTEND:
1050 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1051 MVT DstVT = N->getSimpleValueType(0);
1053 // If any of the sources are vectors, no fp stack involved.
1054 if (SrcVT.isVector() || DstVT.isVector())
1055 continue;
1057 // If the source and destination are SSE registers, then this is a legal
1058 // conversion that should not be lowered.
1059 const X86TargetLowering *X86Lowering =
1060 static_cast<const X86TargetLowering *>(TLI);
1061 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1062 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1063 if (SrcIsSSE && DstIsSSE)
1064 continue;
1066 if (!SrcIsSSE && !DstIsSSE) {
1067 // If this is an FPStack extension, it is a noop.
1068 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1069 continue;
1070 // If this is a value-preserving FPStack truncation, it is a noop.
1071 if (N->getConstantOperandVal(2))
1072 continue;
1075 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1076 // FPStack has extload and truncstore. SSE can fold direct loads into other
1077 // operations. Based on this, decide what we want to do.
1078 MVT MemVT;
1079 if (N->getOpcode() == ISD::STRICT_FP_ROUND)
1080 MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
1081 else
1082 MemVT = SrcIsSSE ? SrcVT : DstVT;
1084 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1085 SDLoc dl(N);
1087 // FIXME: optimize the case where the src/dest is a load or store?
1089 // Since the operation is StrictFP, use the preexisting chain.
1090 SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
1091 MemTmp, MachinePointerInfo(), MemVT);
1092 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
1093 MachinePointerInfo(), MemVT);
1095 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1096 // extload we created. This will cause general havok on the dag because
1097 // anything below the conversion could be folded into other existing nodes.
1098 // To avoid invalidating 'I', back it up to the convert node.
1099 --I;
1100 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1101 break;
1106 // Now that we did that, the node is dead. Increment the iterator to the
1107 // next node to process, then delete N.
1108 ++I;
1109 CurDAG->DeleteNode(N);
1112 // The load+call transform above can leave some dead nodes in the graph. Make
1113 // sure we remove them. It's possible some of the other transforms do too, so
1114 // just remove dead nodes unconditionally.
1115 CurDAG->RemoveDeadNodes();
1118 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
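// i.e. a (MOVZX32rr8/MOVSX32rr8/MOVSX64rr8 (EXTRACT_SUBREG
// (MOV[SZ]X32rr8_NOREX x), sub_8bit)) chain: the outer extend is redundant
// and can reuse the inner one directly (via MOVSX64rr32 for the 8->64 case).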
1119 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1120 unsigned Opc = N->getMachineOpcode();
1121 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1122 Opc != X86::MOVSX64rr8)
1123 return false;
1125 SDValue N0 = N->getOperand(0);
1127 // We need to be extracting the low byte of an extend.
1128 if (!N0.isMachineOpcode() ||
1129 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1130 N0.getConstantOperandVal(1) != X86::sub_8bit)
1131 return false;
1133 // We're looking for either a movsx or movzx to match the original opcode.
1134 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1135 : X86::MOVSX32rr8_NOREX;
1136 SDValue N00 = N0.getOperand(0);
1137 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1138 return false;
1140 if (Opc == X86::MOVSX64rr8) {
1141 // If we had a sign extend from 8 to 64 bits, we still need to go from 32
1142 // to 64.
1143 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1144 MVT::i64, N00);
1145 ReplaceUses(N, Extend);
1146 } else {
1147 // Ok we can drop this extend and just use the original extend.
1148 ReplaceUses(N, N00.getNode());
1151 return true;
1154 void X86DAGToDAGISel::PostprocessISelDAG() {
1155 // Skip peepholes at -O0.
1156 if (TM.getOptLevel() == CodeGenOpt::None)
1157 return;
1159 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1161 bool MadeChange = false;
1162 while (Position != CurDAG->allnodes_begin()) {
1163 SDNode *N = &*--Position;
1164 // Skip dead nodes and any non-machine opcodes.
1165 if (N->use_empty() || !N->isMachineOpcode())
1166 continue;
1168 if (tryOptimizeRem8Extend(N)) {
1169 MadeChange = true;
1170 continue;
1173 // Look for a TESTrr+ANDrr pattern where both operands of the test are
1174 // the same. Rewrite to remove the AND.
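// i.e. (TESTrr (ANDrr a, b), (ANDrr a, b)) -> (TESTrr a, b), and similarly
// (TESTrr (ANDrm a, m), (ANDrm a, m)) -> (TESTmr m, a); TEST sets the same
// flags the AND would have, so the AND itself becomes dead.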
1175 unsigned Opc = N->getMachineOpcode();
1176 if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
1177 Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
1178 N->getOperand(0) == N->getOperand(1) &&
1179 N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1180 N->getOperand(0).isMachineOpcode()) {
1181 SDValue And = N->getOperand(0);
1182 unsigned N0Opc = And.getMachineOpcode();
1183 if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
1184 N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
1185 MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
1186 MVT::i32,
1187 And.getOperand(0),
1188 And.getOperand(1));
1189 ReplaceUses(N, Test);
1190 MadeChange = true;
1191 continue;
1193 if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
1194 N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
1195 unsigned NewOpc;
1196 switch (N0Opc) {
1197 case X86::AND8rm: NewOpc = X86::TEST8mr; break;
1198 case X86::AND16rm: NewOpc = X86::TEST16mr; break;
1199 case X86::AND32rm: NewOpc = X86::TEST32mr; break;
1200 case X86::AND64rm: NewOpc = X86::TEST64mr; break;
1203 // Need to swap the memory and register operand.
1204 SDValue Ops[] = { And.getOperand(1),
1205 And.getOperand(2),
1206 And.getOperand(3),
1207 And.getOperand(4),
1208 And.getOperand(5),
1209 And.getOperand(0),
1210 And.getOperand(6) /* Chain */ };
1211 MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1212 MVT::i32, MVT::Other, Ops);
1213 ReplaceUses(N, Test);
1214 MadeChange = true;
1215 continue;
1219 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1220 // used. We're doing this late so we can prefer to fold the AND into masked
1221 // comparisons. Doing that can be better for the live range of the mask
1222 // register.
1223 if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
1224 Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
1225 N->getOperand(0) == N->getOperand(1) &&
1226 N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1227 N->getOperand(0).isMachineOpcode() &&
1228 onlyUsesZeroFlag(SDValue(N, 0))) {
1229 SDValue And = N->getOperand(0);
1230 unsigned N0Opc = And.getMachineOpcode();
1231 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1232 // KAND instructions and KTEST use the same ISA feature.
1233 if (N0Opc == X86::KANDBrr ||
1234 (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
1235 N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
1236 unsigned NewOpc;
1237 switch (Opc) {
1238 default: llvm_unreachable("Unexpected opcode!");
1239 case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
1240 case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
1241 case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
1242 case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
1244 MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1245 MVT::i32,
1246 And.getOperand(0),
1247 And.getOperand(1));
1248 ReplaceUses(N, KTest);
1249 MadeChange = true;
1250 continue;
1254 // Attempt to remove vector moves that were inserted to zero upper bits.
1255 if (Opc != TargetOpcode::SUBREG_TO_REG)
1256 continue;
1258 unsigned SubRegIdx = N->getConstantOperandVal(2);
1259 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1260 continue;
1262 SDValue Move = N->getOperand(1);
1263 if (!Move.isMachineOpcode())
1264 continue;
1266 // Make sure it's one of the move opcodes we recognize.
1267 switch (Move.getMachineOpcode()) {
1268 default:
1269 continue;
1270 case X86::VMOVAPDrr: case X86::VMOVUPDrr:
1271 case X86::VMOVAPSrr: case X86::VMOVUPSrr:
1272 case X86::VMOVDQArr: case X86::VMOVDQUrr:
1273 case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
1274 case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
1275 case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
1276 case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
1277 case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
1278 case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
1279 case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
1280 case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
1281 case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
1282 case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
1283 case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
1284 break;
1287 SDValue In = Move.getOperand(0);
1288 if (!In.isMachineOpcode() ||
1289 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1290 continue;
1292 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1293 // the SHA instructions, which use a legacy encoding.
1294 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1295 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1296 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1297 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1298 continue;
1300 // The producing instruction is another vector instruction, so we can drop the
1301 // move.
1302 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1303 MadeChange = true;
1306 if (MadeChange)
1307 CurDAG->RemoveDeadNodes();
1311 /// Emit any code that needs to be executed only in the main function.
1312 void X86DAGToDAGISel::emitSpecialCodeForMain() {
1313 if (Subtarget->isTargetCygMing()) {
1314 TargetLowering::ArgListTy Args;
1315 auto &DL = CurDAG->getDataLayout();
1317 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1318 CLI.setChain(CurDAG->getRoot())
1319 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1320 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1321 std::move(Args));
1322 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1323 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1324 CurDAG->setRoot(Result.second);
1328 void X86DAGToDAGISel::EmitFunctionEntryCode() {
1329 // If this is main, emit special code for main.
1330 const Function &F = MF->getFunction();
1331 if (F.hasExternalLinkage() && F.getName() == "main")
1332 emitSpecialCodeForMain();
1335 static bool isDispSafeForFrameIndex(int64_t Val) {
1336 // On 64-bit platforms, we can run into an issue where a frame index
1337 // includes a displacement that, when added to the explicit displacement,
1338 // will overflow the displacement field. Assuming that the frame index
1339 // displacement fits into a 31-bit integer (which is only slightly more
1340 // aggressive than the current fundamental assumption that it fits into
1341 // a 32-bit integer), a 31-bit disp should always be safe.
1342 return isInt<31>(Val);
1345 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1346 X86ISelAddressMode &AM) {
1347 // If there's no offset to fold, we don't need to do any work.
1348 if (Offset == 0)
1349 return false;
1351 // Cannot combine ExternalSymbol displacements with integer offsets.
1352 if (AM.ES || AM.MCSym)
1353 return true;
1355 int64_t Val = AM.Disp + Offset;
1356 CodeModel::Model M = TM.getCodeModel();
1357 if (Subtarget->is64Bit()) {
1358 if (!X86::isOffsetSuitableForCodeModel(Val, M,
1359 AM.hasSymbolicDisplacement()))
1360 return true;
1361 // In addition to the checks required for a register base, check that
1362 // we do not try to use an unsafe Disp with a frame index.
1363 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1364 !isDispSafeForFrameIndex(Val))
1365 return true;
1367 AM.Disp = Val;
1368 return false;
1372 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
1373 SDValue Address = N->getOperand(1);
1375 // load gs:0 -> GS segment register.
1376 // load fs:0 -> FS segment register.
1378 // This optimization is valid because the GNU TLS model defines that
1379 // gs:0 (or fs:0 on X86-64) contains its own address.
1380 // For more information see http://people.redhat.com/drepper/tls.pdf
1381 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
1382 if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
1383 !IndirectTlsSegRefs &&
1384 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1385 Subtarget->isTargetFuchsia()))
1386 switch (N->getPointerInfo().getAddrSpace()) {
1387 case 256:
1388 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1389 return false;
1390 case 257:
1391 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1392 return false;
1393 // Address space 258 is not handled here, because it is not used to
1394 // address TLS areas.
1397 return true;
1400 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1401 /// mode. These wrap things that will resolve down into a symbol reference.
1402 /// If no match is possible, this returns true, otherwise it returns false.
1403 bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1404 // If the addressing mode already has a symbol as the displacement, we can
1405 // never match another symbol.
1406 if (AM.hasSymbolicDisplacement())
1407 return true;
1409 bool IsRIPRelTLS = false;
1410 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1411 if (IsRIPRel) {
1412 SDValue Val = N.getOperand(0);
1413 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1414 IsRIPRelTLS = true;
1417 // We can't use an addressing mode in the 64-bit large code model.
1418 // Global TLS addressing is an exception. In the medium code model,
1419 // we can use a mode when RIP wrappers are present.
1420 // That signifies access to globals that are known to be "near",
1421 // such as the GOT itself.
1422 CodeModel::Model M = TM.getCodeModel();
1423 if (Subtarget->is64Bit() &&
1424 ((M == CodeModel::Large && !IsRIPRelTLS) ||
1425 (M == CodeModel::Medium && !IsRIPRel)))
1426 return true;
1428 // Base and index reg must be 0 in order to use %rip as base.
1429 if (IsRIPRel && AM.hasBaseOrIndexReg())
1430 return true;
1432 // Make a local copy in case we can't do this fold.
1433 X86ISelAddressMode Backup = AM;
1435 int64_t Offset = 0;
1436 SDValue N0 = N.getOperand(0);
1437 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1438 AM.GV = G->getGlobal();
1439 AM.SymbolFlags = G->getTargetFlags();
1440 Offset = G->getOffset();
1441 } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1442 AM.CP = CP->getConstVal();
1443 AM.Align = CP->getAlignment();
1444 AM.SymbolFlags = CP->getTargetFlags();
1445 Offset = CP->getOffset();
1446 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1447 AM.ES = S->getSymbol();
1448 AM.SymbolFlags = S->getTargetFlags();
1449 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1450 AM.MCSym = S->getMCSymbol();
1451 } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
1452 AM.JT = J->getIndex();
1453 AM.SymbolFlags = J->getTargetFlags();
1454 } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1455 AM.BlockAddr = BA->getBlockAddress();
1456 AM.SymbolFlags = BA->getTargetFlags();
1457 Offset = BA->getOffset();
1458 } else
1459 llvm_unreachable("Unhandled symbol reference node.");
1461 if (foldOffsetIntoAddress(Offset, AM)) {
1462 AM = Backup;
1463 return true;
1466 if (IsRIPRel)
1467 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1469 // Commit the changes now that we know this fold is safe.
1470 return false;
1473 /// Add the specified node to the specified addressing mode, returning true if
1474 /// it cannot be done. This just pattern matches for the addressing mode.
1475 bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1476 if (matchAddressRecursively(N, AM, 0))
1477 return true;
1479 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1480 // a smaller encoding and avoids a scaled-index.
1481 if (AM.Scale == 2 &&
1482 AM.BaseType == X86ISelAddressMode::RegBase &&
1483 AM.Base_Reg.getNode() == nullptr) {
1484 AM.Base_Reg = AM.IndexReg;
1485 AM.Scale = 1;
1488 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1489 // because it has a smaller encoding.
1490 // TODO: Which other code models can use this?
1491 switch (TM.getCodeModel()) {
1492 default: break;
1493 case CodeModel::Small:
1494 case CodeModel::Kernel:
1495 if (Subtarget->is64Bit() &&
1496 AM.Scale == 1 &&
1497 AM.BaseType == X86ISelAddressMode::RegBase &&
1498 AM.Base_Reg.getNode() == nullptr &&
1499 AM.IndexReg.getNode() == nullptr &&
1500 AM.SymbolFlags == X86II::MO_NO_FLAG &&
1501 AM.hasSymbolicDisplacement())
1502 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1503 break;
1506 return false;
1509 bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1510 unsigned Depth) {
1511 // Add an artificial use to this node so that we can keep track of
1512 // it if it gets CSE'd with a different node.
1513 HandleSDNode Handle(N);
1515 X86ISelAddressMode Backup = AM;
1516 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1517 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1518 return false;
1519 AM = Backup;
1521 // Try again after commuting the operands.
1522 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
1523 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
1524 return false;
1525 AM = Backup;
1527 // If we couldn't fold both operands into the address at the same time,
1528 // see if we can just put each operand into a register and fold at least
1529 // the add.
1530 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1531 !AM.Base_Reg.getNode() &&
1532 !AM.IndexReg.getNode()) {
1533 N = Handle.getValue();
1534 AM.Base_Reg = N.getOperand(0);
1535 AM.IndexReg = N.getOperand(1);
1536 AM.Scale = 1;
1537 return false;
1539 N = Handle.getValue();
1540 return true;
1543 // Insert a node into the DAG at least before the Pos node's position. This
1544 // will reposition the node as needed, and will assign it a node ID that is <=
1545 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1546 // IDs! The selection DAG must no longer depend on their uniqueness when this
1547 // is used.
1548 static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
1549 if (N->getNodeId() == -1 ||
1550 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
1551 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
1552 DAG.RepositionNode(Pos->getIterator(), N.getNode());
1553 // Mark Node as invalid for pruning as after this it may be a successor to a
1554 // selected node but otherwise be in the same position of Pos.
1555 // Conservatively mark it with the same -abs(Id) to assure node id
1556 // invariant is preserved.
1557 N->setNodeId(Pos->getNodeId());
1558 SelectionDAGISel::InvalidateNodeId(N.getNode());
1562 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1563 // safe. This allows us to convert the shift and the AND into an h-register
1564 // extract and a scaled index. Returns false if the simplification is
1565 // performed.
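// e.g. with C1 == 2: (X >> 6) & 0x3FC becomes ((X >> 8) & 0xFF) << 2, where
// ((X >> 8) & 0xFF) is an h-register extract and the << 2 is folded into an
// index scale of 4.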
1566 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
1567 uint64_t Mask,
1568 SDValue Shift, SDValue X,
1569 X86ISelAddressMode &AM) {
1570 if (Shift.getOpcode() != ISD::SRL ||
1571 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1572 !Shift.hasOneUse())
1573 return true;
1575 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
1576 if (ScaleLog <= 0 || ScaleLog >= 4 ||
1577 Mask != (0xffu << ScaleLog))
1578 return true;
1580 MVT VT = N.getSimpleValueType();
1581 SDLoc DL(N);
1582 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
1583 SDValue NewMask = DAG.getConstant(0xff, DL, VT);
1584 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
1585 SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
1586 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
1587 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
1589 // Insert the new nodes into the topological ordering. We must do this in
1590 // a valid topological ordering as nothing is going to go back and re-sort
1591 // these nodes. We continually insert before 'N' in sequence as this is
1592 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1593 // hierarchy left to express.
1594 insertDAGNode(DAG, N, Eight);
1595 insertDAGNode(DAG, N, Srl);
1596 insertDAGNode(DAG, N, NewMask);
1597 insertDAGNode(DAG, N, And);
1598 insertDAGNode(DAG, N, ShlCount);
1599 insertDAGNode(DAG, N, Shl);
1600 DAG.ReplaceAllUsesWith(N, Shl);
1601 DAG.RemoveDeadNode(N.getNode());
1602 AM.IndexReg = And;
1603 AM.Scale = (1 << ScaleLog);
1604 return false;
1607 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
1608 // allows us to fold the shift into this addressing mode. Returns false if the
1609 // transform succeeded.
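// As a rough worked example: "(X << 2) & 0xfc" becomes "(X & 0x3f) << 2",
// so the new AND can serve as AM.IndexReg and the shl by 2 is absorbed into
// the addressing mode as AM.Scale = 4.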
1610 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
1611 X86ISelAddressMode &AM) {
1612 SDValue Shift = N.getOperand(0);
1614 // Use a signed mask so that shifting right will insert sign bits. These
1615 // bits will be removed when we shift the result left so it doesn't matter
1616 // what we use. This might allow a smaller immediate encoding.
1617 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
1619 // If we have an any_extend feeding the AND, look through it to see if there
1620 // is a shift behind it. But only if the AND doesn't use the extended bits.
1621 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
1622 bool FoundAnyExtend = false;
1623 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
1624 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
1625 isUInt<32>(Mask)) {
1626 FoundAnyExtend = true;
1627 Shift = Shift.getOperand(0);
1630 if (Shift.getOpcode() != ISD::SHL ||
1631 !isa<ConstantSDNode>(Shift.getOperand(1)))
1632 return true;
1634 SDValue X = Shift.getOperand(0);
1636 // Not likely to be profitable if either the AND or SHIFT node has more
1637 // than one use (unless all uses are for address computation). Besides,
1638 // isel mechanism requires their node ids to be reused.
1639 if (!N.hasOneUse() || !Shift.hasOneUse())
1640 return true;
1642 // Verify that the shift amount is something we can fold.
1643 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1644 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
1645 return true;
1647 MVT VT = N.getSimpleValueType();
1648 SDLoc DL(N);
1649 if (FoundAnyExtend) {
1650 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
1651 insertDAGNode(DAG, N, NewX);
1652 X = NewX;
1655 SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
1656 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
1657 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
1659 // Insert the new nodes into the topological ordering. We must do this in
1660 // a valid topological ordering as nothing is going to go back and re-sort
1661 // these nodes. We continually insert before 'N' in sequence as this is
1662 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1663 // hierarchy left to express.
1664 insertDAGNode(DAG, N, NewMask);
1665 insertDAGNode(DAG, N, NewAnd);
1666 insertDAGNode(DAG, N, NewShift);
1667 DAG.ReplaceAllUsesWith(N, NewShift);
1668 DAG.RemoveDeadNode(N.getNode());
1670 AM.Scale = 1 << ShiftAmt;
1671 AM.IndexReg = NewAnd;
1672 return false;
1675 // Implement some heroics to detect shifts of masked values where the mask can
1676 // be replaced by extending the shift and undoing that in the addressing mode
1677 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
1678 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
1679 // the addressing mode. This results in code such as:
1681 // int f(short *y, int *lookup_table) {
1682 // ...
1683 // return *y + lookup_table[*y >> 11];
1684 // }
1686 // Turning into:
1687 // movzwl (%rdi), %eax
1688 // movl %eax, %ecx
1689 // shrl $11, %ecx
1690 // addl (%rsi,%rcx,4), %eax
1692 // Instead of:
1693 // movzwl (%rdi), %eax
1694 // movl %eax, %ecx
1695 // shrl $9, %ecx
1696 // andl $124, %ecx
1697 // addl (%rsi,%rcx), %eax
1699 // Note that this function assumes the mask is provided as a mask *after* the
1700 // value is shifted. The input chain may or may not match that, but computing
1701 // such a mask is trivial.
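// As a concrete sketch of the node-level rewrite (values illustrative):
// "(and (srl x, 9), 0x7c)" has a post-shift mask with two trailing zeros,
// so it is rewritten to "(shl (srl x, 11), 2)"; the shl is then folded away
// as AM.Scale = 4 with AM.IndexReg = (srl x, 11), matching the first code
// sequence above.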
1702 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
1703 uint64_t Mask,
1704 SDValue Shift, SDValue X,
1705 X86ISelAddressMode &AM) {
1706 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
1707 !isa<ConstantSDNode>(Shift.getOperand(1)))
1708 return true;
1710 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1711 unsigned MaskLZ = countLeadingZeros(Mask);
1712 unsigned MaskTZ = countTrailingZeros(Mask);
1714 // The amount of shift we're trying to fit into the addressing mode is taken
1715 // from the trailing zeros of the mask.
1716 unsigned AMShiftAmt = MaskTZ;
1718 // There is nothing we can do here unless the mask is removing some bits.
1719 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1720 if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1722 // We also need to ensure that the mask is a contiguous run of bits.
1723 if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
1725 // Scale the leading zero count down based on the actual size of the value.
1726 // Also scale it down based on the size of the shift.
1727 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
1728 if (MaskLZ < ScaleDown)
1729 return true;
1730 MaskLZ -= ScaleDown;
1732 // The final check is to ensure that any masked out high bits of X are
1733 // already known to be zero. Otherwise, the mask has a semantic impact
1734 // other than masking out a couple of low bits. Unfortunately, because of
1735 // the mask, zero extensions will be removed from operands in some cases.
1736 // This code works extra hard to look through extensions because we can
1737 // replace them with zero extensions cheaply if necessary.
1738 bool ReplacingAnyExtend = false;
1739 if (X.getOpcode() == ISD::ANY_EXTEND) {
1740 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
1741 X.getOperand(0).getSimpleValueType().getSizeInBits();
1742 // Assume that we'll replace the any-extend with a zero-extend, and
1743 // narrow the search to the extended value.
1744 X = X.getOperand(0);
1745 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
1746 ReplacingAnyExtend = true;
1748 APInt MaskedHighBits =
1749 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
1750 KnownBits Known = DAG.computeKnownBits(X);
1751 if (MaskedHighBits != Known.Zero) return true;
1753 // We've identified a pattern that can be transformed into a single shift
1754 // and an addressing mode. Make it so.
1755 MVT VT = N.getSimpleValueType();
1756 if (ReplacingAnyExtend) {
1757 assert(X.getValueType() != VT);
1758 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
1759 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
1760 insertDAGNode(DAG, N, NewX);
1761 X = NewX;
1763 SDLoc DL(N);
1764 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1765 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1766 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1767 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
1769 // Insert the new nodes into the topological ordering. We must do this in
1770 // a valid topological ordering as nothing is going to go back and re-sort
1771 // these nodes. We continually insert before 'N' in sequence as this is
1772 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1773 // hierarchy left to express.
1774 insertDAGNode(DAG, N, NewSRLAmt);
1775 insertDAGNode(DAG, N, NewSRL);
1776 insertDAGNode(DAG, N, NewSHLAmt);
1777 insertDAGNode(DAG, N, NewSHL);
1778 DAG.ReplaceAllUsesWith(N, NewSHL);
1779 DAG.RemoveDeadNode(N.getNode());
1781 AM.Scale = 1 << AMShiftAmt;
1782 AM.IndexReg = NewSRL;
1783 return false;
1786 // Transform "(X >> SHIFT) & (MASK << C1)" to
1787 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
1788 // matched to a BEXTR later. Returns false if the simplification is performed.
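// As a rough worked example, take SHIFT == 5 and MASK == 0xff with C1 == 2:
// "(X >> 5) & 0x3fc" is rewritten to "((X >> 7) & 0xff) << 2"; the srl+and
// pair is a BEXTR candidate and the trailing shl of 2 is folded into the
// address as AM.Scale = 4.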
1789 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
1790 uint64_t Mask,
1791 SDValue Shift, SDValue X,
1792 X86ISelAddressMode &AM,
1793 const X86Subtarget &Subtarget) {
1794 if (Shift.getOpcode() != ISD::SRL ||
1795 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1796 !Shift.hasOneUse() || !N.hasOneUse())
1797 return true;
1799 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
1800 if (!Subtarget.hasTBM() &&
1801 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
1802 return true;
1804 // We need to ensure that the mask is a contiguous run of bits.
1805 if (!isShiftedMask_64(Mask)) return true;
1807 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1809 // The amount of shift we're trying to fit into the addressing mode is taken
1810 // from the trailing zeros of the mask.
1811 unsigned AMShiftAmt = countTrailingZeros(Mask);
1813 // There is nothing we can do here unless the mask is removing some bits.
1814 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1815 if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1817 MVT VT = N.getSimpleValueType();
1818 SDLoc DL(N);
1819 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1820 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1821 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
1822 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
1823 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1824 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
1826 // Insert the new nodes into the topological ordering. We must do this in
1827 // a valid topological ordering as nothing is going to go back and re-sort
1828 // these nodes. We continually insert before 'N' in sequence as this is
1829 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1830 // hierarchy left to express.
1831 insertDAGNode(DAG, N, NewSRLAmt);
1832 insertDAGNode(DAG, N, NewSRL);
1833 insertDAGNode(DAG, N, NewMask);
1834 insertDAGNode(DAG, N, NewAnd);
1835 insertDAGNode(DAG, N, NewSHLAmt);
1836 insertDAGNode(DAG, N, NewSHL);
1837 DAG.ReplaceAllUsesWith(N, NewSHL);
1838 DAG.RemoveDeadNode(N.getNode());
1840 AM.Scale = 1 << AMShiftAmt;
1841 AM.IndexReg = NewAnd;
1842 return false;
1845 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
1846 unsigned Depth) {
1847 SDLoc dl(N);
1848 LLVM_DEBUG({
1849 dbgs() << "MatchAddress: ";
1850 AM.dump(CurDAG);
1852 // Limit recursion.
1853 if (Depth > 5)
1854 return matchAddressBase(N, AM);
1856 // If this is already a %rip relative address, we can only merge immediates
1857 // into it. Instead of handling this in every case, we handle it here.
1858 // RIP relative addressing: %rip + 32-bit displacement!
1859 if (AM.isRIPRelative()) {
1860 // FIXME: JumpTable and ExternalSymbol address currently don't like
1861 // displacements. It isn't very important, but this should be fixed for
1862 // consistency.
1863 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
1864 return true;
1866 if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
1867 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
1868 return false;
1869 return true;
1872 switch (N.getOpcode()) {
1873 default: break;
1874 case ISD::LOCAL_RECOVER: {
1875 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
1876 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
1877 // Use the symbol and don't prefix it.
1878 AM.MCSym = ESNode->getMCSymbol();
1879 return false;
1881 break;
1883 case ISD::Constant: {
1884 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
1885 if (!foldOffsetIntoAddress(Val, AM))
1886 return false;
1887 break;
1890 case X86ISD::Wrapper:
1891 case X86ISD::WrapperRIP:
1892 if (!matchWrapper(N, AM))
1893 return false;
1894 break;
1896 case ISD::LOAD:
1897 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
1898 return false;
1899 break;
1901 case ISD::FrameIndex:
1902 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1903 AM.Base_Reg.getNode() == nullptr &&
1904 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
1905 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
1906 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
1907 return false;
1909 break;
1911 case ISD::SHL:
1912 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
1913 break;
1915 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
1916 unsigned Val = CN->getZExtValue();
1917 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
1918 // that the base operand remains free for further matching. If
1919 // the base doesn't end up getting used, a post-processing step
1920 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
1921 if (Val == 1 || Val == 2 || Val == 3) {
1922 AM.Scale = 1 << Val;
1923 SDValue ShVal = N.getOperand(0);
1925 // Okay, we know that we have a scale by now. However, if the scaled
1926 // value is an add of something and a constant, we can fold the
1927 // constant into the disp field here.
1928 if (CurDAG->isBaseWithConstantOffset(ShVal)) {
1929 AM.IndexReg = ShVal.getOperand(0);
1930 ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
1931 uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
1932 if (!foldOffsetIntoAddress(Disp, AM))
1933 return false;
1936 AM.IndexReg = ShVal;
1937 return false;
1940 break;
1942 case ISD::SRL: {
1943 // Scale must not be used already.
1944 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
1946 // We only handle up to 64-bit values here as those are what matter for
1947 // addressing mode optimizations.
1948 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
1949 "Unexpected value size!");
1951 SDValue And = N.getOperand(0);
1952 if (And.getOpcode() != ISD::AND) break;
1953 SDValue X = And.getOperand(0);
1955 // The mask used for the transform is expected to be post-shift, but we
1956 // found the shift first so just apply the shift to the mask before passing
1957 // it down.
1958 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
1959 !isa<ConstantSDNode>(And.getOperand(1)))
1960 break;
1961 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
1963 // Try to fold the mask and shift into the scale, and return false if we
1964 // succeed.
1965 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
1966 return false;
1967 break;
1970 case ISD::SMUL_LOHI:
1971 case ISD::UMUL_LOHI:
1972 // A mul_lohi where we need the low part can be folded as a plain multiply.
1973 if (N.getResNo() != 0) break;
1974 LLVM_FALLTHROUGH;
1975 case ISD::MUL:
1976 case X86ISD::MUL_IMM:
1977 // X*[3,5,9] -> X+X*[2,4,8]
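// e.g. "x*5" can become "leal (%reg,%reg,4), %dst": base and index are the
// same register and the scale encodes the remaining *4.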
1978 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1979 AM.Base_Reg.getNode() == nullptr &&
1980 AM.IndexReg.getNode() == nullptr) {
1981 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
1982 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
1983 CN->getZExtValue() == 9) {
1984 AM.Scale = unsigned(CN->getZExtValue())-1;
1986 SDValue MulVal = N.getOperand(0);
1987 SDValue Reg;
1989 // Okay, we know that we have a scale by now. However, if the scaled
1990 // value is an add of something and a constant, we can fold the
1991 // constant into the disp field here.
1992 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
1993 isa<ConstantSDNode>(MulVal.getOperand(1))) {
1994 Reg = MulVal.getOperand(0);
1995 ConstantSDNode *AddVal =
1996 cast<ConstantSDNode>(MulVal.getOperand(1));
1997 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
1998 if (foldOffsetIntoAddress(Disp, AM))
1999 Reg = N.getOperand(0);
2000 } else {
2001 Reg = N.getOperand(0);
2004 AM.IndexReg = AM.Base_Reg = Reg;
2005 return false;
2008 break;
2010 case ISD::SUB: {
2011 // Given A-B, if A can be completely folded into the address while
2012 // leaving the index field unused, use -B as the index.
2013 // This is a win if A has multiple parts that can be folded into
2014 // the address. It also saves a mov if the base register has
2015 // other uses, since it avoids a two-address sub instruction; however,
2016 // it costs an additional mov if the index register has other uses.
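// For example, for "GV - B" the global folds into the displacement, B is
// recorded as the index with AM.NegateIndex set, and the NEG of B is only
// materialized later, if this LEA form actually ends up being used.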
2018 // Add an artificial use to this node so that we can keep track of
2019 // it if it gets CSE'd with a different node.
2020 HandleSDNode Handle(N);
2022 // Test if the LHS of the sub can be folded.
2023 X86ISelAddressMode Backup = AM;
2024 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2025 N = Handle.getValue();
2026 AM = Backup;
2027 break;
2029 N = Handle.getValue();
2030 // Test if the index field is free for use.
2031 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2032 AM = Backup;
2033 break;
2036 int Cost = 0;
2037 SDValue RHS = N.getOperand(1);
2038 // If the RHS involves a register with multiple uses, this
2039 // transformation incurs an extra mov, due to the neg instruction
2040 // clobbering its operand.
2041 if (!RHS.getNode()->hasOneUse() ||
2042 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2043 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2044 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2045 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2046 RHS.getOperand(0).getValueType() == MVT::i32))
2047 ++Cost;
2048 // If the base is a register with multiple uses, this
2049 // transformation may save a mov.
2050 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2051 !AM.Base_Reg.getNode()->hasOneUse()) ||
2052 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2053 --Cost;
2054 // If the folded LHS was interesting, this transformation saves
2055 // address arithmetic.
2056 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2057 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2058 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2059 --Cost;
2060 // If it doesn't look like it may be an overall win, don't do it.
2061 if (Cost >= 0) {
2062 AM = Backup;
2063 break;
2066 // Ok, the transformation is legal and appears profitable. Go for it.
2067 // Negation will be emitted later to avoid creating dangling nodes if this
2068 // was an unprofitable LEA.
2069 AM.IndexReg = RHS;
2070 AM.NegateIndex = true;
2071 AM.Scale = 1;
2072 return false;
2075 case ISD::ADD:
2076 if (!matchAdd(N, AM, Depth))
2077 return false;
2078 break;
2080 case ISD::OR:
2081 // We want to look through a transform in InstCombine and DAGCombiner that
2082 // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
2083 // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
2084 // An 'lea' can then be used to match the shift (multiply) and add:
2085 // and $1, %esi
2086 // lea (%rsi, %rdi, 8), %rax
2087 if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
2088 !matchAdd(N, AM, Depth))
2089 return false;
2090 break;
2092 case ISD::AND: {
2093 // Perform some heroic transforms on an and of a constant-count shift
2094 // with a constant to enable use of the scaled offset field.
2096 // Scale must not be used already.
2097 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2099 // We only handle up to 64-bit values here as those are what matter for
2100 // addressing mode optimizations.
2101 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2102 "Unexpected value size!");
2104 if (!isa<ConstantSDNode>(N.getOperand(1)))
2105 break;
2107 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2108 SDValue Shift = N.getOperand(0);
2109 SDValue X = Shift.getOperand(0);
2111 uint64_t Mask = N.getConstantOperandVal(1);
2113 // Try to fold the mask and shift into an extract and scale.
2114 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2115 return false;
2117 // Try to fold the mask and shift directly into the scale.
2118 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2119 return false;
2121 // Try to fold the mask and shift into BEXTR and scale.
2122 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2123 return false;
2126 // Try to swap the mask and shift to place shifts which can be done as
2127 // a scale on the outside of the mask.
2128 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2129 return false;
2131 break;
2133 case ISD::ZERO_EXTEND: {
2134 // Try to widen a zexted shift left to the same size as its use, so we can
2135 // match the shift as a scale factor.
2136 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2137 break;
2138 if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
2139 break;
2141 // Give up if the shift is not a valid scale factor [1,2,3].
2142 SDValue Shl = N.getOperand(0);
2143 auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
2144 if (!ShAmtC || ShAmtC->getZExtValue() > 3)
2145 break;
2147 // The narrow shift must only shift out zero bits (it must be 'nuw').
2148 // That makes it safe to widen to the destination type.
2149 APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
2150 ShAmtC->getZExtValue());
2151 if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
2152 break;
2154 // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
2155 MVT VT = N.getSimpleValueType();
2156 SDLoc DL(N);
2157 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
2158 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
2160 // Convert the shift to scale factor.
2161 AM.Scale = 1 << ShAmtC->getZExtValue();
2162 AM.IndexReg = Zext;
2164 insertDAGNode(*CurDAG, N, Zext);
2165 insertDAGNode(*CurDAG, N, NewShl);
2166 CurDAG->ReplaceAllUsesWith(N, NewShl);
2167 CurDAG->RemoveDeadNode(N.getNode());
2168 return false;
2172 return matchAddressBase(N, AM);
2175 /// Helper for MatchAddress. Add the specified node to the
2176 /// specified addressing mode without any further recursion.
2177 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2178 // Is the base register already occupied?
2179 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2180 // If so, check to see if the scale index register is set.
2181 if (!AM.IndexReg.getNode()) {
2182 AM.IndexReg = N;
2183 AM.Scale = 1;
2184 return false;
2187 // Otherwise, we cannot select it.
2188 return true;
2191 // Default, generate it as a register.
2192 AM.BaseType = X86ISelAddressMode::RegBase;
2193 AM.Base_Reg = N;
2194 return false;
2197 /// Helper for selectVectorAddr. Handles things that can be folded into a
2198 /// gather scatter address. The index register and scale should have already
2199 /// been handled.
2200 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2201 // TODO: Support other operations.
2202 switch (N.getOpcode()) {
2203 case ISD::Constant: {
2204 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2205 if (!foldOffsetIntoAddress(Val, AM))
2206 return false;
2207 break;
2209 case X86ISD::Wrapper:
2210 if (!matchWrapper(N, AM))
2211 return false;
2212 break;
2215 return matchAddressBase(N, AM);
2218 bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
2219 SDValue &Scale, SDValue &Index,
2220 SDValue &Disp, SDValue &Segment) {
2221 X86ISelAddressMode AM;
2222 auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
2223 AM.IndexReg = Mgs->getIndex();
2224 AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
2226 unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2227 // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
2228 if (AddrSpace == 256)
2229 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2230 if (AddrSpace == 257)
2231 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2232 if (AddrSpace == 258)
2233 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2235 SDLoc DL(N);
2236 MVT VT = N.getSimpleValueType();
2238 // Try to match into the base and displacement fields.
2239 if (matchVectorAddress(N, AM))
2240 return false;
2242 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2243 return true;
2246 /// Returns true if it is able to pattern match an addressing mode.
2247 /// It returns, by reference, the operands which make up the maximal
2248 /// addressing mode it can match.
2250 /// Parent is the parent node of the addr operand that is being matched. It
2251 /// is always a load, store, atomic node, or null. It is only null when
2252 /// checking memory operands for inline asm nodes.
2253 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2254 SDValue &Scale, SDValue &Index,
2255 SDValue &Disp, SDValue &Segment) {
2256 X86ISelAddressMode AM;
2258 if (Parent &&
2259 // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2260 // that are not a MemSDNode, and thus don't have proper addrspace info.
2261 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2262 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2263 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2264 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2265 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2266 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2267 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2268 unsigned AddrSpace =
2269 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2270 // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
2271 if (AddrSpace == 256)
2272 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2273 if (AddrSpace == 257)
2274 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2275 if (AddrSpace == 258)
2276 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2279 // Save the DL and VT before calling matchAddress, it can invalidate N.
2280 SDLoc DL(N);
2281 MVT VT = N.getSimpleValueType();
2283 if (matchAddress(N, AM))
2284 return false;
2286 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2287 return true;
2290 // We can only fold a load if all nodes between it and the root node have a
2291 // single use. If there are additional uses, we could end up duplicating the
2292 // load.
2293 static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
2294 while (User != Root) {
2295 if (!User->hasOneUse())
2296 return false;
2297 User = *User->use_begin();
2300 return true;
2303 /// Match a scalar SSE load. In particular, we want to match a load whose top
2304 /// elements are either undef or zeros. The load flavor is derived from the
2305 /// type of N, which is either v4f32 or v2f64.
2307 /// We also return:
2308 /// PatternNodeWithChain: this is the matched node that has a chain input and
2309 /// output.
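/// A typical candidate (illustrative): (v4f32 (scalar_to_vector (load p)))
/// feeding an SSE scalar op, where the upper elements are don't-care, so the
/// load can be folded straight into the instruction's memory operand.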
2310 bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
2311 SDValue N, SDValue &Base,
2312 SDValue &Scale, SDValue &Index,
2313 SDValue &Disp, SDValue &Segment,
2314 SDValue &PatternNodeWithChain) {
2315 if (!hasSingleUsesFromRoot(Root, Parent))
2316 return false;
2318 // We can allow a full vector load here since narrowing a load is ok unless
2319 // it's volatile or atomic.
2320 if (ISD::isNON_EXTLoad(N.getNode())) {
2321 LoadSDNode *LD = cast<LoadSDNode>(N);
2322 if (LD->isSimple() &&
2323 IsProfitableToFold(N, LD, Root) &&
2324 IsLegalToFold(N, Parent, Root, OptLevel)) {
2325 PatternNodeWithChain = N;
2326 return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2327 Segment);
2331 // We can also match the special zero extended load opcode.
2332 if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
2333 PatternNodeWithChain = N;
2334 if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2335 IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
2336 auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
2337 return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
2338 Segment);
2342 // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
2343 // once. Otherwise the load might get duplicated and the chain output of the
2344 // duplicate load will not be observed by all dependencies.
2345 if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
2346 PatternNodeWithChain = N.getOperand(0);
2347 if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
2348 IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2349 IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
2350 LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
2351 return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2352 Segment);
2356 return false;
2360 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2361 if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
2362 uint64_t ImmVal = CN->getZExtValue();
2363 if (!isUInt<32>(ImmVal))
2364 return false;
2366 Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
2367 return true;
2370 // In static codegen with small code model, we can get the address of a label
2372 // into a register with 'movl'.
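// e.g. "movl $sym, %eax" zero-extends into the full 64-bit register, which
// is valid whenever the symbol's address is known to fit in 32 bits (small
// code model, or an absolute symbol with a bounded range as checked below).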
2372 if (N->getOpcode() != X86ISD::Wrapper)
2373 return false;
2375 N = N.getOperand(0);
2377 // At least GNU as does not accept 'movl' for TPOFF relocations.
2378 // FIXME: We could use 'movl' when we know we are targeting MC.
2379 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2380 return false;
2382 Imm = N;
2383 if (N->getOpcode() != ISD::TargetGlobalAddress)
2384 return TM.getCodeModel() == CodeModel::Small;
2386 Optional<ConstantRange> CR =
2387 cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
2388 if (!CR)
2389 return TM.getCodeModel() == CodeModel::Small;
2391 return CR->getUnsignedMax().ult(1ull << 32);
2394 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2395 SDValue &Scale, SDValue &Index,
2396 SDValue &Disp, SDValue &Segment) {
2397 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2398 SDLoc DL(N);
2400 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2401 return false;
2403 RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
2404 if (RN && RN->getReg() == 0)
2405 Base = CurDAG->getRegister(0, MVT::i64);
2406 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
2407 // Base could already be %rip, particularly in the x32 ABI.
2408 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2409 MVT::i64), 0);
2410 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2411 Base);
2414 RN = dyn_cast<RegisterSDNode>(Index);
2415 if (RN && RN->getReg() == 0)
2416 Index = CurDAG->getRegister(0, MVT::i64);
2417 else {
2418 assert(Index.getValueType() == MVT::i32 &&
2419 "Expect to be extending 32-bit registers for use in LEA");
2420 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2421 MVT::i64), 0);
2422 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2423 Index);
2426 return true;
2429 /// Calls SelectAddr and determines if the maximal addressing
2430 /// mode it matches can be cost effectively emitted as an LEA instruction.
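/// As a rough illustration of the scoring below: an address that is only a
/// scaled register, e.g. "leal (,%reg,2), %dst", scores 2 and is rejected in
/// favor of an add or shift, while a base + index + displacement address
/// scores 3 and is kept as an LEA.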
2431 bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
2432 SDValue &Base, SDValue &Scale,
2433 SDValue &Index, SDValue &Disp,
2434 SDValue &Segment) {
2435 X86ISelAddressMode AM;
2437 // Save the DL and VT before calling matchAddress, it can invalidate N.
2438 SDLoc DL(N);
2439 MVT VT = N.getSimpleValueType();
2441 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
2442 // segments.
2443 SDValue Copy = AM.Segment;
2444 SDValue T = CurDAG->getRegister(0, MVT::i32);
2445 AM.Segment = T;
2446 if (matchAddress(N, AM))
2447 return false;
2448 assert (T == AM.Segment);
2449 AM.Segment = Copy;
2451 unsigned Complexity = 0;
2452 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
2453 Complexity = 1;
2454 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2455 Complexity = 4;
2457 if (AM.IndexReg.getNode())
2458 Complexity++;
2460 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or use
2461 // a simple shift.
2462 if (AM.Scale > 1)
2463 Complexity++;
2465 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
2466 // to a LEA. This is determined with some experimentation but is by no means
2467 // optimal (especially for code size consideration). LEA is nice because of
2468 // its three-address nature. Tweak the cost function again when we can run
2469 // convertToThreeAddress() at register allocation time.
2470 if (AM.hasSymbolicDisplacement()) {
2471 // For X86-64, always use LEA to materialize RIP-relative addresses.
2472 if (Subtarget->is64Bit())
2473 Complexity = 4;
2474 else
2475 Complexity += 2;
2478 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
2479 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
2480 // duplicating flag-producing instructions later in the pipeline.
2481 if (N.getOpcode() == ISD::ADD) {
2482 auto isMathWithFlags = [](SDValue V) {
2483 switch (V.getOpcode()) {
2484 case X86ISD::ADD:
2485 case X86ISD::SUB:
2486 case X86ISD::ADC:
2487 case X86ISD::SBB:
2488 /* TODO: These opcodes can be added safely, but we may want to justify
2489 their inclusion for different reasons (better for reg-alloc).
2490 case X86ISD::SMUL:
2491 case X86ISD::UMUL:
2492 case X86ISD::OR:
2493 case X86ISD::XOR:
2494 case X86ISD::AND:
2496 // Value 1 is the flag output of the node - verify it's not dead.
2497 return !SDValue(V.getNode(), 1).use_empty();
2498 default:
2499 return false;
2502 // TODO: This could be an 'or' rather than 'and' to make the transform more
2503 // likely to happen. We might want to factor in whether there's a
2504 // load folding opportunity for the math op that disappears with LEA.
2505 if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
2506 Complexity++;
2509 if (AM.Disp)
2510 Complexity++;
2512 // If it isn't worth using an LEA, reject it.
2513 if (Complexity <= 2)
2514 return false;
2516 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2517 return true;
2520 /// This is only run on TargetGlobalTLSAddress nodes.
2521 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
2522 SDValue &Scale, SDValue &Index,
2523 SDValue &Disp, SDValue &Segment) {
2524 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
2525 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
2527 X86ISelAddressMode AM;
2528 AM.GV = GA->getGlobal();
2529 AM.Disp += GA->getOffset();
2530 AM.SymbolFlags = GA->getTargetFlags();
2532 MVT VT = N.getSimpleValueType();
2533 if (VT == MVT::i32) {
2534 AM.Scale = 1;
2535 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
2538 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
2539 return true;
2542 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
2543 if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
2544 Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
2545 N.getValueType());
2546 return true;
2549 // Keep track of the original value type and whether this value was
2550 // truncated. If we see a truncation from pointer type to VT that truncates
2551 // bits that are known to be zero, we can use a narrow reference.
2552 EVT VT = N.getValueType();
2553 bool WasTruncated = false;
2554 if (N.getOpcode() == ISD::TRUNCATE) {
2555 WasTruncated = true;
2556 N = N.getOperand(0);
2559 if (N.getOpcode() != X86ISD::Wrapper)
2560 return false;
2562 // We can only use non-GlobalValues as immediates if they were not truncated,
2563 // as we do not have any range information. If we have a GlobalValue and the
2564 // address was not truncated, we can select it as an operand directly.
2565 unsigned Opc = N.getOperand(0)->getOpcode();
2566 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
2567 Op = N.getOperand(0);
2568 // We can only select the operand directly if we didn't have to look past a
2569 // truncate.
2570 return !WasTruncated;
2573 // Check that the global's range fits into VT.
2574 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
2575 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2576 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
2577 return false;
2579 // Okay, we can use a narrow reference.
2580 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
2581 GA->getOffset(), GA->getTargetFlags());
2582 return true;
2585 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
2586 SDValue &Base, SDValue &Scale,
2587 SDValue &Index, SDValue &Disp,
2588 SDValue &Segment) {
2589 assert(Root && P && "Unknown root/parent nodes");
2590 if (!ISD::isNON_EXTLoad(N.getNode()) ||
2591 !IsProfitableToFold(N, P, Root) ||
2592 !IsLegalToFold(N, P, Root, OptLevel))
2593 return false;
2595 return selectAddr(N.getNode(),
2596 N.getOperand(1), Base, Scale, Index, Disp, Segment);
2599 bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
2600 SDValue &Base, SDValue &Scale,
2601 SDValue &Index, SDValue &Disp,
2602 SDValue &Segment) {
2603 assert(Root && P && "Unknown root/parent nodes");
2604 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
2605 !IsProfitableToFold(N, P, Root) ||
2606 !IsLegalToFold(N, P, Root, OptLevel))
2607 return false;
2609 return selectAddr(N.getNode(),
2610 N.getOperand(1), Base, Scale, Index, Disp, Segment);
2613 /// Return an SDNode that returns the value of the global base register.
2614 /// Output instructions required to initialize the global base register,
2615 /// if necessary.
2616 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
2617 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
2618 auto &DL = MF->getDataLayout();
2619 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
2622 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
2623 if (N->getOpcode() == ISD::TRUNCATE)
2624 N = N->getOperand(0).getNode();
2625 if (N->getOpcode() != X86ISD::Wrapper)
2626 return false;
2628 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
2629 if (!GA)
2630 return false;
2632 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2633 return CR && CR->getSignedMin().sge(-1ull << Width) &&
2634 CR->getSignedMax().slt(1ull << Width);
2637 static X86::CondCode getCondFromNode(SDNode *N) {
2638 assert(N->isMachineOpcode() && "Unexpected node");
2639 X86::CondCode CC = X86::COND_INVALID;
2640 unsigned Opc = N->getMachineOpcode();
2641 if (Opc == X86::JCC_1)
2642 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
2643 else if (Opc == X86::SETCCr)
2644 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
2645 else if (Opc == X86::SETCCm)
2646 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
2647 else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
2648 Opc == X86::CMOV64rr)
2649 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
2650 else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
2651 Opc == X86::CMOV64rm)
2652 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
2654 return CC;
2657 /// Test whether the given X86ISD::CMP node has any users that use a flag
2658 /// other than ZF; returns true if there are none.
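/// For example, a compare whose flags feed only JE/JNE (or SETE/SETNE) reads
/// just ZF and passes this check, while one feeding JL also depends on SF/OF
/// and fails it.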
2659 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
2660 // Examine each user of the node.
2661 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2662 UI != UE; ++UI) {
2663 // Only check things that use the flags.
2664 if (UI.getUse().getResNo() != Flags.getResNo())
2665 continue;
2666 // Only examine CopyToReg uses that copy to EFLAGS.
2667 if (UI->getOpcode() != ISD::CopyToReg ||
2668 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2669 return false;
2670 // Examine each user of the CopyToReg use.
2671 for (SDNode::use_iterator FlagUI = UI->use_begin(),
2672 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2673 // Only examine the Flag result.
2674 if (FlagUI.getUse().getResNo() != 1) continue;
2675 // Anything unusual: assume conservatively.
2676 if (!FlagUI->isMachineOpcode()) return false;
2677 // Examine the condition code of the user.
2678 X86::CondCode CC = getCondFromNode(*FlagUI);
2680 switch (CC) {
2681 // Comparisons which only use the zero flag.
2682 case X86::COND_E: case X86::COND_NE:
2683 continue;
2684 // Anything else: assume conservatively.
2685 default:
2686 return false;
2690 return true;
2693 /// Test whether the given X86ISD::CMP node has any uses which require the SF
2694 /// flag to be accurate; returns true if there are none.
2695 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
2696 // Examine each user of the node.
2697 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2698 UI != UE; ++UI) {
2699 // Only check things that use the flags.
2700 if (UI.getUse().getResNo() != Flags.getResNo())
2701 continue;
2702 // Only examine CopyToReg uses that copy to EFLAGS.
2703 if (UI->getOpcode() != ISD::CopyToReg ||
2704 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2705 return false;
2706 // Examine each user of the CopyToReg use.
2707 for (SDNode::use_iterator FlagUI = UI->use_begin(),
2708 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2709 // Only examine the Flag result.
2710 if (FlagUI.getUse().getResNo() != 1) continue;
2711 // Anything unusual: assume conservatively.
2712 if (!FlagUI->isMachineOpcode()) return false;
2713 // Examine the condition code of the user.
2714 X86::CondCode CC = getCondFromNode(*FlagUI);
2716 switch (CC) {
2717 // Comparisons which don't examine the SF flag.
2718 case X86::COND_A: case X86::COND_AE:
2719 case X86::COND_B: case X86::COND_BE:
2720 case X86::COND_E: case X86::COND_NE:
2721 case X86::COND_O: case X86::COND_NO:
2722 case X86::COND_P: case X86::COND_NP:
2723 continue;
2724 // Anything else: assume conservatively.
2725 default:
2726 return false;
2730 return true;
2733 static bool mayUseCarryFlag(X86::CondCode CC) {
2734 switch (CC) {
2735 // Comparisons which don't examine the CF flag.
2736 case X86::COND_O: case X86::COND_NO:
2737 case X86::COND_E: case X86::COND_NE:
2738 case X86::COND_S: case X86::COND_NS:
2739 case X86::COND_P: case X86::COND_NP:
2740 case X86::COND_L: case X86::COND_GE:
2741 case X86::COND_G: case X86::COND_LE:
2742 return false;
2743 // Anything else: assume conservatively.
2744 default:
2745 return true;
2749 /// Test whether the given node which sets flags has any uses which require the
2750 /// CF flag to be accurate; returns true if there are none.
2751 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
2752 // Examine each user of the node.
2753 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2754 UI != UE; ++UI) {
2755 // Only check things that use the flags.
2756 if (UI.getUse().getResNo() != Flags.getResNo())
2757 continue;
2759 unsigned UIOpc = UI->getOpcode();
2761 if (UIOpc == ISD::CopyToReg) {
2762 // Only examine CopyToReg uses that copy to EFLAGS.
2763 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2764 return false;
2765 // Examine each user of the CopyToReg use.
2766 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
2767 FlagUI != FlagUE; ++FlagUI) {
2768 // Only examine the Flag result.
2769 if (FlagUI.getUse().getResNo() != 1)
2770 continue;
2771 // Anything unusual: assume conservatively.
2772 if (!FlagUI->isMachineOpcode())
2773 return false;
2774 // Examine the condition code of the user.
2775 X86::CondCode CC = getCondFromNode(*FlagUI);
2777 if (mayUseCarryFlag(CC))
2778 return false;
2781 // This CopyToReg is ok. Move on to the next user.
2782 continue;
2785 // This might be an unselected node. So look for the pre-isel opcodes that
2786 // use flags.
2787 unsigned CCOpNo;
2788 switch (UIOpc) {
2789 default:
2790 // Something unusual. Be conservative.
2791 return false;
2792 case X86ISD::SETCC: CCOpNo = 0; break;
2793 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
2794 case X86ISD::CMOV: CCOpNo = 2; break;
2795 case X86ISD::BRCOND: CCOpNo = 2; break;
2798 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
2799 if (mayUseCarryFlag(CC))
2800 return false;
2802 return true;
2805 /// Check whether or not the chain ending in StoreNode is suitable for doing
2806 /// the {load; op; store} read-modify-write transformation.
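/// As a rough illustration of the rewrite this enables:
///   movl (%rdi), %eax ; addl %esi, %eax ; movl %eax, (%rdi)
/// can become the single read-modify-write instruction
///   addl %esi, (%rdi)
/// provided the loaded value and the address have no other conflicting uses.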
2807 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
2808 SDValue StoredVal, SelectionDAG *CurDAG,
2809 unsigned LoadOpNo,
2810 LoadSDNode *&LoadNode,
2811 SDValue &InputChain) {
2812 // Is the stored value result 0 of the operation?
2813 if (StoredVal.getResNo() != 0) return false;
2815 // Are there other uses of the operation other than the store?
2816 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
2818 // Is the store non-extending and non-indexed?
2819 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
2820 return false;
2822 SDValue Load = StoredVal->getOperand(LoadOpNo);
2823 // Is the stored value a non-extending and non-indexed load?
2824 if (!ISD::isNormalLoad(Load.getNode())) return false;
2826 // Return LoadNode by reference.
2827 LoadNode = cast<LoadSDNode>(Load);
2829 // Is store the only read of the loaded value?
2830 if (!Load.hasOneUse())
2831 return false;
2833 // Is the address of the store the same as the load?
2834 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
2835 LoadNode->getOffset() != StoreNode->getOffset())
2836 return false;
2838 bool FoundLoad = false;
2839 SmallVector<SDValue, 4> ChainOps;
2840 SmallVector<const SDNode *, 4> LoopWorklist;
2841 SmallPtrSet<const SDNode *, 16> Visited;
2842 const unsigned int Max = 1024;
2844 // Visualization of Load-Op-Store fusion:
2845 // -------------------------
2846 // Legend:
2847 // *-lines = Chain operand dependencies.
2848 // |-lines = Normal operand dependencies.
2849 // Dependencies flow down and right. n-suffix references multiple nodes.
2851 // C Xn C
2852 // * * *
2853 // * * *
2854 // Xn A-LD Yn TF Yn
2855 // * * \ | * |
2856 // * * \ | * |
2857 // * * \ | => A--LD_OP_ST
2858 // * * \| \
2859 // TF OP \
2860 // * | \ Zn
2861 // * | \
2862 // A-ST Zn
2865 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
2866 // #2: Yn -> LD
2867 // #3: ST -> Zn
2869 // Ensure the transform is safe by checking for the dual
2870 // dependencies to make sure we do not induce a loop.
2872 // As LD is a predecessor to both OP and ST we can do this by checking:
2873 // a). if LD is a predecessor to a member of Xn or Yn.
2874 // b). if a Zn is a predecessor to ST.
2876 // However, (b) can only occur through being a chain predecessor to
2877 // ST, which is the same as Zn being a member or predecessor of Xn,
2878 // which is a subset of LD being a predecessor of Xn. So it's
2879 // subsumed by check (a).
2881 SDValue Chain = StoreNode->getChain();
2883 // Gather X elements in ChainOps.
2884 if (Chain == Load.getValue(1)) {
2885 FoundLoad = true;
2886 ChainOps.push_back(Load.getOperand(0));
2887 } else if (Chain.getOpcode() == ISD::TokenFactor) {
2888 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
2889 SDValue Op = Chain.getOperand(i);
2890 if (Op == Load.getValue(1)) {
2891 FoundLoad = true;
2892 // Drop Load, but keep its chain. No cycle check necessary.
2893 ChainOps.push_back(Load.getOperand(0));
2894 continue;
2896 LoopWorklist.push_back(Op.getNode());
2897 ChainOps.push_back(Op);
2901 if (!FoundLoad)
2902 return false;
2904 // Worklist is currently Xn. Add Yn to worklist.
2905 for (SDValue Op : StoredVal->ops())
2906 if (Op.getNode() != LoadNode)
2907 LoopWorklist.push_back(Op.getNode());
2909 // Check (a) if Load is a predecessor to Xn + Yn
2910 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
2911 true))
2912 return false;
2914 InputChain =
2915 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
2916 return true;
2919 // Change a chain of {load; op; store} of the same value into a simple op
2920 // through memory of that value, if the uses of the modified value and its
2921 // address are suitable.
2923 // The tablegen memory operand pattern is currently not able to match the
2924 // case where the EFLAGS result of the original operation is used.
2926 // To move this to tablegen, we'll need to improve tablegen to allow flags to
2927 // be transferred from a node in the pattern to the result node, probably with
2928 // a new keyword. For example, we have this
2929 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2930 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2931 // (implicit EFLAGS)]>;
2932 // but maybe need something like this
2933 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2934 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2935 // (transferrable EFLAGS)]>;
2937 // Until then, we manually fold these and instruction select the operation
2938 // here.
2939 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
2940 StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
2941 SDValue StoredVal = StoreNode->getOperand(1);
2942 unsigned Opc = StoredVal->getOpcode();
2944 // Before we try to select anything, make sure this is a memory operand size
2945 // and opcode we can handle. Note that this must match the code below that
2946 // actually lowers the opcodes.
2947 EVT MemVT = StoreNode->getMemoryVT();
2948 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
2949 MemVT != MVT::i8)
2950 return false;
2952 bool IsCommutable = false;
2953 bool IsNegate = false;
2954 switch (Opc) {
2955 default:
2956 return false;
2957 case X86ISD::SUB:
2958 IsNegate = isNullConstant(StoredVal.getOperand(0));
2959 break;
2960 case X86ISD::SBB:
2961 break;
2962 case X86ISD::ADD:
2963 case X86ISD::ADC:
2964 case X86ISD::AND:
2965 case X86ISD::OR:
2966 case X86ISD::XOR:
2967 IsCommutable = true;
2968 break;
2971 unsigned LoadOpNo = IsNegate ? 1 : 0;
2972 LoadSDNode *LoadNode = nullptr;
2973 SDValue InputChain;
2974 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2975 LoadNode, InputChain)) {
2976 if (!IsCommutable)
2977 return false;
2979 // This operation is commutable, try the other operand.
2980 LoadOpNo = 1;
2981 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2982 LoadNode, InputChain))
2983 return false;
2986 SDValue Base, Scale, Index, Disp, Segment;
2987 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
2988 Segment))
2989 return false;
2991 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
2992 unsigned Opc8) {
2993 switch (MemVT.getSimpleVT().SimpleTy) {
2994 case MVT::i64:
2995 return Opc64;
2996 case MVT::i32:
2997 return Opc32;
2998 case MVT::i16:
2999 return Opc16;
3000 case MVT::i8:
3001 return Opc8;
3002 default:
3003 llvm_unreachable("Invalid size!");
3007 MachineSDNode *Result;
3008 switch (Opc) {
3009 case X86ISD::SUB:
3010 // Handle negate.
3011 if (IsNegate) {
3012 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3013 X86::NEG8m);
3014 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3015 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3016 MVT::Other, Ops);
3017 break;
3019 LLVM_FALLTHROUGH;
3020 case X86ISD::ADD:
3021 // Try to match inc/dec.
3022 if (!Subtarget->slowIncDec() || OptForSize) {
3023 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3024 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3025 // ADD/SUB with 1/-1 can use inc/dec when the carry flag isn't used.
3026 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3027 unsigned NewOpc =
3028 ((Opc == X86ISD::ADD) == IsOne)
3029 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3030 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3031 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3032 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3033 MVT::Other, Ops);
3034 break;
3037 LLVM_FALLTHROUGH;
3038 case X86ISD::ADC:
3039 case X86ISD::SBB:
3040 case X86ISD::AND:
3041 case X86ISD::OR:
3042 case X86ISD::XOR: {
3043 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3044 switch (Opc) {
3045 case X86ISD::ADD:
3046 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3047 X86::ADD8mr);
3048 case X86ISD::ADC:
3049 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3050 X86::ADC8mr);
3051 case X86ISD::SUB:
3052 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3053 X86::SUB8mr);
3054 case X86ISD::SBB:
3055 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3056 X86::SBB8mr);
3057 case X86ISD::AND:
3058 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3059 X86::AND8mr);
3060 case X86ISD::OR:
3061 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3062 case X86ISD::XOR:
3063 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3064 X86::XOR8mr);
3065 default:
3066 llvm_unreachable("Invalid opcode!");
3069 auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
3070 switch (Opc) {
3071 case X86ISD::ADD:
3072 return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
3073 case X86ISD::ADC:
3074 return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
3075 case X86ISD::SUB:
3076 return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
3077 case X86ISD::SBB:
3078 return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
3079 case X86ISD::AND:
3080 return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
3081 case X86ISD::OR:
3082 return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
3083 case X86ISD::XOR:
3084 return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
3085 default:
3086 llvm_unreachable("Invalid opcode!");
3089 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3090 switch (Opc) {
3091 case X86ISD::ADD:
3092 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3093 X86::ADD8mi);
3094 case X86ISD::ADC:
3095 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3096 X86::ADC8mi);
3097 case X86ISD::SUB:
3098 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3099 X86::SUB8mi);
3100 case X86ISD::SBB:
3101 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3102 X86::SBB8mi);
3103 case X86ISD::AND:
3104 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3105 X86::AND8mi);
3106 case X86ISD::OR:
3107 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3108 X86::OR8mi);
3109 case X86ISD::XOR:
3110 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3111 X86::XOR8mi);
3112 default:
3113 llvm_unreachable("Invalid opcode!");
3117 unsigned NewOpc = SelectRegOpcode(Opc);
3118 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3120 // See if the operand is a constant that we can fold into an immediate
3121 // operand.
3122 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3123 int64_t OperandV = OperandC->getSExtValue();
3125 // Check if we can shrink the operand enough to fit in an immediate (or
3126 // fit into a smaller immediate) by negating it and switching the
3127 // operation.
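// For example, an i32 "add $128" does not fit in an imm8, but the negated
// form "sub $-128" does, so when the carry flag is unused we flip ADD<->SUB
// to get the smaller encoding.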
3128 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3129 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3130 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3131 isInt<32>(-OperandV))) &&
3132 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3133 OperandV = -OperandV;
3134 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3137 // First try to fit this into an Imm8 operand. If it doesn't fit, then try
3138 // the larger immediate operand.
3139 if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
3140 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3141 NewOpc = SelectImm8Opcode(Opc);
3142 } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3143 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3144 NewOpc = SelectImmOpcode(Opc);
3148 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3149 SDValue CopyTo =
3150 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3151 StoredVal.getOperand(2), SDValue());
3153 const SDValue Ops[] = {Base, Scale, Index, Disp,
3154 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3155 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3156 Ops);
3157 } else {
3158 const SDValue Ops[] = {Base, Scale, Index, Disp,
3159 Segment, Operand, InputChain};
3160 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3161 Ops);
3163 break;
3165 default:
3166 llvm_unreachable("Invalid opcode!");
3169 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3170 LoadNode->getMemOperand()};
3171 CurDAG->setNodeMemRefs(Result, MemOps);
3173 // Update Load Chain uses as well.
3174 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3175 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3176 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3177 CurDAG->RemoveDeadNode(Node);
3178 return true;
3181 // See if this is an X & Mask that we can match to BEXTR/BZHI.
3182 // Where Mask is one of the following patterns:
3183 // a) x & (1 << nbits) - 1
3184 // b) x & ~(-1 << nbits)
3185 // c) x & (-1 >> (32 - y))
3186 // d) x << (32 - y) >> (32 - y)
3187 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3188 assert(
3189 (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
3190 "Should be either an and-mask, or right-shift after clearing high bits.");
3192 // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least one.
3193 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3194 return false;
3196 MVT NVT = Node->getSimpleValueType(0);
3198 // Only supported for 32 and 64 bits.
3199 if (NVT != MVT::i32 && NVT != MVT::i64)
3200 return false;
3202 SDValue NBits;
3204 // If we have BMI2's BZHI, we are OK with multi-use patterns.
3205 // Else, if we only have BMI1's BEXTR, we require one-use.
3206 const bool CanHaveExtraUses = Subtarget->hasBMI2();
3207 auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
3208 return CanHaveExtraUses ||
3209 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3211 auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
3212 auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
3214 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3215 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3216 assert(V.getSimpleValueType() == MVT::i32 &&
3217 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3218 "Expected i64 -> i32 truncation");
3219 V = V.getOperand(0);
3221 return V;
3224 // a) x & ((1 << nbits) + (-1))
3225 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
3226 &NBits](SDValue Mask) -> bool {
3227 // Match `add`. Must only have one use!
3228 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3229 return false;
3230 // We should be adding an all-ones constant (i.e. subtracting one).
3231 if (!isAllOnesConstant(Mask->getOperand(1)))
3232 return false;
3233 // Match `1 << nbits`. Might be truncated. Must only have one use!
3234 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3235 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3236 return false;
3237 if (!isOneConstant(M0->getOperand(0)))
3238 return false;
3239 NBits = M0->getOperand(1);
3240 return true;
3243 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3244 V = peekThroughOneUseTruncation(V);
3245 return CurDAG->MaskedValueIsAllOnes(
3246 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3247 NVT.getSizeInBits()));
3250 // b) x & ~(-1 << nbits)
3251 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3252 &NBits](SDValue Mask) -> bool {
3253 // Match `~()`. Must only have one use!
3254 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3255 return false;
3256 // The -1 only has to be all-ones for the final Node's NVT.
3257 if (!isAllOnes(Mask->getOperand(1)))
3258 return false;
3259 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3260 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3261 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3262 return false;
3263 // The -1 only has to be all-ones for the final Node's NVT.
3264 if (!isAllOnes(M0->getOperand(0)))
3265 return false;
3266 NBits = M0->getOperand(1);
3267 return true;
3270 // Match potentially-truncated (bitwidth - y)
3271 auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
3272 unsigned Bitwidth) {
3273 // Skip over a truncate of the shift amount.
3274 if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
3275 ShiftAmt = ShiftAmt.getOperand(0);
3276 // The trunc should have been the only user of the real shift amount.
3277 if (!checkOneUse(ShiftAmt))
3278 return false;
3280 // Match the shift amount as: (bitwidth - y). It should go away, too.
3281 if (ShiftAmt.getOpcode() != ISD::SUB)
3282 return false;
3283 auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
3284 if (!V0 || V0->getZExtValue() != Bitwidth)
3285 return false;
3286 NBits = ShiftAmt.getOperand(1);
3287 return true;
3290 // c) x & (-1 >> (32 - y))
3291 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
3292 matchShiftAmt](SDValue Mask) -> bool {
3293 // The mask itself may be truncated.
3294 Mask = peekThroughOneUseTruncation(Mask);
3295 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3296 // Match `l>>`. Must only have one use!
3297 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3298 return false;
3299 // We should be shifting truly all-ones constant.
3300 if (!isAllOnesConstant(Mask.getOperand(0)))
3301 return false;
3302 SDValue M1 = Mask.getOperand(1);
3303 // The shift amount should not be used externally.
3304 if (!checkOneUse(M1))
3305 return false;
3306 return matchShiftAmt(M1, Bitwidth);
3309 SDValue X;
3311 // d) x << (32 - y) >> (32 - y)
3312 auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
3313 &X](SDNode *Node) -> bool {
3314 if (Node->getOpcode() != ISD::SRL)
3315 return false;
3316 SDValue N0 = Node->getOperand(0);
3317 if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
3318 return false;
3319 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3320 SDValue N1 = Node->getOperand(1);
3321 SDValue N01 = N0->getOperand(1);
3322 // Both of the shifts must be by the exact same value.
3323 // There should not be any uses of the shift amount outside of the pattern.
3324 if (N1 != N01 || !checkTwoUse(N1))
3325 return false;
3326 if (!matchShiftAmt(N1, Bitwidth))
3327 return false;
3328 X = N0->getOperand(0);
3329 return true;
3332 auto matchLowBitMask = [matchPatternA, matchPatternB,
3333 matchPatternC](SDValue Mask) -> bool {
3334 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3337 if (Node->getOpcode() == ISD::AND) {
3338 X = Node->getOperand(0);
3339 SDValue Mask = Node->getOperand(1);
3341 if (matchLowBitMask(Mask)) {
3342 // Great.
3343 } else {
3344 std::swap(X, Mask);
3345 if (!matchLowBitMask(Mask))
3346 return false;
3348 } else if (!matchPatternD(Node))
3349 return false;
3351 SDLoc DL(Node);
3353 // Truncate the shift amount.
3354 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3355 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3357 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3358 // All the other bits are undefined; we do not care about them.
3359 SDValue ImplDef = SDValue(
3360 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3361 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3363 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3364 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3365 NBits = SDValue(
3366 CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
3367 NBits, SRIdxVal), 0);
3368 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3370 if (Subtarget->hasBMI2()) {
3371 // Great, just emit the BZHI.
3372 if (NVT != MVT::i32) {
3373 // But have to place the bit count into the wide-enough register first.
3374 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
3375 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3378 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
3379 ReplaceNode(Node, Extract.getNode());
3380 SelectCode(Extract.getNode());
3381 return true;
3384 // Else, if we do *NOT* have BMI2, let's find out whether 'X' is
3385 // *logically* shifted (potentially with a one-use trunc in between),
3386 // and whether the truncation was the only use of the shift;
3387 // if so, look past the one-use truncation.
3389 SDValue RealX = peekThroughOneUseTruncation(X);
3390 // FIXME: only if the shift is one-use?
3391 if (RealX != X && RealX.getOpcode() == ISD::SRL)
3392 X = RealX;
3395 MVT XVT = X.getSimpleValueType();
3397 // Else, emitting BEXTR requires one more step.
3398 // The 'control' of BEXTR has the pattern of:
3399 // [15...8 bit][ 7...0 bit] location
3400 // [ bit count][ shift] name
3401 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
3403 // Shift NBits left by 8 bits, thus producing 'control'.
3404 // This makes the low 8 bits zero.
3405 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3406 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3407 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3409 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3410 // FIXME: only if the shift is one-use?
3411 if (X.getOpcode() == ISD::SRL) {
3412 SDValue ShiftAmt = X.getOperand(1);
3413 X = X.getOperand(0);
3415 assert(ShiftAmt.getValueType() == MVT::i8 &&
3416 "Expected shift amount to be i8");
3418 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
3419 // We could zext to i16 in some form, but we intentionally don't do that.
3420 SDValue OrigShiftAmt = ShiftAmt;
3421 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
3422 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
3424 // And now 'or' these low 8 bits of shift amount into the 'control'.
3425 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
3426 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3429 // But have to place the 'control' into the wide-enough register first.
3430 if (XVT != MVT::i32) {
3431 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
3432 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3435 // And finally, form the BEXTR itself.
3436 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
3438 // The 'X' was originally truncated. Do that now.
3439 if (XVT != NVT) {
3440 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
3441 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
3444 ReplaceNode(Node, Extract.getNode());
3445 SelectCode(Extract.getNode());
3447 return true;
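// Illustration only: all four mask shapes a)-d) matched by matchBitExtract()
// above compute "the low nbits of x", which is exactly what BZHI (and BEXTR
// with a start position of 0) produces. A minimal scalar reference model of
// the 32-bit case; the helper is hypothetical and assumes 0 <= NBits <= 31.
static inline uint32_t sketchLowBits32(uint32_t X, unsigned NBits) {
  uint32_t MaskA = (1u << NBits) - 1u;                          // a) (1 << n) - 1
  uint32_t MaskB = ~(~0u << NBits);                             // b) ~(-1 << n)
  uint32_t MaskC = NBits ? ~0u >> (32 - NBits) : 0;             // c) -1 >> (32 - n)
  uint32_t ValD  = NBits ? (X << (32 - NBits)) >> (32 - NBits) : 0; // d)
  (void)MaskB; (void)MaskC; (void)ValD; // all describe the same low-bit extract
  return X & MaskA;                     // == BZHI(X, NBits)
}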
3450 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
3451 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
3452 MVT NVT = Node->getSimpleValueType(0);
3453 SDLoc dl(Node);
3455 SDValue N0 = Node->getOperand(0);
3456 SDValue N1 = Node->getOperand(1);
3458 // If we have TBM we can use an immediate for the control. If we have BMI
3459 // we should only do this if the BEXTR instruction is implemented well.
3460 // Otherwise moving the control into a register makes this more costly.
3461 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
3462 // hoisting the move immediate would make it worthwhile with a less optimal
3463 // BEXTR?
3464 bool PreferBEXTR =
3465 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
3466 if (!PreferBEXTR && !Subtarget->hasBMI2())
3467 return nullptr;
3469 // Must have a shift right.
3470 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
3471 return nullptr;
3473 // Shift can't have additional users.
3474 if (!N0->hasOneUse())
3475 return nullptr;
3477 // Only supported for 32 and 64 bits.
3478 if (NVT != MVT::i32 && NVT != MVT::i64)
3479 return nullptr;
3481 // Shift amount and RHS of and must be constant.
3482 ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
3483 ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
3484 if (!MaskCst || !ShiftCst)
3485 return nullptr;
3487 // And RHS must be a mask.
3488 uint64_t Mask = MaskCst->getZExtValue();
3489 if (!isMask_64(Mask))
3490 return nullptr;
3492 uint64_t Shift = ShiftCst->getZExtValue();
3493 uint64_t MaskSize = countPopulation(Mask);
3495 // Don't interfere with something that can be handled by extracting AH.
3496 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
3497 if (Shift == 8 && MaskSize == 8)
3498 return nullptr;
3500 // Make sure we are only using bits that were in the original value, not
3501 // shifted in.
3502 if (Shift + MaskSize > NVT.getSizeInBits())
3503 return nullptr;
3505 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
3506 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
3507 // does not fit into 32 bits. Load folding is not a sufficient reason.
3508 if (!PreferBEXTR && MaskSize <= 32)
3509 return nullptr;
3511 SDValue Control;
3512 unsigned ROpc, MOpc;
3514 if (!PreferBEXTR) {
3515 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
3516 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
3517 // Let's perform the mask first and apply the shift later. Note that we need to
3518 // widen the mask to account for the fact that we'll apply the shift afterwards!
3519 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
3520 ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
3521 MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
3522 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3523 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
3524 } else {
3525 // The 'control' of BEXTR has the pattern of:
3526 // [15...8 bit][ 7...0 bit] location
3527 // [ bit count][ shift] name
3528 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
3529 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
3530 if (Subtarget->hasTBM()) {
3531 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
3532 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
3533 } else {
3534 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
3535 // BMI requires the immediate to be placed in a register.
3536 ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
3537 MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
3538 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3539 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
3543 MachineSDNode *NewNode;
3544 SDValue Input = N0->getOperand(0);
3545 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3546 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3547 SDValue Ops[] = {
3548 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
3549 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
3550 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3551 // Update the chain.
3552 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
3553 // Record the mem-refs
3554 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
3555 } else {
3556 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
3559 if (!PreferBEXTR) {
3560 // We still need to apply the shift.
3561 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
3562 unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
3563 NewNode =
3564 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
3567 return NewNode;
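// Illustration only: the control arithmetic used by matchBEXTRFromAndImm()
// above, for (x >> Shift) & ((1 << MaskSize) - 1):
//  * The BEXTR control packs the start bit in bits 7:0 and the length in bits
//    15:8, e.g. Shift = 4, MaskSize = 8 gives 0x0804.
//  * Without a usable BEXTR, BZHI keeps the low (Shift + MaskSize) bits and a
//    SHR by Shift is emitted afterwards.
// Minimal scalar sketch; the helpers are hypothetical and not used here.
static inline uint64_t sketchBextrControl(uint64_t Shift, uint64_t MaskSize) {
  return Shift | (MaskSize << 8);
}
static inline uint64_t sketchExtractViaBzhi(uint64_t X, unsigned Shift,
                                            unsigned MaskSize) {
  unsigned Width = Shift + MaskSize;          // bits kept by BZHI (<= 64 here)
  uint64_t Masked = Width >= 64 ? X : (X & ((1ull << Width) - 1));
  return Masked >> Shift;                     // the deferred SHR
}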
3570 // Emit a PCMPISTR(I/M) instruction.
3571 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
3572 bool MayFoldLoad, const SDLoc &dl,
3573 MVT VT, SDNode *Node) {
3574 SDValue N0 = Node->getOperand(0);
3575 SDValue N1 = Node->getOperand(1);
3576 SDValue Imm = Node->getOperand(2);
3577 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3578 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3580 // Try to fold a load. No need to check alignment.
3581 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3582 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3583 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3584 N1.getOperand(0) };
3585 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
3586 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3587 // Update the chain.
3588 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
3589 // Record the mem-refs
3590 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
3591 return CNode;
3594 SDValue Ops[] = { N0, N1, Imm };
3595 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
3596 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3597 return CNode;
3600 // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
3601 // to emit a second instruction after this one. This is needed since we have two
3602 // copyToReg nodes glued before this and we need to continue that glue through.
3603 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
3604 bool MayFoldLoad, const SDLoc &dl,
3605 MVT VT, SDNode *Node,
3606 SDValue &InFlag) {
3607 SDValue N0 = Node->getOperand(0);
3608 SDValue N2 = Node->getOperand(2);
3609 SDValue Imm = Node->getOperand(4);
3610 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3611 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3613 // Try to fold a load. No need to check alignment.
3614 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3615 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3616 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3617 N2.getOperand(0), InFlag };
3618 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
3619 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3620 InFlag = SDValue(CNode, 3);
3621 // Update the chain.
3622 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
3623 // Record the mem-refs
3624 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
3625 return CNode;
3628 SDValue Ops[] = { N0, N2, Imm, InFlag };
3629 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
3630 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3631 InFlag = SDValue(CNode, 2);
3632 return CNode;
3635 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
3636 EVT VT = N->getValueType(0);
3638 // Only handle scalar shifts.
3639 if (VT.isVector())
3640 return false;
3642 // Narrower shifts only mask to 5 bits in hardware.
3643 unsigned Size = VT == MVT::i64 ? 64 : 32;
3645 SDValue OrigShiftAmt = N->getOperand(1);
3646 SDValue ShiftAmt = OrigShiftAmt;
3647 SDLoc DL(N);
3649 // Skip over a truncate of the shift amount.
3650 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
3651 ShiftAmt = ShiftAmt->getOperand(0);
3653 // This function is called after X86DAGToDAGISel::matchBitExtract(),
3654 // so we do not have to worry about breaking a BZHI/BEXTR pattern.
3656 SDValue NewShiftAmt;
3657 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
3658 SDValue Add0 = ShiftAmt->getOperand(0);
3659 SDValue Add1 = ShiftAmt->getOperand(1);
3660 // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
3661 // to avoid the ADD/SUB.
3662 if (isa<ConstantSDNode>(Add1) &&
3663 cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
3664 NewShiftAmt = Add0;
3665 // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
3666 // generate a NEG instead of a SUB of a constant.
3667 } else if (ShiftAmt->getOpcode() == ISD::SUB &&
3668 isa<ConstantSDNode>(Add0) &&
3669 cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
3670 cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
3671 // Insert a negate op.
3672 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
3673 // that uses it that's not a shift.
3674 EVT SubVT = ShiftAmt.getValueType();
3675 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
3676 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
3677 NewShiftAmt = Neg;
3679 // Insert these operands into a valid topological order so they can
3680 // get selected independently.
3681 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
3682 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
3683 } else
3684 return false;
3685 } else
3686 return false;
3688 if (NewShiftAmt.getValueType() != MVT::i8) {
3689 // Need to truncate the shift amount.
3690 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
3691 // Add to a correct topological ordering.
3692 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3695 // Insert a new mask to keep the shift amount legal. This should be removed
3696 // by isel patterns.
3697 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
3698 CurDAG->getConstant(Size - 1, DL, MVT::i8));
3699 // Place in a correct topological ordering.
3700 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3702 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
3703 NewShiftAmt);
3704 if (UpdatedNode != N) {
3705 // If we found an existing node, we should replace ourselves with that node
3706 // and wait for it to be selected after its other users.
3707 ReplaceNode(N, UpdatedNode);
3708 return true;
3711 // If the original shift amount is now dead, delete it so that we don't run
3712 // it through isel.
3713 if (OrigShiftAmt.getNode()->use_empty())
3714 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
3716 // Now that we've optimized the shift amount, defer to normal isel to get
3717 // load folding and legacy vs BMI2 selection without repeating it here.
3718 SelectCode(N);
3719 return true;
3722 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
3723 MVT NVT = N->getSimpleValueType(0);
3724 unsigned Opcode = N->getOpcode();
3725 SDLoc dl(N);
3727 // For operations of the form (x << C1) op C2, check if we can use a smaller
3728 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
3729 SDValue Shift = N->getOperand(0);
3730 SDValue N1 = N->getOperand(1);
3732 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
3733 if (!Cst)
3734 return false;
3736 int64_t Val = Cst->getSExtValue();
3738 // If we have an any_extend feeding the AND, look through it to see if there
3739 // is a shift behind it. But only if the AND doesn't use the extended bits.
3740 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
3741 bool FoundAnyExtend = false;
3742 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
3743 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
3744 isUInt<32>(Val)) {
3745 FoundAnyExtend = true;
3746 Shift = Shift.getOperand(0);
3749 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
3750 return false;
3752 // i8 is unshrinkable, i16 should be promoted to i32.
3753 if (NVT != MVT::i32 && NVT != MVT::i64)
3754 return false;
3756 ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
3757 if (!ShlCst)
3758 return false;
3760 uint64_t ShAmt = ShlCst->getZExtValue();
3762 // Make sure that we don't change the operation by removing bits.
3763 // This only matters for OR and XOR, AND is unaffected.
3764 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
3765 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
3766 return false;
3768 // Check the minimum bitwidth for the new constant.
3769 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
3770 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
3771 if (Opcode == ISD::AND) {
3772 // AND32ri is the same as AND64ri32 with zext imm.
3773 // Try this before sign extended immediates below.
3774 ShiftedVal = (uint64_t)Val >> ShAmt;
3775 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3776 return true;
3777 // Also swap order when the AND can become MOVZX.
3778 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
3779 return true;
3781 ShiftedVal = Val >> ShAmt;
3782 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
3783 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
3784 return true;
3785 if (Opcode != ISD::AND) {
3786 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
3787 ShiftedVal = (uint64_t)Val >> ShAmt;
3788 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3789 return true;
3791 return false;
3794 int64_t ShiftedVal;
3795 if (!CanShrinkImmediate(ShiftedVal))
3796 return false;
3798 // Ok, we can reorder to get a smaller immediate.
3800 // But, it's possible the original immediate allowed an AND to become MOVZX.
3801 // We do this check late to defer the MaskedValueIsZero call as long as
3802 // possible.
3803 if (Opcode == ISD::AND) {
3804 // Find the smallest zext this could possibly be.
3805 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
3806 ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
3808 // Figure out which bits need to be zero to achieve that mask.
3809 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
3810 ZExtWidth);
3811 NeededMask &= ~Cst->getAPIntValue();
3813 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
3814 return false;
3817 SDValue X = Shift.getOperand(0);
3818 if (FoundAnyExtend) {
3819 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
3820 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
3821 X = NewX;
3824 SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
3825 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
3826 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
3827 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
3828 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
3829 Shift.getOperand(1));
3830 ReplaceNode(N, NewSHL.getNode());
3831 SelectCode(NewSHL.getNode());
3832 return true;
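// Illustration only: the rewrite performed by tryShrinkShlLogicImm() above,
//   (x << C1) op C2  -->  (x op (C2 >> C1)) << C1,
// shown for OR. It holds whenever the low C1 bits of C2 are zero (the
// RemovedBitsMask check above); AND and XOR behave analogously, and the
// shifted constant is often encodable as a smaller immediate. Minimal scalar
// sketch; the helpers are hypothetical and assume 0 < C1 < 64.
static inline uint64_t sketchOrImmOriginal(uint64_t X, unsigned C1,
                                           uint64_t C2) {
  return (X << C1) | C2;
}
static inline uint64_t sketchOrImmReassociated(uint64_t X, unsigned C1,
                                               uint64_t C2) {
  // Equal to sketchOrImmOriginal() when the low C1 bits of C2 are zero.
  return (X | (C2 >> C1)) << C1;
}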
3835 /// Convert vector increment or decrement to sub/add with an all-ones constant:
3836 /// add X, <1, 1...> --> sub X, <-1, -1...>
3837 /// sub X, <1, 1...> --> add X, <-1, -1...>
3838 /// The all-ones vector constant can be materialized using a pcmpeq instruction
3839 /// that is commonly recognized as an idiom (has no register dependency), so
3840 /// that's better/smaller than loading a splat 1 constant.
3841 bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) {
3842 assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) &&
3843 "Unexpected opcode for increment/decrement transform");
3845 EVT VT = Node->getValueType(0);
3846 assert(VT.isVector() && "Should only be called for vectors.");
3848 SDValue X = Node->getOperand(0);
3849 SDValue OneVec = Node->getOperand(1);
3851 APInt SplatVal;
3852 if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue())
3853 return false;
3855 SDLoc DL(Node);
3856 SDValue OneConstant, AllOnesVec;
3858 APInt Ones = APInt::getAllOnesValue(32);
3859 assert(VT.getSizeInBits() % 32 == 0 &&
3860 "Expected bit count to be a multiple of 32");
3861 OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32);
3862 insertDAGNode(*CurDAG, X, OneConstant);
3864 unsigned NumElts = VT.getSizeInBits() / 32;
3865 assert(NumElts > 0 && "Expected to get non-empty vector.");
3866 AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts),
3867 DL, OneConstant);
3868 insertDAGNode(*CurDAG, X, AllOnesVec);
3870 AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec);
3871 insertDAGNode(*CurDAG, X, AllOnesVec);
3873 unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
3874 SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec);
3876 ReplaceNode(Node, NewNode.getNode());
3877 SelectCode(NewNode.getNode());
3878 return true;
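// Illustration only: the identity behind combineIncDecVector() above, shown
// per 32-bit lane. Adding 1 is the same as subtracting -1 (all-ones), and an
// all-ones vector is materialized cheaply with PCMPEQ:
//   add X, <1,1,...>  -->  sub X, <-1,-1,...>
//   sub X, <1,1,...>  -->  add X, <-1,-1,...>
// Minimal per-lane sketch (wrapping arithmetic, as in the DAG); the helpers
// are hypothetical.
static inline uint32_t sketchIncLane(uint32_t Lane) {
  return Lane - 0xFFFFFFFFu;                  // == Lane + 1 (mod 2^32)
}
static inline uint32_t sketchDecLane(uint32_t Lane) {
  return Lane + 0xFFFFFFFFu;                  // == Lane - 1 (mod 2^32)
}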
3881 /// If the high bits of an 'and' operand are known zero, try setting the
3882 /// high bits of an 'and' constant operand to produce a smaller encoding by
3883 /// creating a small, sign-extended negative immediate rather than a large
3884 /// positive one. This reverses a transform in SimplifyDemandedBits that
3885 /// shrinks mask constants by clearing bits. There is also a possibility that
3886 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
3887 /// case, just replace the 'and'. Return 'true' if the node is replaced.
3888 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
3889 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
3890 // have immediate operands.
3891 MVT VT = And->getSimpleValueType(0);
3892 if (VT != MVT::i32 && VT != MVT::i64)
3893 return false;
3895 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
3896 if (!And1C)
3897 return false;
3899 // Bail out if the mask constant is already negative. It can't shrink any further.
3900 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
3901 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
3902 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
3903 // are negative too.
3904 APInt MaskVal = And1C->getAPIntValue();
3905 unsigned MaskLZ = MaskVal.countLeadingZeros();
3906 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
3907 return false;
3909 // Don't extend into the upper 32 bits of a 64 bit mask.
3910 if (VT == MVT::i64 && MaskLZ >= 32) {
3911 MaskLZ -= 32;
3912 MaskVal = MaskVal.trunc(32);
3915 SDValue And0 = And->getOperand(0);
3916 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
3917 APInt NegMaskVal = MaskVal | HighZeros;
3919 // If a negative constant would not allow a smaller encoding, there's no need
3920 // to continue. Only change the constant when we know it's a win.
3921 unsigned MinWidth = NegMaskVal.getMinSignedBits();
3922 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
3923 return false;
3925 // Extend masks if we truncated above.
3926 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
3927 NegMaskVal = NegMaskVal.zext(64);
3928 HighZeros = HighZeros.zext(64);
3931 // The variable operand must be all zeros in the top bits to allow using the
3932 // new, negative constant as the mask.
3933 if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
3934 return false;
3936 // Check if the mask is -1. In that case, this is an unnecessary instruction
3937 // that escaped earlier analysis.
3938 if (NegMaskVal.isAllOnesValue()) {
3939 ReplaceNode(And, And0.getNode());
3940 return true;
3943 // A negative mask allows a smaller encoding. Create a new 'and' node.
3944 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
3945 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
3946 ReplaceNode(And, NewAnd.getNode());
3947 SelectCode(NewAnd.getNode());
3948 return true;
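// Illustration only: an example (values chosen for this sketch) of the
// encoding win targeted by shrinkAndImmediate() above. For a 64-bit AND, a
// mask such as 0x0000FFFFFFFFFF00 is a large positive value and would need to
// be materialized into a register first. If the top 16 bits of the other
// operand are already known to be zero, the mask can be widened to
// 0xFFFFFFFFFFFFFF00 (i.e. -256), which encodes as a small sign-extended
// immediate and computes the same result. Hypothetical helper:
static inline uint64_t sketchShrunkAnd(uint64_t ValWithTop16BitsZero) {
  // Mirrors the MaskedValueIsZero(And0, HighZeros) check performed above.
  return ValWithTop16BitsZero & 0xFFFFFFFFFFFFFF00ull;
  // == ValWithTop16BitsZero & 0x0000FFFFFFFFFF00ull for such values.
}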
3951 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
3952 bool FoldedBCast, bool Masked) {
3953 if (Masked) {
3954 if (FoldedLoad) {
3955 switch (TestVT.SimpleTy) {
3956 default: llvm_unreachable("Unexpected VT!");
3957 case MVT::v16i8:
3958 return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
3959 case MVT::v8i16:
3960 return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
3961 case MVT::v4i32:
3962 return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
3963 case MVT::v2i64:
3964 return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
3965 case MVT::v32i8:
3966 return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
3967 case MVT::v16i16:
3968 return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
3969 case MVT::v8i32:
3970 return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
3971 case MVT::v4i64:
3972 return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
3973 case MVT::v64i8:
3974 return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
3975 case MVT::v32i16:
3976 return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
3977 case MVT::v16i32:
3978 return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
3979 case MVT::v8i64:
3980 return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
3984 if (FoldedBCast) {
3985 switch (TestVT.SimpleTy) {
3986 default: llvm_unreachable("Unexpected VT!");
3987 case MVT::v4i32:
3988 return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
3989 case MVT::v2i64:
3990 return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
3991 case MVT::v8i32:
3992 return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
3993 case MVT::v4i64:
3994 return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
3995 case MVT::v16i32:
3996 return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
3997 case MVT::v8i64:
3998 return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
4002 switch (TestVT.SimpleTy) {
4003 default: llvm_unreachable("Unexpected VT!");
4004 case MVT::v16i8:
4005 return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
4006 case MVT::v8i16:
4007 return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
4008 case MVT::v4i32:
4009 return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
4010 case MVT::v2i64:
4011 return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
4012 case MVT::v32i8:
4013 return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
4014 case MVT::v16i16:
4015 return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
4016 case MVT::v8i32:
4017 return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
4018 case MVT::v4i64:
4019 return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
4020 case MVT::v64i8:
4021 return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
4022 case MVT::v32i16:
4023 return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
4024 case MVT::v16i32:
4025 return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
4026 case MVT::v8i64:
4027 return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
4031 if (FoldedLoad) {
4032 switch (TestVT.SimpleTy) {
4033 default: llvm_unreachable("Unexpected VT!");
4034 case MVT::v16i8:
4035 return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
4036 case MVT::v8i16:
4037 return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
4038 case MVT::v4i32:
4039 return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
4040 case MVT::v2i64:
4041 return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
4042 case MVT::v32i8:
4043 return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
4044 case MVT::v16i16:
4045 return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
4046 case MVT::v8i32:
4047 return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
4048 case MVT::v4i64:
4049 return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
4050 case MVT::v64i8:
4051 return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
4052 case MVT::v32i16:
4053 return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
4054 case MVT::v16i32:
4055 return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
4056 case MVT::v8i64:
4057 return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
4061 if (FoldedBCast) {
4062 switch (TestVT.SimpleTy) {
4063 default: llvm_unreachable("Unexpected VT!");
4064 case MVT::v4i32:
4065 return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
4066 case MVT::v2i64:
4067 return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
4068 case MVT::v8i32:
4069 return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
4070 case MVT::v4i64:
4071 return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
4072 case MVT::v16i32:
4073 return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
4074 case MVT::v8i64:
4075 return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
4079 switch (TestVT.SimpleTy) {
4080 default: llvm_unreachable("Unexpected VT!");
4081 case MVT::v16i8:
4082 return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
4083 case MVT::v8i16:
4084 return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
4085 case MVT::v4i32:
4086 return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
4087 case MVT::v2i64:
4088 return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
4089 case MVT::v32i8:
4090 return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
4091 case MVT::v16i16:
4092 return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
4093 case MVT::v8i32:
4094 return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
4095 case MVT::v4i64:
4096 return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
4097 case MVT::v64i8:
4098 return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
4099 case MVT::v32i16:
4100 return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
4101 case MVT::v16i32:
4102 return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
4103 case MVT::v8i64:
4104 return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
4108 // Try to create a VPTESTM instruction. If InMask is not null, it will be used
4109 // to form a masked operation.
4110 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4111 SDValue InMask) {
4112 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4113 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4114 "Unexpected VT!");
4116 // Look for equal and not equal compares.
4117 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4118 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4119 return false;
4121 SDValue SetccOp0 = Setcc.getOperand(0);
4122 SDValue SetccOp1 = Setcc.getOperand(1);
4124 // Canonicalize the all zero vector to the RHS.
4125 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4126 std::swap(SetccOp0, SetccOp1);
4128 // See if we're comparing against zero.
4129 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4130 return false;
4132 SDValue N0 = SetccOp0;
4134 MVT CmpVT = N0.getSimpleValueType();
4135 MVT CmpSVT = CmpVT.getVectorElementType();
4137 // Start with both operands the same. We'll try to refine this.
4138 SDValue Src0 = N0;
4139 SDValue Src1 = N0;
4142 // Look through single use bitcasts.
4143 SDValue N0Temp = N0;
4144 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4145 N0Temp = N0.getOperand(0);
4147 // Look for single use AND.
4148 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4149 Src0 = N0Temp.getOperand(0);
4150 Src1 = N0Temp.getOperand(1);
4154 // Without VLX we need to widen the load.
4155 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4157 // We can only fold loads if the sources are unique.
4158 bool CanFoldLoads = Src0 != Src1;
4160 // Try to fold loads unless we need to widen.
4161 bool FoldedLoad = false;
4162 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
4163 if (!Widen && CanFoldLoads) {
4164 Load = Src1;
4165 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
4166 Tmp4);
4167 if (!FoldedLoad) {
4168 // AND is commutative.
4169 Load = Src0;
4170 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
4171 Tmp3, Tmp4);
4172 if (FoldedLoad)
4173 std::swap(Src0, Src1);
4177 auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
4178 // Look through single use bitcasts.
4179 if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) {
4180 Parent = Src.getNode();
4181 Src = Src.getOperand(0);
4184 if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) {
4185 auto *MemIntr = cast<MemIntrinsicSDNode>(Src);
4186 if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits())
4187 return Src;
4190 return SDValue();
4193 // If we didn't fold a load, try to match a broadcast. There is no widening
4194 // limitation for this, but only 32- and 64-bit types are supported.
4195 bool FoldedBCast = false;
4196 if (!FoldedLoad && CanFoldLoads &&
4197 (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
4198 SDNode *ParentNode = N0.getNode();
4199 if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
4200 FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
4201 Tmp1, Tmp2, Tmp3, Tmp4);
4204 // Try the other operand.
4205 if (!FoldedBCast) {
4206 SDNode *ParentNode = N0.getNode();
4207 if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
4208 FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
4209 Tmp1, Tmp2, Tmp3, Tmp4);
4210 if (FoldedBCast)
4211 std::swap(Src0, Src1);
4216 auto getMaskRC = [](MVT MaskVT) {
4217 switch (MaskVT.SimpleTy) {
4218 default: llvm_unreachable("Unexpected VT!");
4219 case MVT::v2i1: return X86::VK2RegClassID;
4220 case MVT::v4i1: return X86::VK4RegClassID;
4221 case MVT::v8i1: return X86::VK8RegClassID;
4222 case MVT::v16i1: return X86::VK16RegClassID;
4223 case MVT::v32i1: return X86::VK32RegClassID;
4224 case MVT::v64i1: return X86::VK64RegClassID;
4228 bool IsMasked = InMask.getNode() != nullptr;
4230 SDLoc dl(Root);
4232 MVT ResVT = Setcc.getSimpleValueType();
4233 MVT MaskVT = ResVT;
4234 if (Widen) {
4235 // Widen the inputs using insert_subreg or copy_to_regclass.
4236 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4237 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4238 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4239 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4240 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4241 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4242 CmpVT), 0);
4243 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4245 assert(!FoldedLoad && "Shouldn't have folded the load");
4246 if (!FoldedBCast)
4247 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4249 if (IsMasked) {
4250 // Widen the mask.
4251 unsigned RegClass = getMaskRC(MaskVT);
4252 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4253 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4254 dl, MaskVT, InMask, RC), 0);
4258 bool IsTestN = CC == ISD::SETEQ;
4259 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4260 IsMasked);
4262 MachineSDNode *CNode;
4263 if (FoldedLoad || FoldedBCast) {
4264 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4266 if (IsMasked) {
4267 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4268 Load.getOperand(0) };
4269 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4270 } else {
4271 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4272 Load.getOperand(0) };
4273 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4276 // Update the chain.
4277 ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
4278 // Record the mem-refs
4279 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()});
4280 } else {
4281 if (IsMasked)
4282 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
4283 else
4284 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
4287 // If we widened, we need to shrink the mask VT.
4288 if (Widen) {
4289 unsigned RegClass = getMaskRC(ResVT);
4290 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4291 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4292 dl, ResVT, SDValue(CNode, 0), RC);
4295 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
4296 CurDAG->RemoveDeadNode(Root);
4297 return true;
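// Illustration only: the per-element semantics that tryVPTESTM() above
// matches. A vector compare of (X & Y) against zero becomes VPTESTM/VPTESTNM,
// which write one mask bit per element:
//   VPTESTM:  k[i] = ((X[i] & Y[i]) != 0)   <- from SETNE against zero
//   VPTESTNM: k[i] = ((X[i] & Y[i]) == 0)   <- from SETEQ against zero
// Minimal scalar sketch of one lane; the helper is hypothetical.
static inline bool sketchVptestmLane(uint64_t XLane, uint64_t YLane,
                                     bool IsTestN) {
  bool AnyBitSet = (XLane & YLane) != 0;
  return IsTestN ? !AnyBitSet : AnyBitSet;
}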
4300 // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
4301 // into vpternlog.
4302 bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
4303 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
4305 MVT NVT = N->getSimpleValueType(0);
4307 // Make sure we support VPTERNLOG.
4308 if (!NVT.isVector() || !Subtarget->hasAVX512())
4309 return false;
4311 // We need VLX for 128/256-bit.
4312 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4313 return false;
4315 SDValue N0 = N->getOperand(0);
4316 SDValue N1 = N->getOperand(1);
4318 // Canonicalize AND to LHS.
4319 if (N1.getOpcode() == ISD::AND)
4320 std::swap(N0, N1);
4322 if (N0.getOpcode() != ISD::AND ||
4323 N1.getOpcode() != X86ISD::ANDNP ||
4324 !N0.hasOneUse() || !N1.hasOneUse())
4325 return false;
4327 // ANDN is not commutable, so use it to pin down A and C.
4328 SDValue A = N1.getOperand(0);
4329 SDValue C = N1.getOperand(1);
4331 // AND is commutable, if one operand matches A, the other operand is B.
4332 // Otherwise this isn't a match.
4333 SDValue B;
4334 if (N0.getOperand(0) == A)
4335 B = N0.getOperand(1);
4336 else if (N0.getOperand(1) == A)
4337 B = N0.getOperand(0);
4338 else
4339 return false;
4341 SDLoc dl(N);
4342 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
4343 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
4344 ReplaceNode(N, Ternlog.getNode());
4345 SelectCode(Ternlog.getNode());
4346 return true;
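// Illustration only: why the VPTERNLOG immediate above is 0xCA. The bitselect
// (or (and A, B), (andn A, C)) picks B where A is 1 and C where A is 0.
// VPTERNLOG's immediate is the truth table indexed by the bit triple (A,B,C):
//   index (A,B,C): 111 110 101 100 011 010 001 000
//   result bit   :  1   1   0   0   1   0   1   0   ->  0b11001010 == 0xCA
// Minimal per-bit sketch; the helper is hypothetical.
static inline uint64_t sketchBitSelect(uint64_t A, uint64_t B, uint64_t C) {
  return (A & B) | (~A & C);   // what VPTERNLOG with imm 0xCA computes
}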
4349 void X86DAGToDAGISel::Select(SDNode *Node) {
4350 MVT NVT = Node->getSimpleValueType(0);
4351 unsigned Opcode = Node->getOpcode();
4352 SDLoc dl(Node);
4354 if (Node->isMachineOpcode()) {
4355 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
4356 Node->setNodeId(-1);
4357 return; // Already selected.
4360 switch (Opcode) {
4361 default: break;
4362 case ISD::INTRINSIC_VOID: {
4363 unsigned IntNo = Node->getConstantOperandVal(1);
4364 switch (IntNo) {
4365 default: break;
4366 case Intrinsic::x86_sse3_monitor:
4367 case Intrinsic::x86_monitorx:
4368 case Intrinsic::x86_clzero: {
4369 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
4371 unsigned Opc = 0;
4372 switch (IntNo) {
4373 default: llvm_unreachable("Unexpected intrinsic!");
4374 case Intrinsic::x86_sse3_monitor:
4375 if (!Subtarget->hasSSE3())
4376 break;
4377 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
4378 break;
4379 case Intrinsic::x86_monitorx:
4380 if (!Subtarget->hasMWAITX())
4381 break;
4382 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
4383 break;
4384 case Intrinsic::x86_clzero:
4385 if (!Subtarget->hasCLZERO())
4386 break;
4387 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
4388 break;
4391 if (Opc) {
4392 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
4393 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
4394 Node->getOperand(2), SDValue());
4395 SDValue InFlag = Chain.getValue(1);
4397 if (IntNo == Intrinsic::x86_sse3_monitor ||
4398 IntNo == Intrinsic::x86_monitorx) {
4399 // Copy the other two operands to ECX and EDX.
4400 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
4401 InFlag);
4402 InFlag = Chain.getValue(1);
4403 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
4404 InFlag);
4405 InFlag = Chain.getValue(1);
4408 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
4409 { Chain, InFlag});
4410 ReplaceNode(Node, CNode);
4411 return;
4416 break;
4418 case ISD::BRIND: {
4419 if (Subtarget->isTargetNaCl())
4420 // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
4421 // leave the instruction alone.
4422 break;
4423 if (Subtarget->isTarget64BitILP32()) {
4424 // Converts a 32-bit register to a 64-bit, zero-extended version of
4425 // it. This is needed because x86-64 can do many things, but jmp %r32
4426 // ain't one of them.
4427 const SDValue &Target = Node->getOperand(1);
4428 assert(Target.getSimpleValueType() == llvm::MVT::i32);
4429 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
4430 SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
4431 Node->getOperand(0), ZextTarget);
4432 ReplaceNode(Node, Brind.getNode());
4433 SelectCode(ZextTarget.getNode());
4434 SelectCode(Brind.getNode());
4435 return;
4437 break;
4439 case X86ISD::GlobalBaseReg:
4440 ReplaceNode(Node, getGlobalBaseReg());
4441 return;
4443 case ISD::BITCAST:
4444 // Just drop all 128/256/512-bit bitcasts.
4445 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
4446 NVT == MVT::f128) {
4447 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
4448 CurDAG->RemoveDeadNode(Node);
4449 return;
4451 break;
4453 case ISD::VSELECT: {
4454 // Replace VSELECT with non-mask conditions with BLENDV.
4455 if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
4456 break;
4458 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
4459 SDValue Blendv = CurDAG->getNode(
4460 X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
4461 Node->getOperand(1), Node->getOperand(2));
4462 ReplaceNode(Node, Blendv.getNode());
4463 SelectCode(Blendv.getNode());
4464 // We already called ReplaceUses.
4465 return;
4468 case ISD::SRL:
4469 if (matchBitExtract(Node))
4470 return;
4471 LLVM_FALLTHROUGH;
4472 case ISD::SRA:
4473 case ISD::SHL:
4474 if (tryShiftAmountMod(Node))
4475 return;
4476 break;
4478 case ISD::AND:
4479 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
4480 // Try to form a masked VPTESTM. Operands can be in either order.
4481 SDValue N0 = Node->getOperand(0);
4482 SDValue N1 = Node->getOperand(1);
4483 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
4484 tryVPTESTM(Node, N0, N1))
4485 return;
4486 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
4487 tryVPTESTM(Node, N1, N0))
4488 return;
4491 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
4492 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4493 CurDAG->RemoveDeadNode(Node);
4494 return;
4496 if (matchBitExtract(Node))
4497 return;
4498 if (AndImmShrink && shrinkAndImmediate(Node))
4499 return;
4501 LLVM_FALLTHROUGH;
4502 case ISD::OR:
4503 case ISD::XOR:
4504 if (tryShrinkShlLogicImm(Node))
4505 return;
4507 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
4508 return;
4510 LLVM_FALLTHROUGH;
4511 case ISD::ADD:
4512 case ISD::SUB: {
4513 if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() &&
4514 combineIncDecVector(Node))
4515 return;
4517 // Try to avoid folding immediates with multiple uses for optsize.
4518 // This code tries to select to register form directly to avoid going
4519 // through the isel table which might fold the immediate. We can't change
4520 // the add/sub/and/or/xor-with-immediate patterns in the
4521 // tablegen files to check the immediate use count without making the patterns
4522 // unavailable to the fast-isel table.
4523 if (!OptForSize)
4524 break;
4526 // Only handle i8/i16/i32/i64.
4527 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
4528 break;
4530 SDValue N0 = Node->getOperand(0);
4531 SDValue N1 = Node->getOperand(1);
4533 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
4534 if (!Cst)
4535 break;
4537 int64_t Val = Cst->getSExtValue();
4539 // Make sure it's an immediate that is considered foldable.
4540 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
4541 if (!isInt<8>(Val) && !isInt<32>(Val))
4542 break;
4544 // If this can match to INC/DEC, let it go.
4545 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
4546 break;
4548 // Check if we should avoid folding this immediate.
4549 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
4550 break;
4552 // We should not fold the immediate. So we need a register form instead.
4553 unsigned ROpc, MOpc;
4554 switch (NVT.SimpleTy) {
4555 default: llvm_unreachable("Unexpected VT!");
4556 case MVT::i8:
4557 switch (Opcode) {
4558 default: llvm_unreachable("Unexpected opcode!");
4559 case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
4560 case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
4561 case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
4562 case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break;
4563 case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
4565 break;
4566 case MVT::i16:
4567 switch (Opcode) {
4568 default: llvm_unreachable("Unexpected opcode!");
4569 case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
4570 case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
4571 case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
4572 case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break;
4573 case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
4575 break;
4576 case MVT::i32:
4577 switch (Opcode) {
4578 default: llvm_unreachable("Unexpected opcode!");
4579 case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
4580 case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
4581 case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
4582 case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break;
4583 case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
4585 break;
4586 case MVT::i64:
4587 switch (Opcode) {
4588 default: llvm_unreachable("Unexpected opcode!");
4589 case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
4590 case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
4591 case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
4592 case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break;
4593 case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
4595 break;
4598 // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
4600 // If this is not a subtract, we can still try to fold a load.
4601 if (Opcode != ISD::SUB) {
4602 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4603 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4604 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4605 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4606 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4607 // Update the chain.
4608 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
4609 // Record the mem-refs
4610 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
4611 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4612 CurDAG->RemoveDeadNode(Node);
4613 return;
4617 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
4618 return;
4621 case X86ISD::SMUL:
4622 // i16/i32/i64 are handled with isel patterns.
4623 if (NVT != MVT::i8)
4624 break;
4625 LLVM_FALLTHROUGH;
4626 case X86ISD::UMUL: {
4627 SDValue N0 = Node->getOperand(0);
4628 SDValue N1 = Node->getOperand(1);
4630 unsigned LoReg, ROpc, MOpc;
4631 switch (NVT.SimpleTy) {
4632 default: llvm_unreachable("Unsupported VT!");
4633 case MVT::i8:
4634 LoReg = X86::AL;
4635 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
4636 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
4637 break;
4638 case MVT::i16:
4639 LoReg = X86::AX;
4640 ROpc = X86::MUL16r;
4641 MOpc = X86::MUL16m;
4642 break;
4643 case MVT::i32:
4644 LoReg = X86::EAX;
4645 ROpc = X86::MUL32r;
4646 MOpc = X86::MUL32m;
4647 break;
4648 case MVT::i64:
4649 LoReg = X86::RAX;
4650 ROpc = X86::MUL64r;
4651 MOpc = X86::MUL64m;
4652 break;
4655 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4656 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4657 // Multiply is commutative.
4658 if (!FoldedLoad) {
4659 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4660 if (FoldedLoad)
4661 std::swap(N0, N1);
4664 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
4665 N0, SDValue()).getValue(1);
4667 MachineSDNode *CNode;
4668 if (FoldedLoad) {
4669 // i16/i32/i64 use an instruction that produces a low and high result even
4670 // though only the low result is used.
4671 SDVTList VTs;
4672 if (NVT == MVT::i8)
4673 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4674 else
4675 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
4677 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4678 InFlag };
4679 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4681 // Update the chain.
4682 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
4683 // Record the mem-refs
4684 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4685 } else {
4686 // i16/i32/i64 use an instruction that produces a low and high result even
4687 // though only the low result is used.
4688 SDVTList VTs;
4689 if (NVT == MVT::i8)
4690 VTs = CurDAG->getVTList(NVT, MVT::i32);
4691 else
4692 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
4694 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
4697 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4698 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
4699 CurDAG->RemoveDeadNode(Node);
4700 return;
4703 case ISD::SMUL_LOHI:
4704 case ISD::UMUL_LOHI: {
4705 SDValue N0 = Node->getOperand(0);
4706 SDValue N1 = Node->getOperand(1);
4708 unsigned Opc, MOpc;
4709 bool isSigned = Opcode == ISD::SMUL_LOHI;
4710 if (!isSigned) {
4711 switch (NVT.SimpleTy) {
4712 default: llvm_unreachable("Unsupported VT!");
4713 case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
4714 case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
4716 } else {
4717 switch (NVT.SimpleTy) {
4718 default: llvm_unreachable("Unsupported VT!");
4719 case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
4720 case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
4724 unsigned SrcReg, LoReg, HiReg;
4725 switch (Opc) {
4726 default: llvm_unreachable("Unknown MUL opcode!");
4727 case X86::IMUL32r:
4728 case X86::MUL32r:
4729 SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
4730 break;
4731 case X86::IMUL64r:
4732 case X86::MUL64r:
4733 SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
4734 break;
4737 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4738 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4739 // Multiply is commutative.
4740 if (!foldedLoad) {
4741 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4742 if (foldedLoad)
4743 std::swap(N0, N1);
4746 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
4747 N0, SDValue()).getValue(1);
4748 if (foldedLoad) {
4749 SDValue Chain;
4750 MachineSDNode *CNode = nullptr;
4751 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4752 InFlag };
4753 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
4754 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4755 Chain = SDValue(CNode, 0);
4756 InFlag = SDValue(CNode, 1);
4758 // Update the chain.
4759 ReplaceUses(N1.getValue(1), Chain);
4760 // Record the mem-refs
4761 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4762 } else {
4763 SDValue Ops[] = { N1, InFlag };
4764 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
4765 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4766 InFlag = SDValue(CNode, 0);
4769 // Copy the low half of the result, if it is needed.
4770 if (!SDValue(Node, 0).use_empty()) {
4771 assert(LoReg && "Register for low half is not defined!");
4772 SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
4773 NVT, InFlag);
4774 InFlag = ResLo.getValue(2);
4775 ReplaceUses(SDValue(Node, 0), ResLo);
4776 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
4777 dbgs() << '\n');
4779 // Copy the high half of the result, if it is needed.
4780 if (!SDValue(Node, 1).use_empty()) {
4781 assert(HiReg && "Register for high half is not defined!");
4782 SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
4783 NVT, InFlag);
4784 InFlag = ResHi.getValue(2);
4785 ReplaceUses(SDValue(Node, 1), ResHi);
4786 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
4787 dbgs() << '\n');
4790 CurDAG->RemoveDeadNode(Node);
4791 return;
4794 case ISD::SDIVREM:
4795 case ISD::UDIVREM: {
4796 SDValue N0 = Node->getOperand(0);
4797 SDValue N1 = Node->getOperand(1);
4799 unsigned Opc, MOpc;
4800 bool isSigned = Opcode == ISD::SDIVREM;
4801 if (!isSigned) {
4802 switch (NVT.SimpleTy) {
4803 default: llvm_unreachable("Unsupported VT!");
4804 case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
4805 case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
4806 case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
4807 case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
4809 } else {
4810 switch (NVT.SimpleTy) {
4811 default: llvm_unreachable("Unsupported VT!");
4812 case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
4813 case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
4814 case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
4815 case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
4819 unsigned LoReg, HiReg, ClrReg;
4820 unsigned SExtOpcode;
4821 switch (NVT.SimpleTy) {
4822 default: llvm_unreachable("Unsupported VT!");
4823 case MVT::i8:
4824 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
4825 SExtOpcode = 0; // Not used.
4826 break;
4827 case MVT::i16:
4828 LoReg = X86::AX; HiReg = X86::DX;
4829 ClrReg = X86::DX;
4830 SExtOpcode = X86::CWD;
4831 break;
4832 case MVT::i32:
4833 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
4834 SExtOpcode = X86::CDQ;
4835 break;
4836 case MVT::i64:
4837 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
4838 SExtOpcode = X86::CQO;
4839 break;
4842 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4843 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4844 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
4846 SDValue InFlag;
4847 if (NVT == MVT::i8) {
4848 // Special case for div8, just use a move with zero extension to AX to
4849 // clear the upper 8 bits (AH).
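// Illustrative sequence for an unsigned i8 divide (register names are just
// an example):
//   movzbw %cl, %ax     ; widen the dividend into AX (AH cleared)
//   divb   %bl          ; AX / BL  ->  AL = quotient, AH = remainder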
4850 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
4851 MachineSDNode *Move;
4852 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4853 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4854 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
4855 : X86::MOVZX16rm8;
4856 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
4857 Chain = SDValue(Move, 1);
4858 ReplaceUses(N0.getValue(1), Chain);
4859 // Record the mem-refs
4860 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
4861 } else {
4862 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
4863 : X86::MOVZX16rr8;
4864 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
4865 Chain = CurDAG->getEntryNode();
4867 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
4868 SDValue());
4869 InFlag = Chain.getValue(1);
4870 } else {
4871 InFlag =
4872 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
4873 LoReg, N0, SDValue()).getValue(1);
4874 if (isSigned && !signBitIsZero) {
4875 // Sign extend the low part into the high part.
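// e.g. for i32 this emits CDQ ("cltd"), which sign-extends EAX into EDX
// ahead of the IDIV; CWD/CQO do the same for i16/i64.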
4876 InFlag =
4877 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
4878 } else {
4879 // Zero out the high part, effectively zero extending the input.
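// e.g. an unsigned i32 divide becomes the usual
//   xorl %edx, %edx
//   divl %ecx
// idiom: MOV32r0 is the xor-zeroing pseudo, and the i16/i64 cases below
// just narrow or widen that 32-bit zero with subregister operations.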
4880 SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
4881 switch (NVT.SimpleTy) {
4882 case MVT::i16:
4883 ClrNode =
4884 SDValue(CurDAG->getMachineNode(
4885 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
4886 CurDAG->getTargetConstant(X86::sub_16bit, dl,
4887 MVT::i32)),
4889 break;
4890 case MVT::i32:
4891 break;
4892 case MVT::i64:
4893 ClrNode =
4894 SDValue(CurDAG->getMachineNode(
4895 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
4896 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
4897 CurDAG->getTargetConstant(X86::sub_32bit, dl,
4898 MVT::i32)),
4900 break;
4901 default:
4902 llvm_unreachable("Unexpected division source");
4905 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
4906 ClrNode, InFlag).getValue(1);
4910 if (foldedLoad) {
4911 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4912 InFlag };
4913 MachineSDNode *CNode =
4914 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
4915 InFlag = SDValue(CNode, 1);
4916 // Update the chain.
4917 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
4918 // Record the mem-refs
4919 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4920 } else {
4921 InFlag =
4922 SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
4925 // Prevent use of AH in a REX instruction by explicitly copying it to
4926 // an ABCD_L register.
4928 // The current assumption of the register allocator is that isel
4929 // won't generate explicit references to the GR8_ABCD_H registers. If
4930 // the allocator and/or the backend get enhanced to be more robust in
4931 // that regard, this can be, and should be, removed.
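// Background (general x86 encoding fact): once an instruction carries a REX
// prefix, the AH/BH/CH/DH encodings are reinterpreted as SPL/BPL/SIL/DIL,
// so AH is not addressable there; the _NOREX movzx/movsx forms keep this
// copy in register classes that never require a REX prefix.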
4932 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
4933 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
4934 unsigned AHExtOpcode =
4935 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
4937 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
4938 MVT::Glue, AHCopy, InFlag);
4939 SDValue Result(RNode, 0);
4940 InFlag = SDValue(RNode, 1);
4942 Result =
4943 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
4945 ReplaceUses(SDValue(Node, 1), Result);
4946 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4947 dbgs() << '\n');
4949 // Copy the division (low) result, if it is needed.
4950 if (!SDValue(Node, 0).use_empty()) {
4951 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4952 LoReg, NVT, InFlag);
4953 InFlag = Result.getValue(2);
4954 ReplaceUses(SDValue(Node, 0), Result);
4955 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4956 dbgs() << '\n');
4958 // Copy the remainder (high) result, if it is needed.
4959 if (!SDValue(Node, 1).use_empty()) {
4960 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4961 HiReg, NVT, InFlag);
4962 InFlag = Result.getValue(2);
4963 ReplaceUses(SDValue(Node, 1), Result);
4964 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4965 dbgs() << '\n');
4967 CurDAG->RemoveDeadNode(Node);
4968 return;
4971 case X86ISD::CMP: {
4972 SDValue N0 = Node->getOperand(0);
4973 SDValue N1 = Node->getOperand(1);
4975 // Optimizations for TEST compares.
4976 if (!isNullConstant(N1))
4977 break;
4979 // Save the original VT of the compare.
4980 MVT CmpVT = N0.getSimpleValueType();
4982 // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
4983 // by a test instruction. The test should be removed later by
4984 // analyzeCompare if we are using only the zero flag.
4985 // TODO: Should we check the users and use the BEXTR flags directly?
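// Roughly, a pattern like
//   (X86cmp (and (srl X, 4), 0xFFF), 0)
// becomes a BEXTR extracting 12 bits starting at bit 4, followed by a
// TEST32rr/TEST64rr on the extracted value (example values only).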
4986 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
4987 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
4988 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
4989 : X86::TEST32rr;
4990 SDValue BEXTR = SDValue(NewNode, 0);
4991 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
4992 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4993 CurDAG->RemoveDeadNode(Node);
4994 return;
4998 // We can peek through truncates, but we need to be careful below.
4999 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
5000 N0 = N0.getOperand(0);
5002 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
5003 // use a smaller encoding.
5004 // Look past the truncate if CMP is the only use of it.
5005 if (N0.getOpcode() == ISD::AND &&
5006 N0.getNode()->hasOneUse() &&
5007 N0.getValueType() != MVT::i8) {
5008 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5009 if (!C) break;
5010 uint64_t Mask = C->getZExtValue();
5012 // Check if we can replace AND+IMM64 with a shift. This is possible for
5013 // masks like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
5014 // flag.
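// e.g. (X & 0xFFFFFFFF00000000) == 0 cannot use a TEST with an immediate
// (there is no 64-bit immediate form), but it is equivalent to
// (X >> 32) == 0, so a SHR64ri followed by TEST64rr produces the same ZF.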
5015 if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
5016 onlyUsesZeroFlag(SDValue(Node, 0))) {
5017 if (isMask_64(~Mask)) {
5018 unsigned TrailingZeros = countTrailingZeros(Mask);
5019 SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
5020 SDValue Shift =
5021 SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
5022 N0.getOperand(0), Imm), 0);
5023 MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
5024 MVT::i32, Shift, Shift);
5025 ReplaceNode(Node, Test);
5026 return;
5028 if (isMask_64(Mask)) {
5029 unsigned LeadingZeros = countLeadingZeros(Mask);
5030 SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
5031 SDValue Shift =
5032 SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
5033 N0.getOperand(0), Imm), 0);
5034 MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
5035 MVT::i32, Shift, Shift);
5036 ReplaceNode(Node, Test);
5037 return;
5041 MVT VT;
5042 int SubRegOp;
5043 unsigned ROpc, MOpc;
5045 // For each of these checks we need to be careful if the sign flag is
5046 // being used. It is only safe to use the sign flag in two conditions,
5047 // either the sign bit in the shrunken mask is zero or the final test
5048 // size is equal to the original compare size.
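// e.g. shrinking "testl $0x80, %eax" to "testb $0x80, %al" moves the mask
// bit into the sign position of the narrower test, so SF would change
// meaning; hence the extra conditions on each case below.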
5050 if (isUInt<8>(Mask) &&
5051 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
5052 hasNoSignFlagUses(SDValue(Node, 0)))) {
5053 // For example, convert "testl %eax, $8" to "testb %al, $8"
5054 VT = MVT::i8;
5055 SubRegOp = X86::sub_8bit;
5056 ROpc = X86::TEST8ri;
5057 MOpc = X86::TEST8mi;
5058 } else if (OptForMinSize && isUInt<16>(Mask) &&
5059 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
5060 hasNoSignFlagUses(SDValue(Node, 0)))) {
5061 // For example, convert "testl %eax, $32776" to "testw %ax, $32776".
5062 // NOTE: We only want to form TESTW instructions if optimizing for
5063 // min size. Otherwise we only save one byte and possibly get a length-
5064 // changing prefix penalty in the decoders.
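// (TEST16ri carries a 0x66 operand-size prefix together with a 16-bit
// immediate, the classic length-changing-prefix stall pattern on many
// Intel front ends.)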
5065 VT = MVT::i16;
5066 SubRegOp = X86::sub_16bit;
5067 ROpc = X86::TEST16ri;
5068 MOpc = X86::TEST16mi;
5069 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
5070 ((!(Mask & 0x80000000) &&
5071 // Without minsize 16-bit Cmps can get here so we need to
5072 // be sure we calculate the correct sign flag if needed.
5073 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
5074 CmpVT == MVT::i32 ||
5075 hasNoSignFlagUses(SDValue(Node, 0)))) {
5076 // For example, convert "testq %rax, $268468232" to "testl %eax, $268468232".
5077 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
5078 // Otherwise, we find ourselves in a position where we have to do
5079 // promotion. If previous passes did not promote the and, we assume
5080 // they had a good reason not to and do not promote here.
5081 VT = MVT::i32;
5082 SubRegOp = X86::sub_32bit;
5083 ROpc = X86::TEST32ri;
5084 MOpc = X86::TEST32mi;
5085 } else {
5086 // No eligible transformation was found.
5087 break;
5090 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
5091 SDValue Reg = N0.getOperand(0);
5093 // Emit a testl or testw.
5094 MachineSDNode *NewNode;
5095 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5096 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5097 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
5098 Reg.getOperand(0) };
5099 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
5100 // Update the chain.
5101 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
5102 // Record the mem-refs
5103 CurDAG->setNodeMemRefs(NewNode,
5104 {cast<LoadSDNode>(Reg)->getMemOperand()});
5105 } else {
5106 // Extract the subregister if necessary.
5107 if (N0.getValueType() != VT)
5108 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
5110 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
5112 // Replace CMP with TEST.
5113 ReplaceNode(Node, NewNode);
5114 return;
5116 break;
5118 case X86ISD::PCMPISTR: {
5119 if (!Subtarget->hasSSE42())
5120 break;
5122 bool NeedIndex = !SDValue(Node, 0).use_empty();
5123 bool NeedMask = !SDValue(Node, 1).use_empty();
5124 // We can't fold a load if we are going to make two instructions.
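// (Both the index and the mask form would want the same memory operand, and
// folding it into two instructions would duplicate the load, so keep it as
// a register operand when both results are live.)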
5125 bool MayFoldLoad = !NeedIndex || !NeedMask;
5127 MachineSDNode *CNode;
5128 if (NeedMask) {
5129 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
5130 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
5131 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
5132 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
5134 if (NeedIndex || !NeedMask) {
5135 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
5136 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
5137 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
5138 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5141 // Connect the flag usage to the last instruction created.
5142 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
5143 CurDAG->RemoveDeadNode(Node);
5144 return;
5146 case X86ISD::PCMPESTR: {
5147 if (!Subtarget->hasSSE42())
5148 break;
5150 // Copy the two implicit register inputs.
5151 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
5152 Node->getOperand(1),
5153 SDValue()).getValue(1);
5154 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
5155 Node->getOperand(3), InFlag).getValue(1);
5157 bool NeedIndex = !SDValue(Node, 0).use_empty();
5158 bool NeedMask = !SDValue(Node, 1).use_empty();
5159 // We can't fold a load if we are going to make two instructions.
5160 bool MayFoldLoad = !NeedIndex || !NeedMask;
5162 MachineSDNode *CNode;
5163 if (NeedMask) {
5164 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
5165 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
5166 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
5167 InFlag);
5168 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
5170 if (NeedIndex || !NeedMask) {
5171 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
5172 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
5173 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
5174 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5176 // Connect the flag usage to the last instruction created.
5177 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
5178 CurDAG->RemoveDeadNode(Node);
5179 return;
5182 case ISD::SETCC: {
5183 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
5184 return;
5186 break;
5189 case ISD::STORE:
5190 if (foldLoadStoreIntoMemOperand(Node))
5191 return;
5192 break;
5193 case ISD::FCEIL:
5194 case ISD::FFLOOR:
5195 case ISD::FTRUNC:
5196 case ISD::FNEARBYINT:
5197 case ISD::FRINT: {
5198 // Replace fp rounding ops with their X86-specific equivalents so we don't
5199 // need 2 sets of patterns.
5200 // FIXME: This can only happen when the nodes started as STRICT_* and have
5201 // been mutated into their non-STRICT equivalents. Eventually this
5202 // mutation will be removed and we should switch the STRICT_ nodes to a
5203 // strict version of RNDSCALE in PreProcessISelDAG.
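// The immediates below follow the (V)ROUND/VRNDSCALE encoding: bits [1:0]
// select the rounding mode (00 nearest, 01 down, 10 up, 11 truncate),
// bit 2 says "use MXCSR.RC instead", and bit 3 suppresses the precision
// exception. That is why FRINT (0x4) may raise inexact while
// FNEARBYINT (0xC) does not.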
5204 unsigned Imm;
5205 switch (Node->getOpcode()) {
5206 default: llvm_unreachable("Unexpected opcode!");
5207 case ISD::FCEIL: Imm = 0xA; break;
5208 case ISD::FFLOOR: Imm = 0x9; break;
5209 case ISD::FTRUNC: Imm = 0xB; break;
5210 case ISD::FNEARBYINT: Imm = 0xC; break;
5211 case ISD::FRINT: Imm = 0x4; break;
5213 SDLoc dl(Node);
5214 SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, Node->getValueType(0),
5215 Node->getOperand(0),
5216 CurDAG->getTargetConstant(Imm, dl, MVT::i8));
5217 ReplaceNode(Node, Res.getNode());
5218 SelectCode(Res.getNode());
5219 return;
5223 SelectCode(Node);
5226 bool X86DAGToDAGISel::
5227 SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
5228 std::vector<SDValue> &OutOps) {
5229 SDValue Op0, Op1, Op2, Op3, Op4;
5230 switch (ConstraintID) {
5231 default:
5232 llvm_unreachable("Unexpected asm memory constraint");
5233 case InlineAsm::Constraint_i:
5234 // FIXME: It seems strange that 'i' is needed here since it's supposed to
5235 // be an immediate and not a memory constraint.
5236 LLVM_FALLTHROUGH;
5237 case InlineAsm::Constraint_o: // offsetable ??
5238 case InlineAsm::Constraint_v: // not offsetable ??
5239 case InlineAsm::Constraint_m: // memory
5240 case InlineAsm::Constraint_X:
5241 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
5242 return true;
5243 break;
5246 OutOps.push_back(Op0);
5247 OutOps.push_back(Op1);
5248 OutOps.push_back(Op2);
5249 OutOps.push_back(Op3);
5250 OutOps.push_back(Op4);
5251 return false;
5254 /// This pass converts a legalized DAG into a X86-specific DAG,
5255 /// ready for instruction scheduling.
5256 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
5257 CodeGenOpt::Level OptLevel) {
5258 return new X86DAGToDAGISel(TM, OptLevel);