llvm/lib/Target/X86/X86FastISel.cpp

   1 //===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file defines the X86-specific support for the FastISel class. Much
  10 // of the target-specific code is generated by tablegen in the file
  11 // X86GenFastISel.inc, which is #included here.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86.h"
  16 #include "X86CallingConv.h"
  17 #include "X86InstrBuilder.h"
  18 #include "X86InstrInfo.h"
  19 #include "X86MachineFunctionInfo.h"
  20 #include "X86RegisterInfo.h"
  21 #include "X86Subtarget.h"
  22 #include "X86TargetMachine.h"
  23 #include "llvm/Analysis/BranchProbabilityInfo.h"
  24 #include "llvm/CodeGen/FastISel.h"
  25 #include "llvm/CodeGen/FunctionLoweringInfo.h"
  26 #include "llvm/CodeGen/MachineConstantPool.h"
  27 #include "llvm/CodeGen/MachineFrameInfo.h"
  28 #include "llvm/CodeGen/MachineRegisterInfo.h"
  29 #include "llvm/IR/CallingConv.h"
  30 #include "llvm/IR/DebugInfo.h"
  31 #include "llvm/IR/DerivedTypes.h"
  32 #include "llvm/IR/GetElementPtrTypeIterator.h"
  33 #include "llvm/IR/GlobalVariable.h"
  34 #include "llvm/IR/Instructions.h"
  35 #include "llvm/IR/IntrinsicInst.h"
  36 #include "llvm/IR/IntrinsicsX86.h"
  37 #include "llvm/IR/Operator.h"
  38 #include "llvm/MC/MCAsmInfo.h"
  39 #include "llvm/MC/MCSymbol.h"
  40 #include "llvm/Support/ErrorHandling.h"
  41 #include "llvm/Target/TargetOptions.h"
  42 using namespace llvm;
  43
  44 namespace {
  45
  46 class X86FastISel final : public FastISel {
  47   /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
  48   /// make the right decision when generating code for different targets.
  49   const X86Subtarget *Subtarget;
  50
  51 public:
  52   explicit X86FastISel(FunctionLoweringInfo &funcInfo,
  53                        const TargetLibraryInfo *libInfo)
  54       : FastISel(funcInfo, libInfo) {
  55     Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
  56   }
  57
  58   bool fastSelectInstruction(const Instruction *I) override;
  59
  60   /// The specified machine instr operand is a vreg, and that
  61   /// vreg is being provided by the specified load instruction.  If possible,
  62   /// try to fold the load as an operand to the instruction, returning true if
  63   /// possible.
  64   bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
  65                            const LoadInst *LI) override;
  66
  67   bool fastLowerArguments() override;
  68   bool fastLowerCall(CallLoweringInfo &CLI) override;
  69   bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
  70
  71 #include "X86GenFastISel.inc"
  72
  73 private:
  74   bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT,
  75                           const DebugLoc &DL);
  76
  77   bool X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
  78                        unsigned &ResultReg, unsigned Alignment = 1);
  79
  80   bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
  81                         MachineMemOperand *MMO = nullptr, bool Aligned = false);
  82   bool X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
  83                         MachineMemOperand *MMO = nullptr, bool Aligned = false);
  84
  85   bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
  86                          unsigned &ResultReg);
  87
  88   bool X86SelectAddress(const Value *V, X86AddressMode &AM);
  89   bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
  90
  91   bool X86SelectLoad(const Instruction *I);
  92
  93   bool X86SelectStore(const Instruction *I);
  94
  95   bool X86SelectRet(const Instruction *I);
  96
  97   bool X86SelectCmp(const Instruction *I);
  98
  99   bool X86SelectZExt(const Instruction *I);
 100
 101   bool X86SelectSExt(const Instruction *I);
 102
 103   bool X86SelectBranch(const Instruction *I);
 104
 105   bool X86SelectShift(const Instruction *I);
 106
 107   bool X86SelectDivRem(const Instruction *I);
 108
 109   bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
 110
 111   bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
 112
 113   bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
 114
 115   bool X86SelectSelect(const Instruction *I);
 116
 117   bool X86SelectTrunc(const Instruction *I);
 118
 119   bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc,
 120                                const TargetRegisterClass *RC);
 121
 122   bool X86SelectFPExt(const Instruction *I);
 123   bool X86SelectFPTrunc(const Instruction *I);
 124   bool X86SelectSIToFP(const Instruction *I);
 125   bool X86SelectUIToFP(const Instruction *I);
 126   bool X86SelectIntToFP(const Instruction *I, bool IsSigned);
 127
 128   const X86InstrInfo *getInstrInfo() const {
 129     return Subtarget->getInstrInfo();
 130   }
 131   const X86TargetMachine *getTargetMachine() const {
 132     return static_cast<const X86TargetMachine *>(&TM);
 133   }
 134
 135   bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
 136
 137   unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
 138   unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
 139   unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
 140   unsigned fastMaterializeConstant(const Constant *C) override;
 141
 142   unsigned fastMaterializeAlloca(const AllocaInst *C) override;
 143
 144   unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;
 145
 146   /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
 147   /// computed in an SSE register, not on the X87 floating point stack.
 148   bool isScalarFPTypeInSSEReg(EVT VT) const {
 149     return (VT == MVT::f64 && Subtarget->hasSSE2()) ||
 150            (VT == MVT::f32 && Subtarget->hasSSE1()) || VT == MVT::f16;
 151   }
 152
 153   bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
 154
 155   bool IsMemcpySmall(uint64_t Len);
 156
 157   bool TryEmitSmallMemcpy(X86AddressMode DestAM,
 158                           X86AddressMode SrcAM, uint64_t Len);
 159
 160   bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
 161                             const Value *Cond);
 162
 163   const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
 164                                             X86AddressMode &AM);
 165
 166   unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode,
 167                              const TargetRegisterClass *RC, unsigned Op0,
 168                              unsigned Op1, unsigned Op2, unsigned Op3);
 169 };
 170
 171 } // end anonymous namespace.
 172
 173 static std::pair<unsigned, bool>
 174 getX86SSEConditionCode(CmpInst::Predicate Predicate) {
 175   unsigned CC;
 176   bool NeedSwap = false;
 177
 178   // SSE Condition code mapping:
 179   //  0 - EQ
 180   //  1 - LT
 181   //  2 - LE
 182   //  3 - UNORD
 183   //  4 - NEQ
 184   //  5 - NLT
 185   //  6 - NLE
 186   //  7 - ORD
 187   switch (Predicate) {
 188   default: llvm_unreachable("Unexpected predicate");
 189   case CmpInst::FCMP_OEQ: CC = 0;          break;
 190   case CmpInst::FCMP_OGT: NeedSwap = true; [[fallthrough]];
 191   case CmpInst::FCMP_OLT: CC = 1;          break;
 192   case CmpInst::FCMP_OGE: NeedSwap = true; [[fallthrough]];
 193   case CmpInst::FCMP_OLE: CC = 2;          break;
 194   case CmpInst::FCMP_UNO: CC = 3;          break;
 195   case CmpInst::FCMP_UNE: CC = 4;          break;
 196   case CmpInst::FCMP_ULE: NeedSwap = true; [[fallthrough]];
 197   case CmpInst::FCMP_UGE: CC = 5;          break;
 198   case CmpInst::FCMP_ULT: NeedSwap = true; [[fallthrough]];
 199   case CmpInst::FCMP_UGT: CC = 6;          break;
 200   case CmpInst::FCMP_ORD: CC = 7;          break;
 201   case CmpInst::FCMP_UEQ: CC = 8;          break;
 202   case CmpInst::FCMP_ONE: CC = 12;         break;
 203   }
 204
 205   return std::make_pair(CC, NeedSwap);
 206 }
 207
 208 /// Adds a complex addressing mode to the given machine instr builder.
 209 /// Note, this will constrain the index register.  If its not possible to
 210 /// constrain the given index register, then a new one will be created.  The
 211 /// IndexReg field of the addressing mode will be updated to match in this case.
 212 const MachineInstrBuilder &
 213 X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
 214                             X86AddressMode &AM) {
 215   // First constrain the index register.  It needs to be a GR64_NOSP.
 216   AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg,
 217                                          MIB->getNumOperands() +
 218                                          X86::AddrIndexReg);
 219   return ::addFullAddress(MIB, AM);
 220 }
 221
 222 /// Check if it is possible to fold the condition from the XALU intrinsic
 223 /// into the user. The condition code will only be updated on success.
 224 bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
 225                                        const Value *Cond) {
 226   if (!isa<ExtractValueInst>(Cond))
 227     return false;
 228
 229   const auto *EV = cast<ExtractValueInst>(Cond);
 230   if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
 231     return false;
 232
 233   const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
 234   MVT RetVT;
 235   const Function *Callee = II->getCalledFunction();
 236   Type *RetTy =
 237     cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
 238   if (!isTypeLegal(RetTy, RetVT))
 239     return false;
 240
 241   if (RetVT != MVT::i32 && RetVT != MVT::i64)
 242     return false;
 243
 244   X86::CondCode TmpCC;
 245   switch (II->getIntrinsicID()) {
 246   default: return false;
 247   case Intrinsic::sadd_with_overflow:
 248   case Intrinsic::ssub_with_overflow:
 249   case Intrinsic::smul_with_overflow:
 250   case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
 251   case Intrinsic::uadd_with_overflow:
 252   case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
 253   }
 254
 255   // Check if both instructions are in the same basic block.
 256   if (II->getParent() != I->getParent())
 257     return false;
 258
 259   // Make sure nothing is in the way
 260   BasicBlock::const_iterator Start(I);
 261   BasicBlock::const_iterator End(II);
 262   for (auto Itr = std::prev(Start); Itr != End; --Itr) {
 263     // We only expect extractvalue instructions between the intrinsic and the
 264     // instruction to be selected.
 265     if (!isa<ExtractValueInst>(Itr))
 266       return false;
 267
 268     // Check that the extractvalue operand comes from the intrinsic.
 269     const auto *EVI = cast<ExtractValueInst>(Itr);
 270     if (EVI->getAggregateOperand() != II)
 271       return false;
 272   }
 273
 274   // Make sure no potentially eflags clobbering phi moves can be inserted in
 275   // between.
 276   auto HasPhis = [](const BasicBlock *Succ) { return !Succ->phis().empty(); };
 277   if (I->isTerminator() && llvm::any_of(successors(I), HasPhis))
 278     return false;
 279
 280   // Make sure there are no potentially eflags clobbering constant
 281   // materializations in between.
 282   if (llvm::any_of(I->operands(), [](Value *V) { return isa<Constant>(V); }))
 283     return false;
 284
 285   CC = TmpCC;
 286   return true;
 287 }
 288
 289 bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
 290   EVT evt = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
 291   if (evt == MVT::Other || !evt.isSimple())
 292     // Unhandled type. Halt "fast" selection and bail.
 293     return false;
 294
 295   VT = evt.getSimpleVT();
 296   // For now, require SSE/SSE2 for performing floating-point operations,
 297   // since x87 requires additional work.
 298   if (VT == MVT::f64 && !Subtarget->hasSSE2())
 299     return false;
 300   if (VT == MVT::f32 && !Subtarget->hasSSE1())
 301     return false;
 302   // Similarly, no f80 support yet.
 303   if (VT == MVT::f80)
 304     return false;
 305   // We only handle legal types. For example, on x86-32 the instruction
 306   // selector contains all of the 64-bit instructions from x86-64,
 307   // under the assumption that i64 won't be used if the target doesn't
 308   // support it.
 309   return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
 310 }
 311
 312 /// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
 313 /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
 314 /// Return true and the result register by reference if it is possible.
 315 bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
 316                                   MachineMemOperand *MMO, unsigned &ResultReg,
 317                                   unsigned Alignment) {
 318   bool HasSSE1 = Subtarget->hasSSE1();
 319   bool HasSSE2 = Subtarget->hasSSE2();
 320   bool HasSSE41 = Subtarget->hasSSE41();
 321   bool HasAVX = Subtarget->hasAVX();
 322   bool HasAVX2 = Subtarget->hasAVX2();
 323   bool HasAVX512 = Subtarget->hasAVX512();
 324   bool HasVLX = Subtarget->hasVLX();
 325   bool IsNonTemporal = MMO && MMO->isNonTemporal();
 326
 327   // Treat i1 loads the same as i8 loads. Masking will be done when storing.
 328   if (VT == MVT::i1)
 329     VT = MVT::i8;
 330
 331   // Get opcode and regclass of the output for the given load instruction.
 332   unsigned Opc = 0;
 333   switch (VT.SimpleTy) {
 334   default: return false;
 335   case MVT::i8:
 336     Opc = X86::MOV8rm;
 337     break;
 338   case MVT::i16:
 339     Opc = X86::MOV16rm;
 340     break;
 341   case MVT::i32:
 342     Opc = X86::MOV32rm;
 343     break;
 344   case MVT::i64:
 345     // Must be in x86-64 mode.
 346     Opc = X86::MOV64rm;
 347     break;
 348   case MVT::f32:
 349     Opc = HasAVX512 ? X86::VMOVSSZrm_alt
 350           : HasAVX  ? X86::VMOVSSrm_alt
 351           : HasSSE1 ? X86::MOVSSrm_alt
 352                     : X86::LD_Fp32m;
 353     break;
 354   case MVT::f64:
 355     Opc = HasAVX512 ? X86::VMOVSDZrm_alt
 356           : HasAVX  ? X86::VMOVSDrm_alt
 357           : HasSSE2 ? X86::MOVSDrm_alt
 358                     : X86::LD_Fp64m;
 359     break;
 360   case MVT::f80:
 361     // No f80 support yet.
 362     return false;
 363   case MVT::v4f32:
 364     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
 365       Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
 366             HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
 367     else if (Alignment >= 16)
 368       Opc = HasVLX ? X86::VMOVAPSZ128rm :
 369             HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
 370     else
 371       Opc = HasVLX ? X86::VMOVUPSZ128rm :
 372             HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
 373     break;
 374   case MVT::v2f64:
 375     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
 376       Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
 377             HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
 378     else if (Alignment >= 16)
 379       Opc = HasVLX ? X86::VMOVAPDZ128rm :
 380             HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
 381     else
 382       Opc = HasVLX ? X86::VMOVUPDZ128rm :
 383             HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
 384     break;
 385   case MVT::v4i32:
 386   case MVT::v2i64:
 387   case MVT::v8i16:
 388   case MVT::v16i8:
 389     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
 390       Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
 391             HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
 392     else if (Alignment >= 16)
 393       Opc = HasVLX ? X86::VMOVDQA64Z128rm :
 394             HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
 395     else
 396       Opc = HasVLX ? X86::VMOVDQU64Z128rm :
 397             HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
 398     break;
 399   case MVT::v8f32:
 400     assert(HasAVX);
 401     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
 402       Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
 403     else if (IsNonTemporal && Alignment >= 16)
 404       return false; // Force split for X86::VMOVNTDQArm
 405     else if (Alignment >= 32)
 406       Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
 407     else
 408       Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
 409     break;
 410   case MVT::v4f64:
 411     assert(HasAVX);
 412     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
 413       Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
 414     else if (IsNonTemporal && Alignment >= 16)
 415       return false; // Force split for X86::VMOVNTDQArm
 416     else if (Alignment >= 32)
 417       Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
 418     else
 419       Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
 420     break;
 421   case MVT::v8i32:
 422   case MVT::v4i64:
 423   case MVT::v16i16:
 424   case MVT::v32i8:
 425     assert(HasAVX);
 426     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
 427       Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
 428     else if (IsNonTemporal && Alignment >= 16)
 429       return false; // Force split for X86::VMOVNTDQArm
 430     else if (Alignment >= 32)
 431       Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
 432     else
 433       Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
 434     break;
 435   case MVT::v16f32:
 436     assert(HasAVX512);
 437     if (IsNonTemporal && Alignment >= 64)
 438       Opc = X86::VMOVNTDQAZrm;
 439     else
 440       Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
 441     break;
 442   case MVT::v8f64:
 443     assert(HasAVX512);
 444     if (IsNonTemporal && Alignment >= 64)
 445       Opc = X86::VMOVNTDQAZrm;
 446     else
 447       Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
 448     break;
 449   case MVT::v8i64:
 450   case MVT::v16i32:
 451   case MVT::v32i16:
 452   case MVT::v64i8:
 453     assert(HasAVX512);
 454     // Note: There are a lot more choices based on type with AVX-512, but
 455     // there's really no advantage when the load isn't masked.
 456     if (IsNonTemporal && Alignment >= 64)
 457       Opc = X86::VMOVNTDQAZrm;
 458     else
 459       Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
 460     break;
 461   }
 462
 463   const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
 464
 465   ResultReg = createResultReg(RC);
 466   MachineInstrBuilder MIB =
 467     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg);
 468   addFullAddress(MIB, AM);
 469   if (MMO)
 470     MIB->addMemOperand(*FuncInfo.MF, MMO);
 471   return true;
 472 }
 473
 474 /// X86FastEmitStore - Emit a machine instruction to store a value Val of
 475 /// type VT. The address is either pre-computed, consisted of a base ptr, Ptr
 476 /// and a displacement offset, or a GlobalAddress,
 477 /// i.e. V. Return true if it is possible.
 478 bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
 479                                    MachineMemOperand *MMO, bool Aligned) {
 480   bool HasSSE1 = Subtarget->hasSSE1();
 481   bool HasSSE2 = Subtarget->hasSSE2();
 482   bool HasSSE4A = Subtarget->hasSSE4A();
 483   bool HasAVX = Subtarget->hasAVX();
 484   bool HasAVX512 = Subtarget->hasAVX512();
 485   bool HasVLX = Subtarget->hasVLX();
 486   bool IsNonTemporal = MMO && MMO->isNonTemporal();
 487
 488   // Get opcode and regclass of the output for the given store instruction.
 489   unsigned Opc = 0;
 490   switch (VT.getSimpleVT().SimpleTy) {
 491   case MVT::f80: // No f80 support yet.
 492   default: return false;
 493   case MVT::i1: {
 494     // Mask out all but lowest bit.
 495     Register AndResult = createResultReg(&X86::GR8RegClass);
 496     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
 497             TII.get(X86::AND8ri), AndResult)
 498       .addReg(ValReg).addImm(1);
 499     ValReg = AndResult;
 500     [[fallthrough]]; // handle i1 as i8.
 501   }
 502   case MVT::i8:  Opc = X86::MOV8mr;  break;
 503   case MVT::i16: Opc = X86::MOV16mr; break;
 504   case MVT::i32:
 505     Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr;
 506     break;
 507   case MVT::i64:
 508     // Must be in x86-64 mode.
 509     Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
 510     break;
 511   case MVT::f32:
 512     if (HasSSE1) {
 513       if (IsNonTemporal && HasSSE4A)
 514         Opc = X86::MOVNTSS;
 515       else
 516         Opc = HasAVX512 ? X86::VMOVSSZmr :
 517               HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
 518     } else
 519       Opc = X86::ST_Fp32m;
 520     break;
 521   case MVT::f64:
 522     if (HasSSE2) {
 523       if (IsNonTemporal && HasSSE4A)
 524         Opc = X86::MOVNTSD;
 525       else
 526         Opc = HasAVX512 ? X86::VMOVSDZmr :
 527               HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
 528     } else
 529       Opc = X86::ST_Fp64m;
 530     break;
 531   case MVT::x86mmx:
 532     Opc = (IsNonTemporal && HasSSE1) ? X86::MMX_MOVNTQmr : X86::MMX_MOVQ64mr;
 533     break;
 534   case MVT::v4f32:
 535     if (Aligned) {
 536       if (IsNonTemporal)
 537         Opc = HasVLX ? X86::VMOVNTPSZ128mr :
 538               HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
 539       else
 540         Opc = HasVLX ? X86::VMOVAPSZ128mr :
 541               HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
 542     } else
 543       Opc = HasVLX ? X86::VMOVUPSZ128mr :
 544             HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
 545     break;
 546   case MVT::v2f64:
 547     if (Aligned) {
 548       if (IsNonTemporal)
 549         Opc = HasVLX ? X86::VMOVNTPDZ128mr :
 550               HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
 551       else
 552         Opc = HasVLX ? X86::VMOVAPDZ128mr :
 553               HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
 554     } else
 555       Opc = HasVLX ? X86::VMOVUPDZ128mr :
 556             HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
 557     break;
 558   case MVT::v4i32:
 559   case MVT::v2i64:
 560   case MVT::v8i16:
 561   case MVT::v16i8:
 562     if (Aligned) {
 563       if (IsNonTemporal)
 564         Opc = HasVLX ? X86::VMOVNTDQZ128mr :
 565               HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
 566       else
 567         Opc = HasVLX ? X86::VMOVDQA64Z128mr :
 568               HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
 569     } else
 570       Opc = HasVLX ? X86::VMOVDQU64Z128mr :
 571             HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
 572     break;
 573   case MVT::v8f32:
 574     assert(HasAVX);
 575     if (Aligned) {
 576       if (IsNonTemporal)
 577         Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr;
 578       else
 579         Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr;
 580     } else
 581       Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr;
 582     break;
 583   case MVT::v4f64:
 584     assert(HasAVX);
 585     if (Aligned) {
 586       if (IsNonTemporal)
 587         Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr;
 588       else
 589         Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr;
 590     } else
 591       Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr;
 592     break;
 593   case MVT::v8i32:
 594   case MVT::v4i64:
 595   case MVT::v16i16:
 596   case MVT::v32i8:
 597     assert(HasAVX);
 598     if (Aligned) {
 599       if (IsNonTemporal)
 600         Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr;
 601       else
 602         Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr;
 603     } else
 604       Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr;
 605     break;
 606   case MVT::v16f32:
 607     assert(HasAVX512);
 608     if (Aligned)
 609       Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
 610     else
 611       Opc = X86::VMOVUPSZmr;
 612     break;
 613   case MVT::v8f64:
 614     assert(HasAVX512);
 615     if (Aligned) {
 616       Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
 617     } else
 618       Opc = X86::VMOVUPDZmr;
 619     break;
 620   case MVT::v8i64:
 621   case MVT::v16i32:
 622   case MVT::v32i16:
 623   case MVT::v64i8:
 624     assert(HasAVX512);
 625     // Note: There are a lot more choices based on type with AVX-512, but
 626     // there's really no advantage when the store isn't masked.
 627     if (Aligned)
 628       Opc = IsNonTemporal ? X86::VMOVNTDQZmr : X86::VMOVDQA64Zmr;
 629     else
 630       Opc = X86::VMOVDQU64Zmr;
 631     break;
 632   }
 633
 634   const MCInstrDesc &Desc = TII.get(Opc);
 635   // Some of the instructions in the previous switch use FR128 instead
 636   // of FR32 for ValReg. Make sure the register we feed the instruction
 637   // matches its register class constraints.
 638   // Note: This is fine to do a copy from FR32 to FR128, this is the
 639   // same registers behind the scene and actually why it did not trigger
 640   // any bugs before.
 641   ValReg = constrainOperandRegClass(Desc, ValReg, Desc.getNumOperands() - 1);
 642   MachineInstrBuilder MIB =
 643       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, Desc);
 644   addFullAddress(MIB, AM).addReg(ValReg);
 645   if (MMO)
 646     MIB->addMemOperand(*FuncInfo.MF, MMO);
 647
 648   return true;
 649 }
 650
 651 bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
 652                                    X86AddressMode &AM,
 653                                    MachineMemOperand *MMO, bool Aligned) {
 654   // Handle 'null' like i32/i64 0.
 655   if (isa<ConstantPointerNull>(Val))
 656     Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
 657
 658   // If this is a store of a simple constant, fold the constant into the store.
 659   if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
 660     unsigned Opc = 0;
 661     bool Signed = true;
 662     switch (VT.getSimpleVT().SimpleTy) {
 663     default: break;
 664     case MVT::i1:
 665       Signed = false;
 666       [[fallthrough]]; // Handle as i8.
 667     case MVT::i8:  Opc = X86::MOV8mi;  break;
 668     case MVT::i16: Opc = X86::MOV16mi; break;
 669     case MVT::i32: Opc = X86::MOV32mi; break;
 670     case MVT::i64:
 671       // Must be a 32-bit sign extended value.
 672       if (isInt<32>(CI->getSExtValue()))
 673         Opc = X86::MOV64mi32;
 674       break;
 675     }
 676
 677     if (Opc) {
 678       MachineInstrBuilder MIB =
 679         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc));
 680       addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
 681                                             : CI->getZExtValue());
 682       if (MMO)
 683         MIB->addMemOperand(*FuncInfo.MF, MMO);
 684       return true;
 685     }
 686   }
 687
 688   Register ValReg = getRegForValue(Val);
 689   if (ValReg == 0)
 690     return false;
 691
 692   return X86FastEmitStore(VT, ValReg, AM, MMO, Aligned);
 693 }
 694
 695 /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
 696 /// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
 697 /// ISD::SIGN_EXTEND).
 698 bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
 699                                     unsigned Src, EVT SrcVT,
 700                                     unsigned &ResultReg) {
 701   unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src);
 702   if (RR == 0)
 703     return false;
 704
 705   ResultReg = RR;
 706   return true;
 707 }
 708
 709 bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
 710   // Handle constant address.
 711   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
 712     // Can't handle alternate code models yet.
 713     if (TM.getCodeModel() != CodeModel::Small &&
 714         TM.getCodeModel() != CodeModel::Medium)
 715       return false;
 716
 717     // Can't handle large objects yet.
 718     if (TM.isLargeGlobalValue(GV))
 719       return false;
 720
 721     // Can't handle TLS yet.
 722     if (GV->isThreadLocal())
 723       return false;
 724
 725     // Can't handle !absolute_symbol references yet.
 726     if (GV->isAbsoluteSymbolRef())
 727       return false;
 728
 729     // RIP-relative addresses can't have additional register operands, so if
 730     // we've already folded stuff into the addressing mode, just force the
 731     // global value into its own register, which we can use as the basereg.
 732     if (!Subtarget->isPICStyleRIPRel() ||
 733         (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
 734       // Okay, we've committed to selecting this global. Set up the address.
 735       AM.GV = GV;
 736
 737       // Allow the subtarget to classify the global.
 738       unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
 739
 740       // If this reference is relative to the pic base, set it now.
 741       if (isGlobalRelativeToPICBase(GVFlags)) {
 742         // FIXME: How do we know Base.Reg is free??
 743         AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
 744       }
 745
 746       // Unless the ABI requires an extra load, return a direct reference to
 747       // the global.
 748       if (!isGlobalStubReference(GVFlags)) {
 749         if (Subtarget->isPICStyleRIPRel()) {
 750           // Use rip-relative addressing if we can.  Above we verified that the
 751           // base and index registers are unused.
 752           assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
 753           AM.Base.Reg = X86::RIP;
 754         }
 755         AM.GVOpFlags = GVFlags;
 756         return true;
 757       }
 758
 759       // Ok, we need to do a load from a stub.  If we've already loaded from
 760       // this stub, reuse the loaded pointer, otherwise emit the load now.
 761       DenseMap<const Value *, Register>::iterator I = LocalValueMap.find(V);
 762       Register LoadReg;
 763       if (I != LocalValueMap.end() && I->second) {
 764         LoadReg = I->second;
 765       } else {
 766         // Issue load from stub.
 767         unsigned Opc = 0;
 768         const TargetRegisterClass *RC = nullptr;
 769         X86AddressMode StubAM;
 770         StubAM.Base.Reg = AM.Base.Reg;
 771         StubAM.GV = GV;
 772         StubAM.GVOpFlags = GVFlags;
 773
 774         // Prepare for inserting code in the local-value area.
 775         SavePoint SaveInsertPt = enterLocalValueArea();
 776
 777         if (TLI.getPointerTy(DL) == MVT::i64) {
 778           Opc = X86::MOV64rm;
 779           RC  = &X86::GR64RegClass;
 780         } else {
 781           Opc = X86::MOV32rm;
 782           RC  = &X86::GR32RegClass;
 783         }
 784
 785         if (Subtarget->isPICStyleRIPRel() || GVFlags == X86II::MO_GOTPCREL ||
 786             GVFlags == X86II::MO_GOTPCREL_NORELAX)
 787           StubAM.Base.Reg = X86::RIP;
 788
 789         LoadReg = createResultReg(RC);
 790         MachineInstrBuilder LoadMI =
 791           BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), LoadReg);
 792         addFullAddress(LoadMI, StubAM);
 793
 794         // Ok, back to normal mode.
 795         leaveLocalValueArea(SaveInsertPt);
 796
 797         // Prevent loading GV stub multiple times in same MBB.
 798         LocalValueMap[V] = LoadReg;
 799       }
 800
 801       // Now construct the final address. Note that the Disp, Scale,
 802       // and Index values may already be set here.
 803       AM.Base.Reg = LoadReg;
 804       AM.GV = nullptr;
 805       return true;
 806     }
 807   }
 808
 809   // If all else fails, try to materialize the value in a register.
 810   if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
 811     if (AM.Base.Reg == 0) {
 812       AM.Base.Reg = getRegForValue(V);
 813       return AM.Base.Reg != 0;
 814     }
 815     if (AM.IndexReg == 0) {
 816       assert(AM.Scale == 1 && "Scale with no index!");
 817       AM.IndexReg = getRegForValue(V);
 818       return AM.IndexReg != 0;
 819     }
 820   }
 821
 822   return false;
 823 }
 824
 825 /// X86SelectAddress - Attempt to fill in an address from the given value.
 826 ///
 827 bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
 828   SmallVector<const Value *, 32> GEPs;
 829 redo_gep:
 830   const User *U = nullptr;
 831   unsigned Opcode = Instruction::UserOp1;
 832   if (const Instruction *I = dyn_cast<Instruction>(V)) {
 833     // Don't walk into other basic blocks; it's possible we haven't
 834     // visited them yet, so the instructions may not yet be assigned
 835     // virtual registers.
 836     if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
 837         FuncInfo.getMBB(I->getParent()) == FuncInfo.MBB) {
 838       Opcode = I->getOpcode();
 839       U = I;
 840     }
 841   } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
 842     Opcode = C->getOpcode();
 843     U = C;
 844   }
 845
 846   if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
 847     if (Ty->getAddressSpace() > 255)
 848       // Fast instruction selection doesn't support the special
 849       // address spaces.
 850       return false;
 851
 852   switch (Opcode) {
 853   default: break;
 854   case Instruction::BitCast:
 855     // Look past bitcasts.
 856     return X86SelectAddress(U->getOperand(0), AM);
 857
 858   case Instruction::IntToPtr:
 859     // Look past no-op inttoptrs.
 860     if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
 861         TLI.getPointerTy(DL))
 862       return X86SelectAddress(U->getOperand(0), AM);
 863     break;
 864
 865   case Instruction::PtrToInt:
 866     // Look past no-op ptrtoints.
 867     if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
 868       return X86SelectAddress(U->getOperand(0), AM);
 869     break;
 870
 871   case Instruction::Alloca: {
 872     // Do static allocas.
 873     const AllocaInst *A = cast<AllocaInst>(V);
 874     DenseMap<const AllocaInst *, int>::iterator SI =
 875       FuncInfo.StaticAllocaMap.find(A);
 876     if (SI != FuncInfo.StaticAllocaMap.end()) {
 877       AM.BaseType = X86AddressMode::FrameIndexBase;
 878       AM.Base.FrameIndex = SI->second;
 879       return true;
 880     }
 881     break;
 882   }
 883
 884   case Instruction::Add: {
 885     // Adds of constants are common and easy enough.
 886     if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
 887       uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
 888       // They have to fit in the 32-bit signed displacement field though.
 889       if (isInt<32>(Disp)) {
 890         AM.Disp = (uint32_t)Disp;
 891         return X86SelectAddress(U->getOperand(0), AM);
 892       }
 893     }
 894     break;
 895   }
 896
 897   case Instruction::GetElementPtr: {
 898     X86AddressMode SavedAM = AM;
 899
 900     // Pattern-match simple GEPs.
 901     uint64_t Disp = (int32_t)AM.Disp;
 902     unsigned IndexReg = AM.IndexReg;
 903     unsigned Scale = AM.Scale;
 904     MVT PtrVT = TLI.getValueType(DL, U->getType()).getSimpleVT();
 905
 906     gep_type_iterator GTI = gep_type_begin(U);
 907     // Iterate through the indices, folding what we can. Constants can be
 908     // folded, and one dynamic index can be handled, if the scale is supported.
 909     for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
 910          i != e; ++i, ++GTI) {
 911       const Value *Op = *i;
 912       if (StructType *STy = GTI.getStructTypeOrNull()) {
 913         const StructLayout *SL = DL.getStructLayout(STy);
 914         Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
 915         continue;
 916       }
 917
 918       // A array/variable index is always of the form i*S where S is the
 919       // constant scale size.  See if we can push the scale into immediates.
 920       uint64_t S = GTI.getSequentialElementStride(DL);
 921       for (;;) {
 922         if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
 923           // Constant-offset addressing.
 924           Disp += CI->getSExtValue() * S;
 925           break;
 926         }
 927         if (canFoldAddIntoGEP(U, Op)) {
 928           // A compatible add with a constant operand. Fold the constant.
 929           ConstantInt *CI =
 930             cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
 931           Disp += CI->getSExtValue() * S;
 932           // Iterate on the other operand.
 933           Op = cast<AddOperator>(Op)->getOperand(0);
 934           continue;
 935         }
 936         if (IndexReg == 0 &&
 937             (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
 938             (S == 1 || S == 2 || S == 4 || S == 8)) {
 939           // Scaled-index addressing.
 940           Scale = S;
 941           IndexReg = getRegForGEPIndex(PtrVT, Op);
 942           if (IndexReg == 0)
 943             return false;
 944           break;
 945         }
 946         // Unsupported.
 947         goto unsupported_gep;
 948       }
 949     }
 950
 951     // Check for displacement overflow.
 952     if (!isInt<32>(Disp))
 953       break;
 954
 955     AM.IndexReg = IndexReg;
 956     AM.Scale = Scale;
 957     AM.Disp = (uint32_t)Disp;
 958     GEPs.push_back(V);
 959
 960     if (const GetElementPtrInst *GEP =
 961           dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
 962       // Ok, the GEP indices were covered by constant-offset and scaled-index
 963       // addressing. Update the address state and move on to examining the base.
 964       V = GEP;
 965       goto redo_gep;
 966     } else if (X86SelectAddress(U->getOperand(0), AM)) {
 967       return true;
 968     }
 969
 970     // If we couldn't merge the gep value into this addr mode, revert back to
 971     // our address and just match the value instead of completely failing.
 972     AM = SavedAM;
 973
 974     for (const Value *I : reverse(GEPs))
 975       if (handleConstantAddresses(I, AM))
 976         return true;
 977
 978     return false;
 979   unsupported_gep:
 980     // Ok, the GEP indices weren't all covered.
 981     break;
 982   }
 983   }
 984
 985   return handleConstantAddresses(V, AM);
 986 }
 987
 988 /// X86SelectCallAddress - Attempt to fill in an address from the given value.
 989 ///
 990 bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
 991   const User *U = nullptr;
 992   unsigned Opcode = Instruction::UserOp1;
 993   const Instruction *I = dyn_cast<Instruction>(V);
 994   // Record if the value is defined in the same basic block.
 995   //
 996   // This information is crucial to know whether or not folding an
 997   // operand is valid.
 998   // Indeed, FastISel generates or reuses a virtual register for all
 999   // operands of all instructions it selects. Obviously, the definition and
1000   // its uses must use the same virtual register otherwise the produced
1001   // code is incorrect.
1002   // Before instruction selection, FunctionLoweringInfo::set sets the virtual
1003   // registers for values that are alive across basic blocks. This ensures
1004   // that the values are consistently set between across basic block, even
1005   // if different instruction selection mechanisms are used (e.g., a mix of
1006   // SDISel and FastISel).
1007   // For values local to a basic block, the instruction selection process
1008   // generates these virtual registers with whatever method is appropriate
1009   // for its needs. In particular, FastISel and SDISel do not share the way
1010   // local virtual registers are set.
1011   // Therefore, this is impossible (or at least unsafe) to share values
1012   // between basic blocks unless they use the same instruction selection
1013   // method, which is not guarantee for X86.
1014   // Moreover, things like hasOneUse could not be used accurately, if we
1015   // allow to reference values across basic blocks whereas they are not
1016   // alive across basic blocks initially.
1017   bool InMBB = true;
1018   if (I) {
1019     Opcode = I->getOpcode();
1020     U = I;
1021     InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
1022   } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
1023     Opcode = C->getOpcode();
1024     U = C;
1025   }
1026
1027   switch (Opcode) {
1028   default: break;
1029   case Instruction::BitCast:
1030     // Look past bitcasts if its operand is in the same BB.
1031     if (InMBB)
1032       return X86SelectCallAddress(U->getOperand(0), AM);
1033     break;
1034
1035   case Instruction::IntToPtr:
1036     // Look past no-op inttoptrs if its operand is in the same BB.
1037     if (InMBB &&
1038         TLI.getValueType(DL, U->getOperand(0)->getType()) ==
1039             TLI.getPointerTy(DL))
1040       return X86SelectCallAddress(U->getOperand(0), AM);
1041     break;
1042
1043   case Instruction::PtrToInt:
1044     // Look past no-op ptrtoints if its operand is in the same BB.
1045     if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
1046       return X86SelectCallAddress(U->getOperand(0), AM);
1047     break;
1048   }
1049
1050   // Handle constant address.
1051   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
1052     // Can't handle alternate code models yet.
1053     if (TM.getCodeModel() != CodeModel::Small &&
1054         TM.getCodeModel() != CodeModel::Medium)
1055       return false;
1056
1057     // RIP-relative addresses can't have additional register operands.
1058     if (Subtarget->isPICStyleRIPRel() &&
1059         (AM.Base.Reg != 0 || AM.IndexReg != 0))
1060       return false;
1061
1062     // Can't handle TLS.
1063     if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
1064       if (GVar->isThreadLocal())
1065         return false;
1066
1067     // Okay, we've committed to selecting this global. Set up the basic address.
1068     AM.GV = GV;
1069
1070     // Return a direct reference to the global. Fastisel can handle calls to
1071     // functions that require loads, such as dllimport and nonlazybind
1072     // functions.
1073     if (Subtarget->isPICStyleRIPRel()) {
1074       // Use rip-relative addressing if we can.  Above we verified that the
1075       // base and index registers are unused.
1076       assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
1077       AM.Base.Reg = X86::RIP;
1078     } else {
1079       AM.GVOpFlags = Subtarget->classifyLocalReference(nullptr);
1080     }
1081
1082     return true;
1083   }
1084
1085   // If all else fails, try to materialize the value in a register.
1086   if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
1087     auto GetCallRegForValue = [this](const Value *V) {
1088       Register Reg = getRegForValue(V);
1089
1090       // In 64-bit mode, we need a 64-bit register even if pointers are 32 bits.
1091       if (Reg && Subtarget->isTarget64BitILP32()) {
1092         Register CopyReg = createResultReg(&X86::GR32RegClass);
1093         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV32rr),
1094                 CopyReg)
1095             .addReg(Reg);
1096
1097         Register ExtReg = createResultReg(&X86::GR64RegClass);
1098         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1099                 TII.get(TargetOpcode::SUBREG_TO_REG), ExtReg)
1100             .addImm(0)
1101             .addReg(CopyReg)
1102             .addImm(X86::sub_32bit);
1103         Reg = ExtReg;
1104       }
1105
1106       return Reg;
1107     };
1108
1109     if (AM.Base.Reg == 0) {
1110       AM.Base.Reg = GetCallRegForValue(V);
1111       return AM.Base.Reg != 0;
1112     }
1113     if (AM.IndexReg == 0) {
1114       assert(AM.Scale == 1 && "Scale with no index!");
1115       AM.IndexReg = GetCallRegForValue(V);
1116       return AM.IndexReg != 0;
1117     }
1118   }
1119
1120   return false;
1121 }
1122
1123
1124 /// X86SelectStore - Select and emit code to implement store instructions.
1125 bool X86FastISel::X86SelectStore(const Instruction *I) {
1126   // Atomic stores need special handling.
1127   const StoreInst *S = cast<StoreInst>(I);
1128
1129   if (S->isAtomic())
1130     return false;
1131
1132   const Value *PtrV = I->getOperand(1);
1133   if (TLI.supportSwiftError()) {
1134     // Swifterror values can come from either a function parameter with
1135     // swifterror attribute or an alloca with swifterror attribute.
1136     if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
1137       if (Arg->hasSwiftErrorAttr())
1138         return false;
1139     }
1140
1141     if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
1142       if (Alloca->isSwiftError())
1143         return false;
1144     }
1145   }
1146
1147   const Value *Val = S->getValueOperand();
1148   const Value *Ptr = S->getPointerOperand();
1149
1150   MVT VT;
1151   if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
1152     return false;
1153
1154   Align Alignment = S->getAlign();
1155   Align ABIAlignment = DL.getABITypeAlign(Val->getType());
1156   bool Aligned = Alignment >= ABIAlignment;
1157
1158   X86AddressMode AM;
1159   if (!X86SelectAddress(Ptr, AM))
1160     return false;
1161
1162   return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
1163 }
1164
1165 /// X86SelectRet - Select and emit code to implement ret instructions.
1166 bool X86FastISel::X86SelectRet(const Instruction *I) {
1167   const ReturnInst *Ret = cast<ReturnInst>(I);
1168   const Function &F = *I->getParent()->getParent();
1169   const X86MachineFunctionInfo *X86MFInfo =
1170       FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
1171
1172   if (!FuncInfo.CanLowerReturn)
1173     return false;
1174
1175   if (TLI.supportSwiftError() &&
1176       F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
1177     return false;
1178
1179   if (TLI.supportSplitCSR(FuncInfo.MF))
1180     return false;
1181
1182   CallingConv::ID CC = F.getCallingConv();
1183   if (CC != CallingConv::C &&
1184       CC != CallingConv::Fast &&
1185       CC != CallingConv::Tail &&
1186       CC != CallingConv::SwiftTail &&
1187       CC != CallingConv::X86_FastCall &&
1188       CC != CallingConv::X86_StdCall &&
1189       CC != CallingConv::X86_ThisCall &&
1190       CC != CallingConv::X86_64_SysV &&
1191       CC != CallingConv::Win64)
1192     return false;
1193
1194   // Don't handle popping bytes if they don't fit the ret's immediate.
1195   if (!isUInt<16>(X86MFInfo->getBytesToPopOnReturn()))
1196     return false;
1197
1198   // fastcc with -tailcallopt is intended to provide a guaranteed
1199   // tail call optimization. Fastisel doesn't know how to do that.
1200   if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
1201       CC == CallingConv::Tail || CC == CallingConv::SwiftTail)
1202     return false;
1203
1204   // Let SDISel handle vararg functions.
1205   if (F.isVarArg())
1206     return false;
1207
1208   // Build a list of return value registers.
1209   SmallVector<unsigned, 4> RetRegs;
1210
1211   if (Ret->getNumOperands() > 0) {
1212     SmallVector<ISD::OutputArg, 4> Outs;
1213     GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
1214
1215     // Analyze operands of the call, assigning locations to each operand.
1216     SmallVector<CCValAssign, 16> ValLocs;
1217     CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
1218     CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1219
1220     const Value *RV = Ret->getOperand(0);
1221     Register Reg = getRegForValue(RV);
1222     if (Reg == 0)
1223       return false;
1224
1225     // Only handle a single return value for now.
1226     if (ValLocs.size() != 1)
1227       return false;
1228
1229     CCValAssign &VA = ValLocs[0];
1230
1231     // Don't bother handling odd stuff for now.
1232     if (VA.getLocInfo() != CCValAssign::Full)
1233       return false;
1234     // Only handle register returns for now.
1235     if (!VA.isRegLoc())
1236       return false;
1237
1238     // The calling-convention tables for x87 returns don't tell
1239     // the whole story.
1240     if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
1241       return false;
1242
1243     unsigned SrcReg = Reg + VA.getValNo();
1244     EVT SrcVT = TLI.getValueType(DL, RV->getType());
1245     EVT DstVT = VA.getValVT();
1246     // Special handling for extended integers.
1247     if (SrcVT != DstVT) {
1248       if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
1249         return false;
1250
1251       if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
1252         return false;
1253
1254       if (SrcVT == MVT::i1) {
1255         if (Outs[0].Flags.isSExt())
1256           return false;
1257         SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg);
1258         SrcVT = MVT::i8;
1259       }
1260       if (SrcVT != DstVT) {
1261         unsigned Op =
1262             Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
1263         SrcReg =
1264             fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg);
1265       }
1266     }
1267
1268     // Make the copy.
1269     Register DstReg = VA.getLocReg();
1270     const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
1271     // Avoid a cross-class copy. This is very unlikely.
1272     if (!SrcRC->contains(DstReg))
1273       return false;
1274     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1275             TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
1276
1277     // Add register to return instruction.
1278     RetRegs.push_back(VA.getLocReg());
1279   }
1280
1281   // Swift calling convention does not require we copy the sret argument
1282   // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
1283
1284   // All x86 ABIs require that for returning structs by value we copy
1285   // the sret argument into %rax/%eax (depending on ABI) for the return.
1286   // We saved the argument into a virtual register in the entry block,
1287   // so now we copy the value out and into %rax/%eax.
1288   if (F.hasStructRetAttr() && CC != CallingConv::Swift &&
1289       CC != CallingConv::SwiftTail) {
1290     Register Reg = X86MFInfo->getSRetReturnReg();
1291     assert(Reg &&
1292            "SRetReturnReg should have been set in LowerFormalArguments()!");
1293     unsigned RetReg = Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
1294     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1295             TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
1296     RetRegs.push_back(RetReg);
1297   }
1298
1299   // Now emit the RET.
1300   MachineInstrBuilder MIB;
1301   if (X86MFInfo->getBytesToPopOnReturn()) {
1302     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1303                   TII.get(Subtarget->is64Bit() ? X86::RETI64 : X86::RETI32))
1304               .addImm(X86MFInfo->getBytesToPopOnReturn());
1305   } else {
1306     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1307                   TII.get(Subtarget->is64Bit() ? X86::RET64 : X86::RET32));
1308   }
1309   for (unsigned Reg : RetRegs)
1310     MIB.addReg(Reg, RegState::Implicit);
1311   return true;
1312 }
1313
1314 /// X86SelectLoad - Select and emit code to implement load instructions.
1315 ///
1316 bool X86FastISel::X86SelectLoad(const Instruction *I) {
1317   const LoadInst *LI = cast<LoadInst>(I);
1318
1319   // Atomic loads need special handling.
1320   if (LI->isAtomic())
1321     return false;
1322
1323   const Value *SV = I->getOperand(0);
1324   if (TLI.supportSwiftError()) {
1325     // Swifterror values can come from either a function parameter with
1326     // swifterror attribute or an alloca with swifterror attribute.
1327     if (const Argument *Arg = dyn_cast<Argument>(SV)) {
1328       if (Arg->hasSwiftErrorAttr())
1329         return false;
1330     }
1331
1332     if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
1333       if (Alloca->isSwiftError())
1334         return false;
1335     }
1336   }
1337
1338   MVT VT;
1339   if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
1340     return false;
1341
1342   const Value *Ptr = LI->getPointerOperand();
1343
1344   X86AddressMode AM;
1345   if (!X86SelectAddress(Ptr, AM))
1346     return false;
1347
1348   unsigned ResultReg = 0;
1349   if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
1350                        LI->getAlign().value()))
1351     return false;
1352
1353   updateValueMap(I, ResultReg);
1354   return true;
1355 }
1356
1357 static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
1358   bool HasAVX512 = Subtarget->hasAVX512();
1359   bool HasAVX = Subtarget->hasAVX();
1360   bool HasSSE1 = Subtarget->hasSSE1();
1361   bool HasSSE2 = Subtarget->hasSSE2();
1362
1363   switch (VT.getSimpleVT().SimpleTy) {
1364   default:       return 0;
1365   case MVT::i8:  return X86::CMP8rr;
1366   case MVT::i16: return X86::CMP16rr;
1367   case MVT::i32: return X86::CMP32rr;
1368   case MVT::i64: return X86::CMP64rr;
1369   case MVT::f32:
1370     return HasAVX512 ? X86::VUCOMISSZrr
1371            : HasAVX  ? X86::VUCOMISSrr
1372            : HasSSE1 ? X86::UCOMISSrr
1373                      : 0;
1374   case MVT::f64:
1375     return HasAVX512 ? X86::VUCOMISDZrr
1376            : HasAVX  ? X86::VUCOMISDrr
1377            : HasSSE2 ? X86::UCOMISDrr
1378                      : 0;
1379   }
1380 }
1381
1382 /// If we have a comparison with RHS as the RHS  of the comparison, return an
1383 /// opcode that works for the compare (e.g. CMP32ri) otherwise return 0.
1384 static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
1385   switch (VT.getSimpleVT().SimpleTy) {
1386   // Otherwise, we can't fold the immediate into this comparison.
1387   default:
1388     return 0;
1389   case MVT::i8:
1390     return X86::CMP8ri;
1391   case MVT::i16:
1392     return X86::CMP16ri;
1393   case MVT::i32:
1394     return X86::CMP32ri;
1395   case MVT::i64:
1396     // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
1397     // field.
1398     return isInt<32>(RHSC->getSExtValue()) ? X86::CMP64ri32 : 0;
1399   }
1400 }
1401
1402 bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT,
1403                                      const DebugLoc &CurMIMD) {
1404   Register Op0Reg = getRegForValue(Op0);
1405   if (Op0Reg == 0) return false;
1406
1407   // Handle 'null' like i32/i64 0.
1408   if (isa<ConstantPointerNull>(Op1))
1409     Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
1410
1411   // We have two options: compare with register or immediate.  If the RHS of
1412   // the compare is an immediate that we can fold into this compare, use
1413   // CMPri, otherwise use CMPrr.
1414   if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
1415     if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
1416       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurMIMD, TII.get(CompareImmOpc))
1417         .addReg(Op0Reg)
1418         .addImm(Op1C->getSExtValue());
1419       return true;
1420     }
1421   }
1422
1423   unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
1424   if (CompareOpc == 0) return false;
1425
1426   Register Op1Reg = getRegForValue(Op1);
1427   if (Op1Reg == 0) return false;
1428   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurMIMD, TII.get(CompareOpc))
1429     .addReg(Op0Reg)
1430     .addReg(Op1Reg);
1431
1432   return true;
1433 }
1434
1435 bool X86FastISel::X86SelectCmp(const Instruction *I) {
1436   const CmpInst *CI = cast<CmpInst>(I);
1437
1438   MVT VT;
1439   if (!isTypeLegal(I->getOperand(0)->getType(), VT))
1440     return false;
1441
1442   // Below code only works for scalars.
1443   if (VT.isVector())
1444     return false;
1445
1446   // Try to optimize or fold the cmp.
1447   CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
1448   unsigned ResultReg = 0;
1449   switch (Predicate) {
1450   default: break;
1451   case CmpInst::FCMP_FALSE: {
1452     ResultReg = createResultReg(&X86::GR32RegClass);
1453     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV32r0),
1454             ResultReg);
1455     ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, X86::sub_8bit);
1456     if (!ResultReg)
1457       return false;
1458     break;
1459   }
1460   case CmpInst::FCMP_TRUE: {
1461     ResultReg = createResultReg(&X86::GR8RegClass);
1462     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV8ri),
1463             ResultReg).addImm(1);
1464     break;
1465   }
1466   }
1467
1468   if (ResultReg) {
1469     updateValueMap(I, ResultReg);
1470     return true;
1471   }
1472
1473   const Value *LHS = CI->getOperand(0);
1474   const Value *RHS = CI->getOperand(1);
1475
1476   // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
1477   // We don't have to materialize a zero constant for this case and can just use
1478   // %x again on the RHS.
1479   if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
1480     const auto *RHSC = dyn_cast<ConstantFP>(RHS);
1481     if (RHSC && RHSC->isNullValue())
1482       RHS = LHS;
1483   }
1484
1485   // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
1486   static const uint16_t SETFOpcTable[2][3] = {
1487     { X86::COND_E,  X86::COND_NP, X86::AND8rr },
1488     { X86::COND_NE, X86::COND_P,  X86::OR8rr  }
1489   };
1490   const uint16_t *SETFOpc = nullptr;
1491   switch (Predicate) {
1492   default: break;
1493   case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
1494   case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
1495   }
1496
1497   ResultReg = createResultReg(&X86::GR8RegClass);
1498   if (SETFOpc) {
1499     if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
1500       return false;
1501
1502     Register FlagReg1 = createResultReg(&X86::GR8RegClass);
1503     Register FlagReg2 = createResultReg(&X86::GR8RegClass);
1504     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
1505             FlagReg1).addImm(SETFOpc[0]);
1506     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
1507             FlagReg2).addImm(SETFOpc[1]);
1508     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(SETFOpc[2]),
1509             ResultReg).addReg(FlagReg1).addReg(FlagReg2);
1510     updateValueMap(I, ResultReg);
1511     return true;
1512   }
1513
1514   X86::CondCode CC;
1515   bool SwapArgs;
1516   std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
1517   assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
1518
1519   if (SwapArgs)
1520     std::swap(LHS, RHS);
1521
1522   // Emit a compare of LHS/RHS.
1523   if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
1524     return false;
1525
1526   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
1527           ResultReg).addImm(CC);
1528   updateValueMap(I, ResultReg);
1529   return true;
1530 }
1531
1532 bool X86FastISel::X86SelectZExt(const Instruction *I) {
1533   EVT DstVT = TLI.getValueType(DL, I->getType());
1534   if (!TLI.isTypeLegal(DstVT))
1535     return false;
1536
1537   Register ResultReg = getRegForValue(I->getOperand(0));
1538   if (ResultReg == 0)
1539     return false;
1540
1541   // Handle zero-extension from i1 to i8, which is common.
1542   MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
1543   if (SrcVT == MVT::i1) {
1544     // Set the high bits to zero.
1545     ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg);
1546     SrcVT = MVT::i8;
1547
1548     if (ResultReg == 0)
1549       return false;
1550   }
1551
1552   if (DstVT == MVT::i64) {
1553     // Handle extension to 64-bits via sub-register shenanigans.
1554     unsigned MovInst;
1555
1556     switch (SrcVT.SimpleTy) {
1557     case MVT::i8:  MovInst = X86::MOVZX32rr8;  break;
1558     case MVT::i16: MovInst = X86::MOVZX32rr16; break;
1559     case MVT::i32: MovInst = X86::MOV32rr;     break;
1560     default: llvm_unreachable("Unexpected zext to i64 source type");
1561     }
1562
1563     Register Result32 = createResultReg(&X86::GR32RegClass);
1564     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(MovInst), Result32)
1565       .addReg(ResultReg);
1566
1567     ResultReg = createResultReg(&X86::GR64RegClass);
1568     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::SUBREG_TO_REG),
1569             ResultReg)
1570       .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
1571   } else if (DstVT == MVT::i16) {
1572     // i8->i16 doesn't exist in the autogenerated isel table. Need to zero
1573     // extend to 32-bits and then extract down to 16-bits.
1574     Register Result32 = createResultReg(&X86::GR32RegClass);
1575     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOVZX32rr8),
1576             Result32).addReg(ResultReg);
1577
1578     ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, X86::sub_16bit);
1579   } else if (DstVT != MVT::i8) {
1580     ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
1581                            ResultReg);
1582     if (ResultReg == 0)
1583       return false;
1584   }
1585
1586   updateValueMap(I, ResultReg);
1587   return true;
1588 }
1589
1590 bool X86FastISel::X86SelectSExt(const Instruction *I) {
1591   EVT DstVT = TLI.getValueType(DL, I->getType());
1592   if (!TLI.isTypeLegal(DstVT))
1593     return false;
1594
1595   Register ResultReg = getRegForValue(I->getOperand(0));
1596   if (ResultReg == 0)
1597     return false;
1598
1599   // Handle sign-extension from i1 to i8.
1600   MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
1601   if (SrcVT == MVT::i1) {
1602     // Set the high bits to zero.
1603     Register ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg);
1604     if (ZExtReg == 0)
1605       return false;
1606
1607     // Negate the result to make an 8-bit sign extended value.
1608     ResultReg = createResultReg(&X86::GR8RegClass);
1609     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::NEG8r),
1610             ResultReg).addReg(ZExtReg);
1611
1612     SrcVT = MVT::i8;
1613   }
1614
1615   if (DstVT == MVT::i16) {
1616     // i8->i16 doesn't exist in the autogenerated isel table. Need to sign
1617     // extend to 32-bits and then extract down to 16-bits.
1618     Register Result32 = createResultReg(&X86::GR32RegClass);
1619     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOVSX32rr8),
1620             Result32).addReg(ResultReg);
1621
1622     ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, X86::sub_16bit);
1623   } else if (DstVT != MVT::i8) {
1624     ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND,
1625                            ResultReg);
1626     if (ResultReg == 0)
1627       return false;
1628   }
1629
1630   updateValueMap(I, ResultReg);
1631   return true;
1632 }
1633
1634 bool X86FastISel::X86SelectBranch(const Instruction *I) {
1635   // Unconditional branches are selected by tablegen-generated code.
1636   // Handle a conditional branch.
1637   const BranchInst *BI = cast<BranchInst>(I);
1638   MachineBasicBlock *TrueMBB = FuncInfo.getMBB(BI->getSuccessor(0));
1639   MachineBasicBlock *FalseMBB = FuncInfo.getMBB(BI->getSuccessor(1));
1640
1641   // Fold the common case of a conditional branch with a comparison
1642   // in the same block (values defined on other blocks may not have
1643   // initialized registers).
1644   X86::CondCode CC;
1645   if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
1646     if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
1647       EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType());
1648
1649       // Try to optimize or fold the cmp.
1650       CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
1651       switch (Predicate) {
1652       default: break;
1653       case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, MIMD.getDL()); return true;
1654       case CmpInst::FCMP_TRUE:  fastEmitBranch(TrueMBB, MIMD.getDL()); return true;
1655       }
1656
1657       const Value *CmpLHS = CI->getOperand(0);
1658       const Value *CmpRHS = CI->getOperand(1);
1659
1660       // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
1661       // 0.0.
1662       // We don't have to materialize a zero constant for this case and can just
1663       // use %x again on the RHS.
1664       if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
1665         const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
1666         if (CmpRHSC && CmpRHSC->isNullValue())
1667           CmpRHS = CmpLHS;
1668       }
1669
1670       // Try to take advantage of fallthrough opportunities.
1671       if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
1672         std::swap(TrueMBB, FalseMBB);
1673         Predicate = CmpInst::getInversePredicate(Predicate);
1674       }
1675
1676       // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
1677       // code check. Instead two branch instructions are required to check all
1678       // the flags. First we change the predicate to a supported condition code,
1679       // which will be the first branch. Later one we will emit the second
1680       // branch.
1681       bool NeedExtraBranch = false;
1682       switch (Predicate) {
1683       default: break;
1684       case CmpInst::FCMP_OEQ:
1685         std::swap(TrueMBB, FalseMBB);
1686         [[fallthrough]];
1687       case CmpInst::FCMP_UNE:
1688         NeedExtraBranch = true;
1689         Predicate = CmpInst::FCMP_ONE;
1690         break;
1691       }
1692
1693       bool SwapArgs;
1694       std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
1695       assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
1696
1697       if (SwapArgs)
1698         std::swap(CmpLHS, CmpRHS);
1699
1700       // Emit a compare of the LHS and RHS, setting the flags.
1701       if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
1702         return false;
1703
1704       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1705         .addMBB(TrueMBB).addImm(CC);
1706
1707       // X86 requires a second branch to handle UNE (and OEQ, which is mapped
1708       // to UNE above).
1709       if (NeedExtraBranch) {
1710         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1711           .addMBB(TrueMBB).addImm(X86::COND_P);
1712       }
1713
1714       finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1715       return true;
1716     }
1717   } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
1718     // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
1719     // typically happen for _Bool and C++ bools.
1720     MVT SourceVT;
1721     if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
1722         isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
1723       unsigned TestOpc = 0;
1724       switch (SourceVT.SimpleTy) {
1725       default: break;
1726       case MVT::i8:  TestOpc = X86::TEST8ri; break;
1727       case MVT::i16: TestOpc = X86::TEST16ri; break;
1728       case MVT::i32: TestOpc = X86::TEST32ri; break;
1729       case MVT::i64: TestOpc = X86::TEST64ri32; break;
1730       }
1731       if (TestOpc) {
1732         Register OpReg = getRegForValue(TI->getOperand(0));
1733         if (OpReg == 0) return false;
1734
1735         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TestOpc))
1736           .addReg(OpReg).addImm(1);
1737
1738         unsigned JmpCond = X86::COND_NE;
1739         if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
1740           std::swap(TrueMBB, FalseMBB);
1741           JmpCond = X86::COND_E;
1742         }
1743
1744         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1745           .addMBB(TrueMBB).addImm(JmpCond);
1746
1747         finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1748         return true;
1749       }
1750     }
1751   } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
1752     // Fake request the condition, otherwise the intrinsic might be completely
1753     // optimized away.
1754     Register TmpReg = getRegForValue(BI->getCondition());
1755     if (TmpReg == 0)
1756       return false;
1757
1758     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1759       .addMBB(TrueMBB).addImm(CC);
1760     finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1761     return true;
1762   }
1763
1764   // Otherwise do a clumsy setcc and re-test it.
1765   // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
1766   // in an explicit cast, so make sure to handle that correctly.
1767   Register OpReg = getRegForValue(BI->getCondition());
1768   if (OpReg == 0) return false;
1769
1770   // In case OpReg is a K register, COPY to a GPR
1771   if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
1772     unsigned KOpReg = OpReg;
1773     OpReg = createResultReg(&X86::GR32RegClass);
1774     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1775             TII.get(TargetOpcode::COPY), OpReg)
1776         .addReg(KOpReg);
1777     OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, X86::sub_8bit);
1778   }
1779   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TEST8ri))
1780       .addReg(OpReg)
1781       .addImm(1);
1782   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
1783     .addMBB(TrueMBB).addImm(X86::COND_NE);
1784   finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
1785   return true;
1786 }
1787
1788 bool X86FastISel::X86SelectShift(const Instruction *I) {
1789   unsigned CReg = 0, OpReg = 0;
1790   const TargetRegisterClass *RC = nullptr;
1791   if (I->getType()->isIntegerTy(8)) {
1792     CReg = X86::CL;
1793     RC = &X86::GR8RegClass;
1794     switch (I->getOpcode()) {
1795     case Instruction::LShr: OpReg = X86::SHR8rCL; break;
1796     case Instruction::AShr: OpReg = X86::SAR8rCL; break;
1797     case Instruction::Shl:  OpReg = X86::SHL8rCL; break;
1798     default: return false;
1799     }
1800   } else if (I->getType()->isIntegerTy(16)) {
1801     CReg = X86::CX;
1802     RC = &X86::GR16RegClass;
1803     switch (I->getOpcode()) {
1804     default: llvm_unreachable("Unexpected shift opcode");
1805     case Instruction::LShr: OpReg = X86::SHR16rCL; break;
1806     case Instruction::AShr: OpReg = X86::SAR16rCL; break;
1807     case Instruction::Shl:  OpReg = X86::SHL16rCL; break;
1808     }
1809   } else if (I->getType()->isIntegerTy(32)) {
1810     CReg = X86::ECX;
1811     RC = &X86::GR32RegClass;
1812     switch (I->getOpcode()) {
1813     default: llvm_unreachable("Unexpected shift opcode");
1814     case Instruction::LShr: OpReg = X86::SHR32rCL; break;
1815     case Instruction::AShr: OpReg = X86::SAR32rCL; break;
1816     case Instruction::Shl:  OpReg = X86::SHL32rCL; break;
1817     }
1818   } else if (I->getType()->isIntegerTy(64)) {
1819     CReg = X86::RCX;
1820     RC = &X86::GR64RegClass;
1821     switch (I->getOpcode()) {
1822     default: llvm_unreachable("Unexpected shift opcode");
1823     case Instruction::LShr: OpReg = X86::SHR64rCL; break;
1824     case Instruction::AShr: OpReg = X86::SAR64rCL; break;
1825     case Instruction::Shl:  OpReg = X86::SHL64rCL; break;
1826     }
1827   } else {
1828     return false;
1829   }
1830
1831   MVT VT;
1832   if (!isTypeLegal(I->getType(), VT))
1833     return false;
1834
1835   Register Op0Reg = getRegForValue(I->getOperand(0));
1836   if (Op0Reg == 0) return false;
1837
1838   Register Op1Reg = getRegForValue(I->getOperand(1));
1839   if (Op1Reg == 0) return false;
1840   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
1841           CReg).addReg(Op1Reg);
1842
1843   // The shift instruction uses X86::CL. If we defined a super-register
1844   // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
1845   if (CReg != X86::CL)
1846     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1847             TII.get(TargetOpcode::KILL), X86::CL)
1848       .addReg(CReg, RegState::Kill);
1849
1850   Register ResultReg = createResultReg(RC);
1851   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(OpReg), ResultReg)
1852     .addReg(Op0Reg);
1853   updateValueMap(I, ResultReg);
1854   return true;
1855 }
1856
1857 bool X86FastISel::X86SelectDivRem(const Instruction *I) {
1858   const static unsigned NumTypes = 4; // i8, i16, i32, i64
1859   const static unsigned NumOps   = 4; // SDiv, SRem, UDiv, URem
1860   const static bool S = true;  // IsSigned
1861   const static bool U = false; // !IsSigned
1862   const static unsigned Copy = TargetOpcode::COPY;
1863   // For the X86 DIV/IDIV instruction, in most cases the dividend
1864   // (numerator) must be in a specific register pair highreg:lowreg,
1865   // producing the quotient in lowreg and the remainder in highreg.
1866   // For most data types, to set up the instruction, the dividend is
1867   // copied into lowreg, and lowreg is sign-extended or zero-extended
1868   // into highreg.  The exception is i8, where the dividend is defined
1869   // as a single register rather than a register pair, and we
1870   // therefore directly sign-extend or zero-extend the dividend into
1871   // lowreg, instead of copying, and ignore the highreg.
1872   const static struct DivRemEntry {
1873     // The following portion depends only on the data type.
1874     const TargetRegisterClass *RC;
1875     unsigned LowInReg;  // low part of the register pair
1876     unsigned HighInReg; // high part of the register pair
1877     // The following portion depends on both the data type and the operation.
1878     struct DivRemResult {
1879     unsigned OpDivRem;        // The specific DIV/IDIV opcode to use.
1880     unsigned OpSignExtend;    // Opcode for sign-extending lowreg into
1881                               // highreg, or copying a zero into highreg.
1882     unsigned OpCopy;          // Opcode for copying dividend into lowreg, or
1883                               // zero/sign-extending into lowreg for i8.
1884     unsigned DivRemResultReg; // Register containing the desired result.
1885     bool IsOpSigned;          // Whether to use signed or unsigned form.
1886     } ResultTable[NumOps];
1887   } OpTable[NumTypes] = {
1888     { &X86::GR8RegClass,  X86::AX,  0, {
1889         { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AL,  S }, // SDiv
1890         { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AH,  S }, // SRem
1891         { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AL,  U }, // UDiv
1892         { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AH,  U }, // URem
1893       }
1894     }, // i8
1895     { &X86::GR16RegClass, X86::AX,  X86::DX, {
1896         { X86::IDIV16r, X86::CWD,     Copy,            X86::AX,  S }, // SDiv
1897         { X86::IDIV16r, X86::CWD,     Copy,            X86::DX,  S }, // SRem
1898         { X86::DIV16r,  X86::MOV32r0, Copy,            X86::AX,  U }, // UDiv
1899         { X86::DIV16r,  X86::MOV32r0, Copy,            X86::DX,  U }, // URem
1900       }
1901     }, // i16
1902     { &X86::GR32RegClass, X86::EAX, X86::EDX, {
1903         { X86::IDIV32r, X86::CDQ,     Copy,            X86::EAX, S }, // SDiv
1904         { X86::IDIV32r, X86::CDQ,     Copy,            X86::EDX, S }, // SRem
1905         { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EAX, U }, // UDiv
1906         { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EDX, U }, // URem
1907       }
1908     }, // i32
1909     { &X86::GR64RegClass, X86::RAX, X86::RDX, {
1910         { X86::IDIV64r, X86::CQO,     Copy,            X86::RAX, S }, // SDiv
1911         { X86::IDIV64r, X86::CQO,     Copy,            X86::RDX, S }, // SRem
1912         { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RAX, U }, // UDiv
1913         { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RDX, U }, // URem
1914       }
1915     }, // i64
1916   };
1917
1918   MVT VT;
1919   if (!isTypeLegal(I->getType(), VT))
1920     return false;
1921
1922   unsigned TypeIndex, OpIndex;
1923   switch (VT.SimpleTy) {
1924   default: return false;
1925   case MVT::i8:  TypeIndex = 0; break;
1926   case MVT::i16: TypeIndex = 1; break;
1927   case MVT::i32: TypeIndex = 2; break;
1928   case MVT::i64: TypeIndex = 3;
1929     if (!Subtarget->is64Bit())
1930       return false;
1931     break;
1932   }
1933
1934   switch (I->getOpcode()) {
1935   default: llvm_unreachable("Unexpected div/rem opcode");
1936   case Instruction::SDiv: OpIndex = 0; break;
1937   case Instruction::SRem: OpIndex = 1; break;
1938   case Instruction::UDiv: OpIndex = 2; break;
1939   case Instruction::URem: OpIndex = 3; break;
1940   }
1941
1942   const DivRemEntry &TypeEntry = OpTable[TypeIndex];
1943   const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
1944   Register Op0Reg = getRegForValue(I->getOperand(0));
1945   if (Op0Reg == 0)
1946     return false;
1947   Register Op1Reg = getRegForValue(I->getOperand(1));
1948   if (Op1Reg == 0)
1949     return false;
1950
1951   // Move op0 into low-order input register.
1952   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1953           TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
1954   // Zero-extend or sign-extend into high-order input register.
1955   if (OpEntry.OpSignExtend) {
1956     if (OpEntry.IsOpSigned)
1957       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1958               TII.get(OpEntry.OpSignExtend));
1959     else {
1960       Register Zero32 = createResultReg(&X86::GR32RegClass);
1961       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1962               TII.get(X86::MOV32r0), Zero32);
1963
1964       // Copy the zero into the appropriate sub/super/identical physical
1965       // register. Unfortunately the operations needed are not uniform enough
1966       // to fit neatly into the table above.
1967       if (VT == MVT::i16) {
1968         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1969                 TII.get(Copy), TypeEntry.HighInReg)
1970           .addReg(Zero32, 0, X86::sub_16bit);
1971       } else if (VT == MVT::i32) {
1972         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1973                 TII.get(Copy), TypeEntry.HighInReg)
1974             .addReg(Zero32);
1975       } else if (VT == MVT::i64) {
1976         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1977                 TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
1978             .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
1979       }
1980     }
1981   }
1982   // Generate the DIV/IDIV instruction.
1983   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
1984           TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
1985   // For i8 remainder, we can't reference ah directly, as we'll end
1986   // up with bogus copies like %r9b = COPY %ah. Reference ax
1987   // instead to prevent ah references in a rex instruction.
1988   //
1989   // The current assumption of the fast register allocator is that isel
1990   // won't generate explicit references to the GR8_NOREX registers. If
1991   // the allocator and/or the backend get enhanced to be more robust in
1992   // that regard, this can be, and should be, removed.
1993   unsigned ResultReg = 0;
1994   if ((I->getOpcode() == Instruction::SRem ||
1995        I->getOpcode() == Instruction::URem) &&
1996       OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
1997     Register SourceSuperReg = createResultReg(&X86::GR16RegClass);
1998     Register ResultSuperReg = createResultReg(&X86::GR16RegClass);
1999     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2000             TII.get(Copy), SourceSuperReg).addReg(X86::AX);
2001
2002     // Shift AX right by 8 bits instead of using AH.
2003     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SHR16ri),
2004             ResultSuperReg).addReg(SourceSuperReg).addImm(8);
2005
2006     // Now reference the 8-bit subreg of the result.
2007     ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
2008                                            X86::sub_8bit);
2009   }
2010   // Copy the result out of the physreg if we haven't already.
2011   if (!ResultReg) {
2012     ResultReg = createResultReg(TypeEntry.RC);
2013     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Copy), ResultReg)
2014         .addReg(OpEntry.DivRemResultReg);
2015   }
2016   updateValueMap(I, ResultReg);
2017
2018   return true;
2019 }
2020
2021 /// Emit a conditional move instruction (if the are supported) to lower
2022 /// the select.
2023 bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
2024   // Check if the subtarget supports these instructions.
2025   if (!Subtarget->canUseCMOV())
2026     return false;
2027
2028   // FIXME: Add support for i8.
2029   if (RetVT < MVT::i16 || RetVT > MVT::i64)
2030     return false;
2031
2032   const Value *Cond = I->getOperand(0);
2033   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2034   bool NeedTest = true;
2035   X86::CondCode CC = X86::COND_NE;
2036
2037   // Optimize conditions coming from a compare if both instructions are in the
2038   // same basic block (values defined in other basic blocks may not have
2039   // initialized registers).
2040   const auto *CI = dyn_cast<CmpInst>(Cond);
2041   if (CI && (CI->getParent() == I->getParent())) {
2042     CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
2043
2044     // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
2045     static const uint16_t SETFOpcTable[2][3] = {
2046       { X86::COND_NP, X86::COND_E,  X86::TEST8rr },
2047       { X86::COND_P,  X86::COND_NE, X86::OR8rr   }
2048     };
2049     const uint16_t *SETFOpc = nullptr;
2050     switch (Predicate) {
2051     default: break;
2052     case CmpInst::FCMP_OEQ:
2053       SETFOpc = &SETFOpcTable[0][0];
2054       Predicate = CmpInst::ICMP_NE;
2055       break;
2056     case CmpInst::FCMP_UNE:
2057       SETFOpc = &SETFOpcTable[1][0];
2058       Predicate = CmpInst::ICMP_NE;
2059       break;
2060     }
2061
2062     bool NeedSwap;
2063     std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
2064     assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
2065
2066     const Value *CmpLHS = CI->getOperand(0);
2067     const Value *CmpRHS = CI->getOperand(1);
2068     if (NeedSwap)
2069       std::swap(CmpLHS, CmpRHS);
2070
2071     EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
2072     // Emit a compare of the LHS and RHS, setting the flags.
2073     if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
2074       return false;
2075
2076     if (SETFOpc) {
2077       Register FlagReg1 = createResultReg(&X86::GR8RegClass);
2078       Register FlagReg2 = createResultReg(&X86::GR8RegClass);
2079       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
2080               FlagReg1).addImm(SETFOpc[0]);
2081       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
2082               FlagReg2).addImm(SETFOpc[1]);
2083       auto const &II = TII.get(SETFOpc[2]);
2084       if (II.getNumDefs()) {
2085         Register TmpReg = createResultReg(&X86::GR8RegClass);
2086         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, TmpReg)
2087           .addReg(FlagReg2).addReg(FlagReg1);
2088       } else {
2089         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
2090           .addReg(FlagReg2).addReg(FlagReg1);
2091       }
2092     }
2093     NeedTest = false;
2094   } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
2095     // Fake request the condition, otherwise the intrinsic might be completely
2096     // optimized away.
2097     Register TmpReg = getRegForValue(Cond);
2098     if (TmpReg == 0)
2099       return false;
2100
2101     NeedTest = false;
2102   }
2103
2104   if (NeedTest) {
2105     // Selects operate on i1, however, CondReg is 8 bits width and may contain
2106     // garbage. Indeed, only the less significant bit is supposed to be
2107     // accurate. If we read more than the lsb, we may see non-zero values
2108     // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for
2109     // the select. This is achieved by performing TEST against 1.
2110     Register CondReg = getRegForValue(Cond);
2111     if (CondReg == 0)
2112       return false;
2113
2114     // In case OpReg is a K register, COPY to a GPR
2115     if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
2116       unsigned KCondReg = CondReg;
2117       CondReg = createResultReg(&X86::GR32RegClass);
2118       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2119               TII.get(TargetOpcode::COPY), CondReg)
2120           .addReg(KCondReg);
2121       CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, X86::sub_8bit);
2122     }
2123     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TEST8ri))
2124         .addReg(CondReg)
2125         .addImm(1);
2126   }
2127
2128   const Value *LHS = I->getOperand(1);
2129   const Value *RHS = I->getOperand(2);
2130
2131   Register RHSReg = getRegForValue(RHS);
2132   Register LHSReg = getRegForValue(LHS);
2133   if (!LHSReg || !RHSReg)
2134     return false;
2135
2136   const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
2137   unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC) / 8, false,
2138                                     Subtarget->hasNDD());
2139   Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, LHSReg, CC);
2140   updateValueMap(I, ResultReg);
2141   return true;
2142 }
2143
2144 /// Emit SSE or AVX instructions to lower the select.
2145 ///
2146 /// Try to use SSE1/SSE2 instructions to simulate a select without branches.
2147 /// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
2148 /// SSE instructions are available. If AVX is available, try to use a VBLENDV.
2149 bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
2150   // Optimize conditions coming from a compare if both instructions are in the
2151   // same basic block (values defined in other basic blocks may not have
2152   // initialized registers).
2153   const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
2154   if (!CI || (CI->getParent() != I->getParent()))
2155     return false;
2156
2157   if (I->getType() != CI->getOperand(0)->getType() ||
2158       !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
2159         (Subtarget->hasSSE2() && RetVT == MVT::f64)))
2160     return false;
2161
2162   const Value *CmpLHS = CI->getOperand(0);
2163   const Value *CmpRHS = CI->getOperand(1);
2164   CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
2165
2166   // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
2167   // We don't have to materialize a zero constant for this case and can just use
2168   // %x again on the RHS.
2169   if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
2170     const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
2171     if (CmpRHSC && CmpRHSC->isNullValue())
2172       CmpRHS = CmpLHS;
2173   }
2174
2175   unsigned CC;
2176   bool NeedSwap;
2177   std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
2178   if (CC > 7 && !Subtarget->hasAVX())
2179     return false;
2180
2181   if (NeedSwap)
2182     std::swap(CmpLHS, CmpRHS);
2183
2184   const Value *LHS = I->getOperand(1);
2185   const Value *RHS = I->getOperand(2);
2186
2187   Register LHSReg = getRegForValue(LHS);
2188   Register RHSReg = getRegForValue(RHS);
2189   Register CmpLHSReg = getRegForValue(CmpLHS);
2190   Register CmpRHSReg = getRegForValue(CmpRHS);
2191   if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
2192     return false;
2193
2194   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2195   unsigned ResultReg;
2196
2197   if (Subtarget->hasAVX512()) {
2198     // If we have AVX512 we can use a mask compare and masked movss/sd.
2199     const TargetRegisterClass *VR128X = &X86::VR128XRegClass;
2200     const TargetRegisterClass *VK1 = &X86::VK1RegClass;
2201
2202     unsigned CmpOpcode =
2203       (RetVT == MVT::f32) ? X86::VCMPSSZrri : X86::VCMPSDZrri;
2204     Register CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpRHSReg,
2205                                        CC);
2206
2207     // Need an IMPLICIT_DEF for the input that is used to generate the upper
2208     // bits of the result register since its not based on any of the inputs.
2209     Register ImplicitDefReg = createResultReg(VR128X);
2210     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2211             TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2212
2213     // Place RHSReg is the passthru of the masked movss/sd operation and put
2214     // LHS in the input. The mask input comes from the compare.
2215     unsigned MovOpcode =
2216       (RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk;
2217     unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, CmpReg,
2218                                         ImplicitDefReg, LHSReg);
2219
2220     ResultReg = createResultReg(RC);
2221     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2222             TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg);
2223
2224   } else if (Subtarget->hasAVX()) {
2225     const TargetRegisterClass *VR128 = &X86::VR128RegClass;
2226
2227     // If we have AVX, create 1 blendv instead of 3 logic instructions.
2228     // Blendv was introduced with SSE 4.1, but the 2 register form implicitly
2229     // uses XMM0 as the selection register. That may need just as many
2230     // instructions as the AND/ANDN/OR sequence due to register moves, so
2231     // don't bother.
2232     unsigned CmpOpcode =
2233       (RetVT == MVT::f32) ? X86::VCMPSSrri : X86::VCMPSDrri;
2234     unsigned BlendOpcode =
2235       (RetVT == MVT::f32) ? X86::VBLENDVPSrrr : X86::VBLENDVPDrrr;
2236
2237     Register CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpRHSReg,
2238                                        CC);
2239     Register VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, LHSReg,
2240                                           CmpReg);
2241     ResultReg = createResultReg(RC);
2242     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2243             TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
2244   } else {
2245     // Choose the SSE instruction sequence based on data type (float or double).
2246     static const uint16_t OpcTable[2][4] = {
2247       { X86::CMPSSrri,  X86::ANDPSrr,  X86::ANDNPSrr,  X86::ORPSrr  },
2248       { X86::CMPSDrri,  X86::ANDPDrr,  X86::ANDNPDrr,  X86::ORPDrr  }
2249     };
2250
2251     const uint16_t *Opc = nullptr;
2252     switch (RetVT.SimpleTy) {
2253     default: return false;
2254     case MVT::f32: Opc = &OpcTable[0][0]; break;
2255     case MVT::f64: Opc = &OpcTable[1][0]; break;
2256     }
2257
2258     const TargetRegisterClass *VR128 = &X86::VR128RegClass;
2259     Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpRHSReg, CC);
2260     Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, LHSReg);
2261     Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, RHSReg);
2262     Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, AndReg);
2263     ResultReg = createResultReg(RC);
2264     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2265             TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
2266   }
2267   updateValueMap(I, ResultReg);
2268   return true;
2269 }
2270
2271 bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
2272   // These are pseudo CMOV instructions and will be later expanded into control-
2273   // flow.
2274   unsigned Opc;
2275   switch (RetVT.SimpleTy) {
2276   default: return false;
2277   case MVT::i8:  Opc = X86::CMOV_GR8;   break;
2278   case MVT::i16: Opc = X86::CMOV_GR16;  break;
2279   case MVT::i32: Opc = X86::CMOV_GR32;  break;
2280   case MVT::f16:
2281     Opc = Subtarget->hasAVX512() ? X86::CMOV_FR16X : X86::CMOV_FR16; break;
2282   case MVT::f32:
2283     Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X : X86::CMOV_FR32; break;
2284   case MVT::f64:
2285     Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X : X86::CMOV_FR64; break;
2286   }
2287
2288   const Value *Cond = I->getOperand(0);
2289   X86::CondCode CC = X86::COND_NE;
2290
2291   // Optimize conditions coming from a compare if both instructions are in the
2292   // same basic block (values defined in other basic blocks may not have
2293   // initialized registers).
2294   const auto *CI = dyn_cast<CmpInst>(Cond);
2295   if (CI && (CI->getParent() == I->getParent())) {
2296     bool NeedSwap;
2297     std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate());
2298     if (CC > X86::LAST_VALID_COND)
2299       return false;
2300
2301     const Value *CmpLHS = CI->getOperand(0);
2302     const Value *CmpRHS = CI->getOperand(1);
2303
2304     if (NeedSwap)
2305       std::swap(CmpLHS, CmpRHS);
2306
2307     EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
2308     if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
2309       return false;
2310   } else {
2311     Register CondReg = getRegForValue(Cond);
2312     if (CondReg == 0)
2313       return false;
2314
2315     // In case OpReg is a K register, COPY to a GPR
2316     if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
2317       unsigned KCondReg = CondReg;
2318       CondReg = createResultReg(&X86::GR32RegClass);
2319       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2320               TII.get(TargetOpcode::COPY), CondReg)
2321           .addReg(KCondReg);
2322       CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, X86::sub_8bit);
2323     }
2324     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TEST8ri))
2325         .addReg(CondReg)
2326         .addImm(1);
2327   }
2328
2329   const Value *LHS = I->getOperand(1);
2330   const Value *RHS = I->getOperand(2);
2331
2332   Register LHSReg = getRegForValue(LHS);
2333   Register RHSReg = getRegForValue(RHS);
2334   if (!LHSReg || !RHSReg)
2335     return false;
2336
2337   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2338
2339   Register ResultReg =
2340     fastEmitInst_rri(Opc, RC, RHSReg, LHSReg, CC);
2341   updateValueMap(I, ResultReg);
2342   return true;
2343 }
2344
2345 bool X86FastISel::X86SelectSelect(const Instruction *I) {
2346   MVT RetVT;
2347   if (!isTypeLegal(I->getType(), RetVT))
2348     return false;
2349
2350   // Check if we can fold the select.
2351   if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
2352     CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
2353     const Value *Opnd = nullptr;
2354     switch (Predicate) {
2355     default:                              break;
2356     case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
2357     case CmpInst::FCMP_TRUE:  Opnd = I->getOperand(1); break;
2358     }
2359     // No need for a select anymore - this is an unconditional move.
2360     if (Opnd) {
2361       Register OpReg = getRegForValue(Opnd);
2362       if (OpReg == 0)
2363         return false;
2364       const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
2365       Register ResultReg = createResultReg(RC);
2366       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2367               TII.get(TargetOpcode::COPY), ResultReg)
2368         .addReg(OpReg);
2369       updateValueMap(I, ResultReg);
2370       return true;
2371     }
2372   }
2373
2374   // First try to use real conditional move instructions.
2375   if (X86FastEmitCMoveSelect(RetVT, I))
2376     return true;
2377
2378   // Try to use a sequence of SSE instructions to simulate a conditional move.
2379   if (X86FastEmitSSESelect(RetVT, I))
2380     return true;
2381
2382   // Fall-back to pseudo conditional move instructions, which will be later
2383   // converted to control-flow.
2384   if (X86FastEmitPseudoSelect(RetVT, I))
2385     return true;
2386
2387   return false;
2388 }
2389
2390 // Common code for X86SelectSIToFP and X86SelectUIToFP.
2391 bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) {
2392   // The target-independent selection algorithm in FastISel already knows how
2393   // to select a SINT_TO_FP if the target is SSE but not AVX.
2394   // Early exit if the subtarget doesn't have AVX.
2395   // Unsigned conversion requires avx512.
2396   bool HasAVX512 = Subtarget->hasAVX512();
2397   if (!Subtarget->hasAVX() || (!IsSigned && !HasAVX512))
2398     return false;
2399
2400   // TODO: We could sign extend narrower types.
2401   EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
2402   if (SrcVT != MVT::i32 && SrcVT != MVT::i64)
2403     return false;
2404
2405   // Select integer to float/double conversion.
2406   Register OpReg = getRegForValue(I->getOperand(0));
2407   if (OpReg == 0)
2408     return false;
2409
2410   unsigned Opcode;
2411
2412   static const uint16_t SCvtOpc[2][2][2] = {
2413     { { X86::VCVTSI2SSrr,  X86::VCVTSI642SSrr },
2414       { X86::VCVTSI2SDrr,  X86::VCVTSI642SDrr } },
2415     { { X86::VCVTSI2SSZrr, X86::VCVTSI642SSZrr },
2416       { X86::VCVTSI2SDZrr, X86::VCVTSI642SDZrr } },
2417   };
2418   static const uint16_t UCvtOpc[2][2] = {
2419     { X86::VCVTUSI2SSZrr, X86::VCVTUSI642SSZrr },
2420     { X86::VCVTUSI2SDZrr, X86::VCVTUSI642SDZrr },
2421   };
2422   bool Is64Bit = SrcVT == MVT::i64;
2423
2424   if (I->getType()->isDoubleTy()) {
2425     // s/uitofp int -> double
2426     Opcode = IsSigned ? SCvtOpc[HasAVX512][1][Is64Bit] : UCvtOpc[1][Is64Bit];
2427   } else if (I->getType()->isFloatTy()) {
2428     // s/uitofp int -> float
2429     Opcode = IsSigned ? SCvtOpc[HasAVX512][0][Is64Bit] : UCvtOpc[0][Is64Bit];
2430   } else
2431     return false;
2432
2433   MVT DstVT = TLI.getValueType(DL, I->getType()).getSimpleVT();
2434   const TargetRegisterClass *RC = TLI.getRegClassFor(DstVT);
2435   Register ImplicitDefReg = createResultReg(RC);
2436   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2437           TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2438   Register ResultReg = fastEmitInst_rr(Opcode, RC, ImplicitDefReg, OpReg);
2439   updateValueMap(I, ResultReg);
2440   return true;
2441 }
2442
2443 bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
2444   return X86SelectIntToFP(I, /*IsSigned*/true);
2445 }
2446
2447 bool X86FastISel::X86SelectUIToFP(const Instruction *I) {
2448   return X86SelectIntToFP(I, /*IsSigned*/false);
2449 }
2450
2451 // Helper method used by X86SelectFPExt and X86SelectFPTrunc.
2452 bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
2453                                           unsigned TargetOpc,
2454                                           const TargetRegisterClass *RC) {
2455   assert((I->getOpcode() == Instruction::FPExt ||
2456           I->getOpcode() == Instruction::FPTrunc) &&
2457          "Instruction must be an FPExt or FPTrunc!");
2458   bool HasAVX = Subtarget->hasAVX();
2459
2460   Register OpReg = getRegForValue(I->getOperand(0));
2461   if (OpReg == 0)
2462     return false;
2463
2464   unsigned ImplicitDefReg;
2465   if (HasAVX) {
2466     ImplicitDefReg = createResultReg(RC);
2467     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2468             TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2469
2470   }
2471
2472   Register ResultReg = createResultReg(RC);
2473   MachineInstrBuilder MIB;
2474   MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpc),
2475                 ResultReg);
2476
2477   if (HasAVX)
2478     MIB.addReg(ImplicitDefReg);
2479
2480   MIB.addReg(OpReg);
2481   updateValueMap(I, ResultReg);
2482   return true;
2483 }
2484
2485 bool X86FastISel::X86SelectFPExt(const Instruction *I) {
2486   if (Subtarget->hasSSE2() && I->getType()->isDoubleTy() &&
2487       I->getOperand(0)->getType()->isFloatTy()) {
2488     bool HasAVX512 = Subtarget->hasAVX512();
2489     // fpext from float to double.
2490     unsigned Opc =
2491         HasAVX512 ? X86::VCVTSS2SDZrr
2492                   : Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
2493     return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f64));
2494   }
2495
2496   return false;
2497 }
2498
2499 bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
2500   if (Subtarget->hasSSE2() && I->getType()->isFloatTy() &&
2501       I->getOperand(0)->getType()->isDoubleTy()) {
2502     bool HasAVX512 = Subtarget->hasAVX512();
2503     // fptrunc from double to float.
2504     unsigned Opc =
2505         HasAVX512 ? X86::VCVTSD2SSZrr
2506                   : Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
2507     return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f32));
2508   }
2509
2510   return false;
2511 }
2512
2513 bool X86FastISel::X86SelectTrunc(const Instruction *I) {
2514   EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
2515   EVT DstVT = TLI.getValueType(DL, I->getType());
2516
2517   // This code only handles truncation to byte.
2518   if (DstVT != MVT::i8 && DstVT != MVT::i1)
2519     return false;
2520   if (!TLI.isTypeLegal(SrcVT))
2521     return false;
2522
2523   Register InputReg = getRegForValue(I->getOperand(0));
2524   if (!InputReg)
2525     // Unhandled operand.  Halt "fast" selection and bail.
2526     return false;
2527
2528   if (SrcVT == MVT::i8) {
2529     // Truncate from i8 to i1; no code needed.
2530     updateValueMap(I, InputReg);
2531     return true;
2532   }
2533
2534   // Issue an extract_subreg.
2535   Register ResultReg = fastEmitInst_extractsubreg(MVT::i8, InputReg,
2536                                                   X86::sub_8bit);
2537   if (!ResultReg)
2538     return false;
2539
2540   updateValueMap(I, ResultReg);
2541   return true;
2542 }
2543
2544 bool X86FastISel::IsMemcpySmall(uint64_t Len) {
2545   return Len <= (Subtarget->is64Bit() ? 32 : 16);
2546 }
2547
2548 bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
2549                                      X86AddressMode SrcAM, uint64_t Len) {
2550
2551   // Make sure we don't bloat code by inlining very large memcpy's.
2552   if (!IsMemcpySmall(Len))
2553     return false;
2554
2555   bool i64Legal = Subtarget->is64Bit();
2556
2557   // We don't care about alignment here since we just emit integer accesses.
2558   while (Len) {
2559     MVT VT;
2560     if (Len >= 8 && i64Legal)
2561       VT = MVT::i64;
2562     else if (Len >= 4)
2563       VT = MVT::i32;
2564     else if (Len >= 2)
2565       VT = MVT::i16;
2566     else
2567       VT = MVT::i8;
2568
2569     unsigned Reg;
2570     bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
2571     RV &= X86FastEmitStore(VT, Reg, DestAM);
2572     assert(RV && "Failed to emit load or store??");
2573     (void)RV;
2574
2575     unsigned Size = VT.getSizeInBits()/8;
2576     Len -= Size;
2577     DestAM.Disp += Size;
2578     SrcAM.Disp += Size;
2579   }
2580
2581   return true;
2582 }
2583
2584 bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
2585   // FIXME: Handle more intrinsics.
2586   switch (II->getIntrinsicID()) {
2587   default: return false;
2588   case Intrinsic::convert_from_fp16:
2589   case Intrinsic::convert_to_fp16: {
2590     if (Subtarget->useSoftFloat() || !Subtarget->hasF16C())
2591       return false;
2592
2593     const Value *Op = II->getArgOperand(0);
2594     Register InputReg = getRegForValue(Op);
2595     if (InputReg == 0)
2596       return false;
2597
2598     // F16C only allows converting from float to half and from half to float.
2599     bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16;
2600     if (IsFloatToHalf) {
2601       if (!Op->getType()->isFloatTy())
2602         return false;
2603     } else {
2604       if (!II->getType()->isFloatTy())
2605         return false;
2606     }
2607
2608     unsigned ResultReg = 0;
2609     const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16);
2610     if (IsFloatToHalf) {
2611       // 'InputReg' is implicitly promoted from register class FR32 to
2612       // register class VR128 by method 'constrainOperandRegClass' which is
2613       // directly called by 'fastEmitInst_ri'.
2614       // Instruction VCVTPS2PHrr takes an extra immediate operand which is
2615       // used to provide rounding control: use MXCSR.RC, encoded as 0b100.
2616       // It's consistent with the other FP instructions, which are usually
2617       // controlled by MXCSR.
2618       unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPS2PHZ128rr
2619                                          : X86::VCVTPS2PHrr;
2620       InputReg = fastEmitInst_ri(Opc, RC, InputReg, 4);
2621
2622       // Move the lower 32-bits of ResultReg to another register of class GR32.
2623       Opc = Subtarget->hasAVX512() ? X86::VMOVPDI2DIZrr
2624                                    : X86::VMOVPDI2DIrr;
2625       ResultReg = createResultReg(&X86::GR32RegClass);
2626       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg)
2627           .addReg(InputReg, RegState::Kill);
2628
2629       // The result value is in the lower 16-bits of ResultReg.
2630       unsigned RegIdx = X86::sub_16bit;
2631       ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, RegIdx);
2632     } else {
2633       assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
2634       // Explicitly zero-extend the input to 32-bit.
2635       InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg);
2636
2637       // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
2638       InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
2639                             InputReg);
2640
2641       unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr
2642                                          : X86::VCVTPH2PSrr;
2643       InputReg = fastEmitInst_r(Opc, RC, InputReg);
2644
2645       // The result value is in the lower 32-bits of ResultReg.
2646       // Emit an explicit copy from register class VR128 to register class FR32.
2647       ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
2648       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2649               TII.get(TargetOpcode::COPY), ResultReg)
2650           .addReg(InputReg, RegState::Kill);
2651     }
2652
2653     updateValueMap(II, ResultReg);
2654     return true;
2655   }
2656   case Intrinsic::frameaddress: {
2657     MachineFunction *MF = FuncInfo.MF;
2658     if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI())
2659       return false;
2660
2661     Type *RetTy = II->getCalledFunction()->getReturnType();
2662
2663     MVT VT;
2664     if (!isTypeLegal(RetTy, VT))
2665       return false;
2666
2667     unsigned Opc;
2668     const TargetRegisterClass *RC = nullptr;
2669
2670     switch (VT.SimpleTy) {
2671     default: llvm_unreachable("Invalid result type for frameaddress.");
2672     case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
2673     case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
2674     }
2675
2676     // This needs to be set before we call getPtrSizedFrameRegister, otherwise
2677     // we get the wrong frame register.
2678     MachineFrameInfo &MFI = MF->getFrameInfo();
2679     MFI.setFrameAddressIsTaken(true);
2680
2681     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2682     unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);
2683     assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
2684             (FrameReg == X86::EBP && VT == MVT::i32)) &&
2685            "Invalid Frame Register!");
2686
2687     // Always make a copy of the frame register to a vreg first, so that we
2688     // never directly reference the frame register (the TwoAddressInstruction-
2689     // Pass doesn't like that).
2690     Register SrcReg = createResultReg(RC);
2691     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2692             TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
2693
2694     // Now recursively load from the frame address.
2695     // movq (%rbp), %rax
2696     // movq (%rax), %rax
2697     // movq (%rax), %rax
2698     // ...
2699     unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
2700     while (Depth--) {
2701       Register DestReg = createResultReg(RC);
2702       addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2703                            TII.get(Opc), DestReg), SrcReg);
2704       SrcReg = DestReg;
2705     }
2706
2707     updateValueMap(II, SrcReg);
2708     return true;
2709   }
2710   case Intrinsic::memcpy: {
2711     const MemCpyInst *MCI = cast<MemCpyInst>(II);
2712     // Don't handle volatile or variable length memcpys.
2713     if (MCI->isVolatile())
2714       return false;
2715
2716     if (isa<ConstantInt>(MCI->getLength())) {
2717       // Small memcpy's are common enough that we want to do them
2718       // without a call if possible.
2719       uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
2720       if (IsMemcpySmall(Len)) {
2721         X86AddressMode DestAM, SrcAM;
2722         if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
2723             !X86SelectAddress(MCI->getRawSource(), SrcAM))
2724           return false;
2725         TryEmitSmallMemcpy(DestAM, SrcAM, Len);
2726         return true;
2727       }
2728     }
2729
2730     unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
2731     if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
2732       return false;
2733
2734     if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
2735       return false;
2736
2737     return lowerCallTo(II, "memcpy", II->arg_size() - 1);
2738   }
2739   case Intrinsic::memset: {
2740     const MemSetInst *MSI = cast<MemSetInst>(II);
2741
2742     if (MSI->isVolatile())
2743       return false;
2744
2745     unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
2746     if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
2747       return false;
2748
2749     if (MSI->getDestAddressSpace() > 255)
2750       return false;
2751
2752     return lowerCallTo(II, "memset", II->arg_size() - 1);
2753   }
2754   case Intrinsic::stackprotector: {
2755     // Emit code to store the stack guard onto the stack.
2756     EVT PtrTy = TLI.getPointerTy(DL);
2757
2758     const Value *Op1 = II->getArgOperand(0); // The guard's value.
2759     const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
2760
2761     MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
2762
2763     // Grab the frame index.
2764     X86AddressMode AM;
2765     if (!X86SelectAddress(Slot, AM)) return false;
2766     if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
2767     return true;
2768   }
2769   case Intrinsic::dbg_declare: {
2770     const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
2771     X86AddressMode AM;
2772     assert(DI->getAddress() && "Null address should be checked earlier!");
2773     if (!X86SelectAddress(DI->getAddress(), AM))
2774       return false;
2775     const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
2776     assert(DI->getVariable()->isValidLocationForIntrinsic(MIMD.getDL()) &&
2777            "Expected inlined-at fields to agree");
2778     addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II), AM)
2779         .addImm(0)
2780         .addMetadata(DI->getVariable())
2781         .addMetadata(DI->getExpression());
2782     return true;
2783   }
2784   case Intrinsic::trap: {
2785     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TRAP));
2786     return true;
2787   }
2788   case Intrinsic::sqrt: {
2789     if (!Subtarget->hasSSE1())
2790       return false;
2791
2792     Type *RetTy = II->getCalledFunction()->getReturnType();
2793
2794     MVT VT;
2795     if (!isTypeLegal(RetTy, VT))
2796       return false;
2797
2798     // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
2799     // is not generated by FastISel yet.
2800     // FIXME: Update this code once tablegen can handle it.
2801     static const uint16_t SqrtOpc[3][2] = {
2802       { X86::SQRTSSr,   X86::SQRTSDr },
2803       { X86::VSQRTSSr,  X86::VSQRTSDr },
2804       { X86::VSQRTSSZr, X86::VSQRTSDZr },
2805     };
2806     unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
2807                         Subtarget->hasAVX()    ? 1 :
2808                                                  0;
2809     unsigned Opc;
2810     switch (VT.SimpleTy) {
2811     default: return false;
2812     case MVT::f32: Opc = SqrtOpc[AVXLevel][0]; break;
2813     case MVT::f64: Opc = SqrtOpc[AVXLevel][1]; break;
2814     }
2815
2816     const Value *SrcVal = II->getArgOperand(0);
2817     Register SrcReg = getRegForValue(SrcVal);
2818
2819     if (SrcReg == 0)
2820       return false;
2821
2822     const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
2823     unsigned ImplicitDefReg = 0;
2824     if (AVXLevel > 0) {
2825       ImplicitDefReg = createResultReg(RC);
2826       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2827               TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2828     }
2829
2830     Register ResultReg = createResultReg(RC);
2831     MachineInstrBuilder MIB;
2832     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc),
2833                   ResultReg);
2834
2835     if (ImplicitDefReg)
2836       MIB.addReg(ImplicitDefReg);
2837
2838     MIB.addReg(SrcReg);
2839
2840     updateValueMap(II, ResultReg);
2841     return true;
2842   }
2843   case Intrinsic::sadd_with_overflow:
2844   case Intrinsic::uadd_with_overflow:
2845   case Intrinsic::ssub_with_overflow:
2846   case Intrinsic::usub_with_overflow:
2847   case Intrinsic::smul_with_overflow:
2848   case Intrinsic::umul_with_overflow: {
2849     // This implements the basic lowering of the xalu with overflow intrinsics
2850     // into add/sub/mul followed by either seto or setb.
2851     const Function *Callee = II->getCalledFunction();
2852     auto *Ty = cast<StructType>(Callee->getReturnType());
2853     Type *RetTy = Ty->getTypeAtIndex(0U);
2854     assert(Ty->getTypeAtIndex(1)->isIntegerTy() &&
2855            Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 &&
2856            "Overflow value expected to be an i1");
2857
2858     MVT VT;
2859     if (!isTypeLegal(RetTy, VT))
2860       return false;
2861
2862     if (VT < MVT::i8 || VT > MVT::i64)
2863       return false;
2864
2865     const Value *LHS = II->getArgOperand(0);
2866     const Value *RHS = II->getArgOperand(1);
2867
2868     // Canonicalize immediate to the RHS.
2869     if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative())
2870       std::swap(LHS, RHS);
2871
2872     unsigned BaseOpc, CondCode;
2873     switch (II->getIntrinsicID()) {
2874     default: llvm_unreachable("Unexpected intrinsic!");
2875     case Intrinsic::sadd_with_overflow:
2876       BaseOpc = ISD::ADD; CondCode = X86::COND_O; break;
2877     case Intrinsic::uadd_with_overflow:
2878       BaseOpc = ISD::ADD; CondCode = X86::COND_B; break;
2879     case Intrinsic::ssub_with_overflow:
2880       BaseOpc = ISD::SUB; CondCode = X86::COND_O; break;
2881     case Intrinsic::usub_with_overflow:
2882       BaseOpc = ISD::SUB; CondCode = X86::COND_B; break;
2883     case Intrinsic::smul_with_overflow:
2884       BaseOpc = X86ISD::SMUL; CondCode = X86::COND_O; break;
2885     case Intrinsic::umul_with_overflow:
2886       BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break;
2887     }
2888
2889     Register LHSReg = getRegForValue(LHS);
2890     if (LHSReg == 0)
2891       return false;
2892
2893     unsigned ResultReg = 0;
2894     // Check if we have an immediate version.
2895     if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
2896       static const uint16_t Opc[2][4] = {
2897         { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
2898         { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
2899       };
2900
2901       if (CI->isOne() && (BaseOpc == ISD::ADD || BaseOpc == ISD::SUB) &&
2902           CondCode == X86::COND_O) {
2903         // We can use INC/DEC.
2904         ResultReg = createResultReg(TLI.getRegClassFor(VT));
2905         bool IsDec = BaseOpc == ISD::SUB;
2906         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2907                 TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
2908           .addReg(LHSReg);
2909       } else
2910         ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, CI->getZExtValue());
2911     }
2912
2913     unsigned RHSReg;
2914     if (!ResultReg) {
2915       RHSReg = getRegForValue(RHS);
2916       if (RHSReg == 0)
2917         return false;
2918       ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, RHSReg);
2919     }
2920
2921     // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
2922     // it manually.
2923     if (BaseOpc == X86ISD::UMUL && !ResultReg) {
2924       static const uint16_t MULOpc[] =
2925         { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
2926       static const MCPhysReg Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
2927       // First copy the first operand into RAX, which is an implicit input to
2928       // the X86::MUL*r instruction.
2929       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2930               TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
2931         .addReg(LHSReg);
2932       ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
2933                                  TLI.getRegClassFor(VT), RHSReg);
2934     } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
2935       static const uint16_t MULOpc[] =
2936         { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
2937       if (VT == MVT::i8) {
2938         // Copy the first operand into AL, which is an implicit input to the
2939         // X86::IMUL8r instruction.
2940         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
2941                TII.get(TargetOpcode::COPY), X86::AL)
2942           .addReg(LHSReg);
2943         ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg);
2944       } else
2945         ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
2946                                     TLI.getRegClassFor(VT), LHSReg, RHSReg);
2947     }
2948
2949     if (!ResultReg)
2950       return false;
2951
2952     // Assign to a GPR since the overflow return value is lowered to a SETcc.
2953     Register ResultReg2 = createResultReg(&X86::GR8RegClass);
2954     assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
2955     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
2956             ResultReg2).addImm(CondCode);
2957
2958     updateValueMap(II, ResultReg, 2);
2959     return true;
2960   }
2961   case Intrinsic::x86_sse_cvttss2si:
2962   case Intrinsic::x86_sse_cvttss2si64:
2963   case Intrinsic::x86_sse2_cvttsd2si:
2964   case Intrinsic::x86_sse2_cvttsd2si64: {
2965     bool IsInputDouble;
2966     switch (II->getIntrinsicID()) {
2967     default: llvm_unreachable("Unexpected intrinsic.");
2968     case Intrinsic::x86_sse_cvttss2si:
2969     case Intrinsic::x86_sse_cvttss2si64:
2970       if (!Subtarget->hasSSE1())
2971         return false;
2972       IsInputDouble = false;
2973       break;
2974     case Intrinsic::x86_sse2_cvttsd2si:
2975     case Intrinsic::x86_sse2_cvttsd2si64:
2976       if (!Subtarget->hasSSE2())
2977         return false;
2978       IsInputDouble = true;
2979       break;
2980     }
2981
2982     Type *RetTy = II->getCalledFunction()->getReturnType();
2983     MVT VT;
2984     if (!isTypeLegal(RetTy, VT))
2985       return false;
2986
2987     static const uint16_t CvtOpc[3][2][2] = {
2988       { { X86::CVTTSS2SIrr,   X86::CVTTSS2SI64rr },
2989         { X86::CVTTSD2SIrr,   X86::CVTTSD2SI64rr } },
2990       { { X86::VCVTTSS2SIrr,  X86::VCVTTSS2SI64rr },
2991         { X86::VCVTTSD2SIrr,  X86::VCVTTSD2SI64rr } },
2992       { { X86::VCVTTSS2SIZrr, X86::VCVTTSS2SI64Zrr },
2993         { X86::VCVTTSD2SIZrr, X86::VCVTTSD2SI64Zrr } },
2994     };
2995     unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :
2996                         Subtarget->hasAVX()    ? 1 :
2997                                                  0;
2998     unsigned Opc;
2999     switch (VT.SimpleTy) {
3000     default: llvm_unreachable("Unexpected result type.");
3001     case MVT::i32: Opc = CvtOpc[AVXLevel][IsInputDouble][0]; break;
3002     case MVT::i64: Opc = CvtOpc[AVXLevel][IsInputDouble][1]; break;
3003     }
3004
3005     // Check if we can fold insertelement instructions into the convert.
3006     const Value *Op = II->getArgOperand(0);
3007     while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
3008       const Value *Index = IE->getOperand(2);
3009       if (!isa<ConstantInt>(Index))
3010         break;
3011       unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
3012
3013       if (Idx == 0) {
3014         Op = IE->getOperand(1);
3015         break;
3016       }
3017       Op = IE->getOperand(0);
3018     }
3019
3020     Register Reg = getRegForValue(Op);
3021     if (Reg == 0)
3022       return false;
3023
3024     Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
3025     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg)
3026       .addReg(Reg);
3027
3028     updateValueMap(II, ResultReg);
3029     return true;
3030   }
3031   case Intrinsic::x86_sse42_crc32_32_8:
3032   case Intrinsic::x86_sse42_crc32_32_16:
3033   case Intrinsic::x86_sse42_crc32_32_32:
3034   case Intrinsic::x86_sse42_crc32_64_64: {
3035     if (!Subtarget->hasCRC32())
3036       return false;
3037
3038     Type *RetTy = II->getCalledFunction()->getReturnType();
3039
3040     MVT VT;
3041     if (!isTypeLegal(RetTy, VT))
3042       return false;
3043
3044     unsigned Opc;
3045     const TargetRegisterClass *RC = nullptr;
3046
3047     switch (II->getIntrinsicID()) {
3048     default:
3049       llvm_unreachable("Unexpected intrinsic.");
3050 #define GET_EGPR_IF_ENABLED(OPC) Subtarget->hasEGPR() ? OPC##_EVEX : OPC
3051     case Intrinsic::x86_sse42_crc32_32_8:
3052       Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r8);
3053       RC = &X86::GR32RegClass;
3054       break;
3055     case Intrinsic::x86_sse42_crc32_32_16:
3056       Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r16);
3057       RC = &X86::GR32RegClass;
3058       break;
3059     case Intrinsic::x86_sse42_crc32_32_32:
3060       Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r32);
3061       RC = &X86::GR32RegClass;
3062       break;
3063     case Intrinsic::x86_sse42_crc32_64_64:
3064       Opc = GET_EGPR_IF_ENABLED(X86::CRC32r64r64);
3065       RC = &X86::GR64RegClass;
3066       break;
3067 #undef GET_EGPR_IF_ENABLED
3068     }
3069
3070     const Value *LHS = II->getArgOperand(0);
3071     const Value *RHS = II->getArgOperand(1);
3072
3073     Register LHSReg = getRegForValue(LHS);
3074     Register RHSReg = getRegForValue(RHS);
3075     if (!LHSReg || !RHSReg)
3076       return false;
3077
3078     Register ResultReg = fastEmitInst_rr(Opc, RC, LHSReg, RHSReg);
3079     if (!ResultReg)
3080       return false;
3081
3082     updateValueMap(II, ResultReg);
3083     return true;
3084   }
3085   }
3086 }
3087
3088 bool X86FastISel::fastLowerArguments() {
3089   if (!FuncInfo.CanLowerReturn)
3090     return false;
3091
3092   const Function *F = FuncInfo.Fn;
3093   if (F->isVarArg())
3094     return false;
3095
3096   CallingConv::ID CC = F->getCallingConv();
3097   if (CC != CallingConv::C)
3098     return false;
3099
3100   if (Subtarget->isCallingConvWin64(CC))
3101     return false;
3102
3103   if (!Subtarget->is64Bit())
3104     return false;
3105
3106   if (Subtarget->useSoftFloat())
3107     return false;
3108
3109   // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments.
3110   unsigned GPRCnt = 0;
3111   unsigned FPRCnt = 0;
3112   for (auto const &Arg : F->args()) {
3113     if (Arg.hasAttribute(Attribute::ByVal) ||
3114         Arg.hasAttribute(Attribute::InReg) ||
3115         Arg.hasAttribute(Attribute::StructRet) ||
3116         Arg.hasAttribute(Attribute::SwiftSelf) ||
3117         Arg.hasAttribute(Attribute::SwiftAsync) ||
3118         Arg.hasAttribute(Attribute::SwiftError) ||
3119         Arg.hasAttribute(Attribute::Nest))
3120       return false;
3121
3122     Type *ArgTy = Arg.getType();
3123     if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
3124       return false;
3125
3126     EVT ArgVT = TLI.getValueType(DL, ArgTy);
3127     if (!ArgVT.isSimple()) return false;
3128     switch (ArgVT.getSimpleVT().SimpleTy) {
3129     default: return false;
3130     case MVT::i32:
3131     case MVT::i64:
3132       ++GPRCnt;
3133       break;
3134     case MVT::f32:
3135     case MVT::f64:
3136       if (!Subtarget->hasSSE1())
3137         return false;
3138       ++FPRCnt;
3139       break;
3140     }
3141
3142     if (GPRCnt > 6)
3143       return false;
3144
3145     if (FPRCnt > 8)
3146       return false;
3147   }
3148
3149   static const MCPhysReg GPR32ArgRegs[] = {
3150     X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
3151   };
3152   static const MCPhysReg GPR64ArgRegs[] = {
3153     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
3154   };
3155   static const MCPhysReg XMMArgRegs[] = {
3156     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3157     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3158   };
3159
3160   unsigned GPRIdx = 0;
3161   unsigned FPRIdx = 0;
3162   for (auto const &Arg : F->args()) {
3163     MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
3164     const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
3165     unsigned SrcReg;
3166     switch (VT.SimpleTy) {
3167     default: llvm_unreachable("Unexpected value type.");
3168     case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
3169     case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
3170     case MVT::f32: [[fallthrough]];
3171     case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
3172     }
3173     Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
3174     // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
3175     // Without this, EmitLiveInCopies may eliminate the livein if its only
3176     // use is a bitcast (which isn't turned into an instruction).
3177     Register ResultReg = createResultReg(RC);
3178     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3179             TII.get(TargetOpcode::COPY), ResultReg)
3180       .addReg(DstReg, getKillRegState(true));
3181     updateValueMap(&Arg, ResultReg);
3182   }
3183   return true;
3184 }
3185
3186 static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
3187                                                   CallingConv::ID CC,
3188                                                   const CallBase *CB) {
3189   if (Subtarget->is64Bit())
3190     return 0;
3191   if (Subtarget->getTargetTriple().isOSMSVCRT())
3192     return 0;
3193   if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3194       CC == CallingConv::HiPE || CC == CallingConv::Tail ||
3195       CC == CallingConv::SwiftTail)
3196     return 0;
3197
3198   if (CB)
3199     if (CB->arg_empty() || !CB->paramHasAttr(0, Attribute::StructRet) ||
3200         CB->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU())
3201       return 0;
3202
3203   return 4;
3204 }
3205
3206 bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
3207   auto &OutVals       = CLI.OutVals;
3208   auto &OutFlags      = CLI.OutFlags;
3209   auto &OutRegs       = CLI.OutRegs;
3210   auto &Ins           = CLI.Ins;
3211   auto &InRegs        = CLI.InRegs;
3212   CallingConv::ID CC  = CLI.CallConv;
3213   bool &IsTailCall    = CLI.IsTailCall;
3214   bool IsVarArg       = CLI.IsVarArg;
3215   const Value *Callee = CLI.Callee;
3216   MCSymbol *Symbol    = CLI.Symbol;
3217   const auto *CB      = CLI.CB;
3218
3219   bool Is64Bit        = Subtarget->is64Bit();
3220   bool IsWin64        = Subtarget->isCallingConvWin64(CC);
3221
3222   // Call / invoke instructions with NoCfCheck attribute require special
3223   // handling.
3224   if (CB && CB->doesNoCfCheck())
3225     return false;
3226
3227   // Functions with no_caller_saved_registers that need special handling.
3228   if ((CB && isa<CallInst>(CB) && CB->hasFnAttr("no_caller_saved_registers")))
3229     return false;
3230
3231   // Functions with no_callee_saved_registers that need special handling.
3232   if ((CB && CB->hasFnAttr("no_callee_saved_registers")))
3233     return false;
3234
3235   // Indirect calls with CFI checks need special handling.
3236   if (CB && CB->isIndirectCall() && CB->getOperandBundle(LLVMContext::OB_kcfi))
3237     return false;
3238
3239   // Functions using thunks for indirect calls need to use SDISel.
3240   if (Subtarget->useIndirectThunkCalls())
3241     return false;
3242
3243   // Handle only C and fastcc calling conventions for now.
3244   switch (CC) {
3245   default: return false;
3246   case CallingConv::C:
3247   case CallingConv::Fast:
3248   case CallingConv::Tail:
3249   case CallingConv::Swift:
3250   case CallingConv::SwiftTail:
3251   case CallingConv::X86_FastCall:
3252   case CallingConv::X86_StdCall:
3253   case CallingConv::X86_ThisCall:
3254   case CallingConv::Win64:
3255   case CallingConv::X86_64_SysV:
3256   case CallingConv::CFGuard_Check:
3257     break;
3258   }
3259
3260   // Allow SelectionDAG isel to handle tail calls.
3261   if (IsTailCall)
3262     return false;
3263
3264   // fastcc with -tailcallopt is intended to provide a guaranteed
3265   // tail call optimization. Fastisel doesn't know how to do that.
3266   if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
3267       CC == CallingConv::Tail || CC == CallingConv::SwiftTail)
3268     return false;
3269
3270   // Don't know how to handle Win64 varargs yet.  Nothing special needed for
3271   // x86-32. Special handling for x86-64 is implemented.
3272   if (IsVarArg && IsWin64)
3273     return false;
3274
3275   // Don't know about inalloca yet.
3276   if (CLI.CB && CLI.CB->hasInAllocaArgument())
3277     return false;
3278
3279   for (auto Flag : CLI.OutFlags)
3280     if (Flag.isSwiftError() || Flag.isPreallocated())
3281       return false;
3282
3283   SmallVector<MVT, 16> OutVTs;
3284   SmallVector<unsigned, 16> ArgRegs;
3285
3286   // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
3287   // instruction. This is safe because it is common to all FastISel supported
3288   // calling conventions on x86.
3289   for (int i = 0, e = OutVals.size(); i != e; ++i) {
3290     Value *&Val = OutVals[i];
3291     ISD::ArgFlagsTy Flags = OutFlags[i];
3292     if (auto *CI = dyn_cast<ConstantInt>(Val)) {
3293       if (CI->getBitWidth() < 32) {
3294         if (Flags.isSExt())
3295           Val = ConstantInt::get(CI->getContext(), CI->getValue().sext(32));
3296         else
3297           Val = ConstantInt::get(CI->getContext(), CI->getValue().zext(32));
3298       }
3299     }
3300
3301     // Passing bools around ends up doing a trunc to i1 and passing it.
3302     // Codegen this as an argument + "and 1".
3303     MVT VT;
3304     auto *TI = dyn_cast<TruncInst>(Val);
3305     unsigned ResultReg;
3306     if (TI && TI->getType()->isIntegerTy(1) && CLI.CB &&
3307         (TI->getParent() == CLI.CB->getParent()) && TI->hasOneUse()) {
3308       Value *PrevVal = TI->getOperand(0);
3309       ResultReg = getRegForValue(PrevVal);
3310
3311       if (!ResultReg)
3312         return false;
3313
3314       if (!isTypeLegal(PrevVal->getType(), VT))
3315         return false;
3316
3317       ResultReg = fastEmit_ri(VT, VT, ISD::AND, ResultReg, 1);
3318     } else {
3319       if (!isTypeLegal(Val->getType(), VT) ||
3320           (VT.isVector() && VT.getVectorElementType() == MVT::i1))
3321         return false;
3322       ResultReg = getRegForValue(Val);
3323     }
3324
3325     if (!ResultReg)
3326       return false;
3327
3328     ArgRegs.push_back(ResultReg);
3329     OutVTs.push_back(VT);
3330   }
3331
3332   // Analyze operands of the call, assigning locations to each operand.
3333   SmallVector<CCValAssign, 16> ArgLocs;
3334   CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
3335
3336   // Allocate shadow area for Win64
3337   if (IsWin64)
3338     CCInfo.AllocateStack(32, Align(8));
3339
3340   CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
3341
3342   // Get a count of how many bytes are to be pushed on the stack.
3343   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3344
3345   // Issue CALLSEQ_START
3346   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
3347   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(AdjStackDown))
3348     .addImm(NumBytes).addImm(0).addImm(0);
3349
3350   // Walk the register/memloc assignments, inserting copies/loads.
3351   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3352   for (const CCValAssign &VA : ArgLocs) {
3353     const Value *ArgVal = OutVals[VA.getValNo()];
3354     MVT ArgVT = OutVTs[VA.getValNo()];
3355
3356     if (ArgVT == MVT::x86mmx)
3357       return false;
3358
3359     unsigned ArgReg = ArgRegs[VA.getValNo()];
3360
3361     // Promote the value if needed.
3362     switch (VA.getLocInfo()) {
3363     case CCValAssign::Full: break;
3364     case CCValAssign::SExt: {
3365       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
3366              "Unexpected extend");
3367
3368       if (ArgVT == MVT::i1)
3369         return false;
3370
3371       bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
3372                                        ArgVT, ArgReg);
3373       assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
3374       ArgVT = VA.getLocVT();
3375       break;
3376     }
3377     case CCValAssign::ZExt: {
3378       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
3379              "Unexpected extend");
3380
3381       // Handle zero-extension from i1 to i8, which is common.
3382       if (ArgVT == MVT::i1) {
3383         // Set the high bits to zero.
3384         ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg);
3385         ArgVT = MVT::i8;
3386
3387         if (ArgReg == 0)
3388           return false;
3389       }
3390
3391       bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
3392                                        ArgVT, ArgReg);
3393       assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
3394       ArgVT = VA.getLocVT();
3395       break;
3396     }
3397     case CCValAssign::AExt: {
3398       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
3399              "Unexpected extend");
3400       bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
3401                                        ArgVT, ArgReg);
3402       if (!Emitted)
3403         Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
3404                                     ArgVT, ArgReg);
3405       if (!Emitted)
3406         Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
3407                                     ArgVT, ArgReg);
3408
3409       assert(Emitted && "Failed to emit a aext!"); (void)Emitted;
3410       ArgVT = VA.getLocVT();
3411       break;
3412     }
3413     case CCValAssign::BCvt: {
3414       ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg);
3415       assert(ArgReg && "Failed to emit a bitcast!");
3416       ArgVT = VA.getLocVT();
3417       break;
3418     }
3419     case CCValAssign::VExt:
3420       // VExt has not been implemented, so this should be impossible to reach
3421       // for now.  However, fallback to Selection DAG isel once implemented.
3422       return false;
3423     case CCValAssign::AExtUpper:
3424     case CCValAssign::SExtUpper:
3425     case CCValAssign::ZExtUpper:
3426     case CCValAssign::FPExt:
3427     case CCValAssign::Trunc:
3428       llvm_unreachable("Unexpected loc info!");
3429     case CCValAssign::Indirect:
3430       // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
3431       // support this.
3432       return false;
3433     }
3434
3435     if (VA.isRegLoc()) {
3436       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3437               TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
3438       OutRegs.push_back(VA.getLocReg());
3439     } else {
3440       assert(VA.isMemLoc() && "Unknown value location!");
3441
3442       // Don't emit stores for undef values.
3443       if (isa<UndefValue>(ArgVal))
3444         continue;
3445
3446       unsigned LocMemOffset = VA.getLocMemOffset();
3447       X86AddressMode AM;
3448       AM.Base.Reg = RegInfo->getStackRegister();
3449       AM.Disp = LocMemOffset;
3450       ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
3451       Align Alignment = DL.getABITypeAlign(ArgVal->getType());
3452       MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
3453           MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
3454           MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
3455       if (Flags.isByVal()) {
3456         X86AddressMode SrcAM;
3457         SrcAM.Base.Reg = ArgReg;
3458         if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
3459           return false;
3460       } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
3461         // If this is a really simple value, emit this with the Value* version
3462         // of X86FastEmitStore.  If it isn't simple, we don't want to do this,
3463         // as it can cause us to reevaluate the argument.
3464         if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
3465           return false;
3466       } else {
3467         if (!X86FastEmitStore(ArgVT, ArgReg, AM, MMO))
3468           return false;
3469       }
3470     }
3471   }
3472
3473   // ELF / PIC requires GOT in the EBX register before function calls via PLT
3474   // GOT pointer.
3475   if (Subtarget->isPICStyleGOT()) {
3476     unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
3477     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3478             TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
3479   }
3480
3481   if (Is64Bit && IsVarArg && !IsWin64) {
3482     // From AMD64 ABI document:
3483     // For calls that may call functions that use varargs or stdargs
3484     // (prototype-less calls or calls to functions containing ellipsis (...) in
3485     // the declaration) %al is used as hidden argument to specify the number
3486     // of SSE registers used. The contents of %al do not need to match exactly
3487     // the number of registers, but must be an ubound on the number of SSE
3488     // registers used and is in the range 0 - 8 inclusive.
3489
3490     // Count the number of XMM registers allocated.
3491     static const MCPhysReg XMMArgRegs[] = {
3492       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3493       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3494     };
3495     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3496     assert((Subtarget->hasSSE1() || !NumXMMRegs)
3497            && "SSE registers cannot be used when SSE is disabled");
3498     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV8ri),
3499             X86::AL).addImm(NumXMMRegs);
3500   }
3501
3502   // Materialize callee address in a register. FIXME: GV address can be
3503   // handled with a CALLpcrel32 instead.
3504   X86AddressMode CalleeAM;
3505   if (!X86SelectCallAddress(Callee, CalleeAM))
3506     return false;
3507
3508   unsigned CalleeOp = 0;
3509   const GlobalValue *GV = nullptr;
3510   if (CalleeAM.GV != nullptr) {
3511     GV = CalleeAM.GV;
3512   } else if (CalleeAM.Base.Reg != 0) {
3513     CalleeOp = CalleeAM.Base.Reg;
3514   } else
3515     return false;
3516
3517   // Issue the call.
3518   MachineInstrBuilder MIB;
3519   if (CalleeOp) {
3520     // Register-indirect call.
3521     unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
3522     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc))
3523       .addReg(CalleeOp);
3524   } else {
3525     // Direct call.
3526     assert(GV && "Not a direct call");
3527     // See if we need any target-specific flags on the GV operand.
3528     unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV);
3529     if (OpFlags == X86II::MO_PLT && !Is64Bit &&
3530         TM.getRelocationModel() == Reloc::Static && isa<Function>(GV) &&
3531         cast<Function>(GV)->isIntrinsic())
3532       OpFlags = X86II::MO_NO_FLAG;
3533
3534     // This will be a direct call, or an indirect call through memory for
3535     // NonLazyBind calls or dllimport calls.
3536     bool NeedLoad = OpFlags == X86II::MO_DLLIMPORT ||
3537                     OpFlags == X86II::MO_GOTPCREL ||
3538                     OpFlags == X86II::MO_GOTPCREL_NORELAX ||
3539                     OpFlags == X86II::MO_COFFSTUB;
3540     unsigned CallOpc = NeedLoad
3541                            ? (Is64Bit ? X86::CALL64m : X86::CALL32m)
3542                            : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);
3543
3544     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc));
3545     if (NeedLoad)
3546       MIB.addReg(Is64Bit ? X86::RIP : X86::NoRegister).addImm(1).addReg(0);
3547     if (Symbol)
3548       MIB.addSym(Symbol, OpFlags);
3549     else
3550       MIB.addGlobalAddress(GV, 0, OpFlags);
3551     if (NeedLoad)
3552       MIB.addReg(0);
3553   }
3554
3555   // Add a register mask operand representing the call-preserved registers.
3556   // Proper defs for return values will be added by setPhysRegsDeadExcept().
3557   MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
3558
3559   // Add an implicit use GOT pointer in EBX.
3560   if (Subtarget->isPICStyleGOT())
3561     MIB.addReg(X86::EBX, RegState::Implicit);
3562
3563   if (Is64Bit && IsVarArg && !IsWin64)
3564     MIB.addReg(X86::AL, RegState::Implicit);
3565
3566   // Add implicit physical register uses to the call.
3567   for (auto Reg : OutRegs)
3568     MIB.addReg(Reg, RegState::Implicit);
3569
3570   // Issue CALLSEQ_END
3571   unsigned NumBytesForCalleeToPop =
3572       X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
3573                        TM.Options.GuaranteedTailCallOpt)
3574           ? NumBytes // Callee pops everything.
3575           : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CB);
3576   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
3577   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(AdjStackUp))
3578     .addImm(NumBytes).addImm(NumBytesForCalleeToPop);
3579
3580   // Now handle call return values.
3581   SmallVector<CCValAssign, 16> RVLocs;
3582   CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
3583                     CLI.RetTy->getContext());
3584   CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
3585
3586   // Copy all of the result registers out of their specified physreg.
3587   Register ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
3588   for (unsigned i = 0; i != RVLocs.size(); ++i) {
3589     CCValAssign &VA = RVLocs[i];
3590     EVT CopyVT = VA.getValVT();
3591     unsigned CopyReg = ResultReg + i;
3592     Register SrcReg = VA.getLocReg();
3593
3594     // If this is x86-64, and we disabled SSE, we can't return FP values
3595     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
3596         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
3597       report_fatal_error("SSE register return with SSE disabled");
3598     }
3599
3600     // If we prefer to use the value in xmm registers, copy it out as f80 and
3601     // use a truncate to move it from fp stack reg to xmm reg.
3602     if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) &&
3603         isScalarFPTypeInSSEReg(VA.getValVT())) {
3604       CopyVT = MVT::f80;
3605       CopyReg = createResultReg(&X86::RFP80RegClass);
3606     }
3607
3608     // Copy out the result.
3609     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3610             TII.get(TargetOpcode::COPY), CopyReg).addReg(SrcReg);
3611     InRegs.push_back(VA.getLocReg());
3612
3613     // Round the f80 to the right size, which also moves it to the appropriate
3614     // xmm register. This is accomplished by storing the f80 value in memory
3615     // and then loading it back.
3616     if (CopyVT != VA.getValVT()) {
3617       EVT ResVT = VA.getValVT();
3618       unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
3619       unsigned MemSize = ResVT.getSizeInBits()/8;
3620       int FI = MFI.CreateStackObject(MemSize, Align(MemSize), false);
3621       addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3622                                 TII.get(Opc)), FI)
3623         .addReg(CopyReg);
3624       Opc = ResVT == MVT::f32 ? X86::MOVSSrm_alt : X86::MOVSDrm_alt;
3625       addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3626                                 TII.get(Opc), ResultReg + i), FI);
3627     }
3628   }
3629
3630   CLI.ResultReg = ResultReg;
3631   CLI.NumResultRegs = RVLocs.size();
3632   CLI.Call = MIB;
3633
3634   return true;
3635 }
3636
3637 bool
3638 X86FastISel::fastSelectInstruction(const Instruction *I)  {
3639   switch (I->getOpcode()) {
3640   default: break;
3641   case Instruction::Load:
3642     return X86SelectLoad(I);
3643   case Instruction::Store:
3644     return X86SelectStore(I);
3645   case Instruction::Ret:
3646     return X86SelectRet(I);
3647   case Instruction::ICmp:
3648   case Instruction::FCmp:
3649     return X86SelectCmp(I);
3650   case Instruction::ZExt:
3651     return X86SelectZExt(I);
3652   case Instruction::SExt:
3653     return X86SelectSExt(I);
3654   case Instruction::Br:
3655     return X86SelectBranch(I);
3656   case Instruction::LShr:
3657   case Instruction::AShr:
3658   case Instruction::Shl:
3659     return X86SelectShift(I);
3660   case Instruction::SDiv:
3661   case Instruction::UDiv:
3662   case Instruction::SRem:
3663   case Instruction::URem:
3664     return X86SelectDivRem(I);
3665   case Instruction::Select:
3666     return X86SelectSelect(I);
3667   case Instruction::Trunc:
3668     return X86SelectTrunc(I);
3669   case Instruction::FPExt:
3670     return X86SelectFPExt(I);
3671   case Instruction::FPTrunc:
3672     return X86SelectFPTrunc(I);
3673   case Instruction::SIToFP:
3674     return X86SelectSIToFP(I);
3675   case Instruction::UIToFP:
3676     return X86SelectUIToFP(I);
3677   case Instruction::IntToPtr: // Deliberate fall-through.
3678   case Instruction::PtrToInt: {
3679     EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
3680     EVT DstVT = TLI.getValueType(DL, I->getType());
3681     if (DstVT.bitsGT(SrcVT))
3682       return X86SelectZExt(I);
3683     if (DstVT.bitsLT(SrcVT))
3684       return X86SelectTrunc(I);
3685     Register Reg = getRegForValue(I->getOperand(0));
3686     if (Reg == 0) return false;
3687     updateValueMap(I, Reg);
3688     return true;
3689   }
3690   case Instruction::BitCast: {
3691     // Select SSE2/AVX bitcasts between 128/256/512 bit vector types.
3692     if (!Subtarget->hasSSE2())
3693       return false;
3694
3695     MVT SrcVT, DstVT;
3696     if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT) ||
3697         !isTypeLegal(I->getType(), DstVT))
3698       return false;
3699
3700     // Only allow vectors that use xmm/ymm/zmm.
3701     if (!SrcVT.isVector() || !DstVT.isVector() ||
3702         SrcVT.getVectorElementType() == MVT::i1 ||
3703         DstVT.getVectorElementType() == MVT::i1)
3704       return false;
3705
3706     Register Reg = getRegForValue(I->getOperand(0));
3707     if (!Reg)
3708       return false;
3709
3710     // Emit a reg-reg copy so we don't propagate cached known bits information
3711     // with the wrong VT if we fall out of fast isel after selecting this.
3712     const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT);
3713     Register ResultReg = createResultReg(DstClass);
3714     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3715               TII.get(TargetOpcode::COPY), ResultReg).addReg(Reg);
3716
3717     updateValueMap(I, ResultReg);
3718     return true;
3719   }
3720   }
3721
3722   return false;
3723 }
3724
3725 unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
3726   if (VT > MVT::i64)
3727     return 0;
3728
3729   uint64_t Imm = CI->getZExtValue();
3730   if (Imm == 0) {
3731     Register SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
3732     switch (VT.SimpleTy) {
3733     default: llvm_unreachable("Unexpected value type");
3734     case MVT::i1:
3735     case MVT::i8:
3736       return fastEmitInst_extractsubreg(MVT::i8, SrcReg, X86::sub_8bit);
3737     case MVT::i16:
3738       return fastEmitInst_extractsubreg(MVT::i16, SrcReg, X86::sub_16bit);
3739     case MVT::i32:
3740       return SrcReg;
3741     case MVT::i64: {
3742       Register ResultReg = createResultReg(&X86::GR64RegClass);
3743       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3744               TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
3745         .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
3746       return ResultReg;
3747     }
3748     }
3749   }
3750
3751   unsigned Opc = 0;
3752   switch (VT.SimpleTy) {
3753   default: llvm_unreachable("Unexpected value type");
3754   case MVT::i1:
3755     VT = MVT::i8;
3756     [[fallthrough]];
3757   case MVT::i8:  Opc = X86::MOV8ri;  break;
3758   case MVT::i16: Opc = X86::MOV16ri; break;
3759   case MVT::i32: Opc = X86::MOV32ri; break;
3760   case MVT::i64: {
3761     if (isUInt<32>(Imm))
3762       Opc = X86::MOV32ri64;
3763     else if (isInt<32>(Imm))
3764       Opc = X86::MOV64ri32;
3765     else
3766       Opc = X86::MOV64ri;
3767     break;
3768   }
3769   }
3770   return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
3771 }
3772
3773 unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
3774   if (CFP->isNullValue())
3775     return fastMaterializeFloatZero(CFP);
3776
3777   // Can't handle alternate code models yet.
3778   CodeModel::Model CM = TM.getCodeModel();
3779   if (CM != CodeModel::Small && CM != CodeModel::Medium &&
3780       CM != CodeModel::Large)
3781     return 0;
3782
3783   // Get opcode and regclass of the output for the given load instruction.
3784   unsigned Opc = 0;
3785   bool HasSSE1 = Subtarget->hasSSE1();
3786   bool HasSSE2 = Subtarget->hasSSE2();
3787   bool HasAVX = Subtarget->hasAVX();
3788   bool HasAVX512 = Subtarget->hasAVX512();
3789   switch (VT.SimpleTy) {
3790   default: return 0;
3791   case MVT::f32:
3792     Opc = HasAVX512 ? X86::VMOVSSZrm_alt
3793           : HasAVX  ? X86::VMOVSSrm_alt
3794           : HasSSE1 ? X86::MOVSSrm_alt
3795                     : X86::LD_Fp32m;
3796     break;
3797   case MVT::f64:
3798     Opc = HasAVX512 ? X86::VMOVSDZrm_alt
3799           : HasAVX  ? X86::VMOVSDrm_alt
3800           : HasSSE2 ? X86::MOVSDrm_alt
3801                     : X86::LD_Fp64m;
3802     break;
3803   case MVT::f80:
3804     // No f80 support yet.
3805     return 0;
3806   }
3807
3808   // MachineConstantPool wants an explicit alignment.
3809   Align Alignment = DL.getPrefTypeAlign(CFP->getType());
3810
3811   // x86-32 PIC requires a PIC base register for constant pools.
3812   unsigned PICBase = 0;
3813   unsigned char OpFlag = Subtarget->classifyLocalReference(nullptr);
3814   if (OpFlag == X86II::MO_PIC_BASE_OFFSET)
3815     PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
3816   else if (OpFlag == X86II::MO_GOTOFF)
3817     PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
3818   else if (Subtarget->is64Bit() && TM.getCodeModel() != CodeModel::Large)
3819     PICBase = X86::RIP;
3820
3821   // Create the load from the constant pool.
3822   unsigned CPI = MCP.getConstantPoolIndex(CFP, Alignment);
3823   Register ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
3824
3825   // Large code model only applies to 64-bit mode.
3826   if (Subtarget->is64Bit() && CM == CodeModel::Large) {
3827     Register AddrReg = createResultReg(&X86::GR64RegClass);
3828     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV64ri),
3829             AddrReg)
3830       .addConstantPoolIndex(CPI, 0, OpFlag);
3831     MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3832                                       TII.get(Opc), ResultReg);
3833     addRegReg(MIB, AddrReg, false, PICBase, false);
3834     MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
3835         MachinePointerInfo::getConstantPool(*FuncInfo.MF),
3836         MachineMemOperand::MOLoad, DL.getPointerSize(), Alignment);
3837     MIB->addMemOperand(*FuncInfo.MF, MMO);
3838     return ResultReg;
3839   }
3840
3841   addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3842                                    TII.get(Opc), ResultReg),
3843                            CPI, PICBase, OpFlag);
3844   return ResultReg;
3845 }
3846
3847 unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
3848   // Can't handle large GlobalValues yet.
3849   if (TM.getCodeModel() != CodeModel::Small &&
3850       TM.getCodeModel() != CodeModel::Medium)
3851     return 0;
3852   if (TM.isLargeGlobalValue(GV))
3853     return 0;
3854
3855   // Materialize addresses with LEA/MOV instructions.
3856   X86AddressMode AM;
3857   if (X86SelectAddress(GV, AM)) {
3858     // If the expression is just a basereg, then we're done, otherwise we need
3859     // to emit an LEA.
3860     if (AM.BaseType == X86AddressMode::RegBase &&
3861         AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
3862       return AM.Base.Reg;
3863
3864     Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
3865     if (TM.getRelocationModel() == Reloc::Static &&
3866         TLI.getPointerTy(DL) == MVT::i64) {
3867       // The displacement code could be more than 32 bits away so we need to use
3868       // an instruction with a 64 bit immediate
3869       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV64ri),
3870               ResultReg)
3871         .addGlobalAddress(GV);
3872     } else {
3873       unsigned Opc =
3874           TLI.getPointerTy(DL) == MVT::i32
3875               ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
3876               : X86::LEA64r;
3877       addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3878                              TII.get(Opc), ResultReg), AM);
3879     }
3880     return ResultReg;
3881   }
3882   return 0;
3883 }
3884
3885 unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
3886   EVT CEVT = TLI.getValueType(DL, C->getType(), true);
3887
3888   // Only handle simple types.
3889   if (!CEVT.isSimple())
3890     return 0;
3891   MVT VT = CEVT.getSimpleVT();
3892
3893   if (const auto *CI = dyn_cast<ConstantInt>(C))
3894     return X86MaterializeInt(CI, VT);
3895   if (const auto *CFP = dyn_cast<ConstantFP>(C))
3896     return X86MaterializeFP(CFP, VT);
3897   if (const auto *GV = dyn_cast<GlobalValue>(C))
3898     return X86MaterializeGV(GV, VT);
3899   if (isa<UndefValue>(C)) {
3900     unsigned Opc = 0;
3901     switch (VT.SimpleTy) {
3902     default:
3903       break;
3904     case MVT::f32:
3905       if (!Subtarget->hasSSE1())
3906         Opc = X86::LD_Fp032;
3907       break;
3908     case MVT::f64:
3909       if (!Subtarget->hasSSE2())
3910         Opc = X86::LD_Fp064;
3911       break;
3912     case MVT::f80:
3913       Opc = X86::LD_Fp080;
3914       break;
3915     }
3916
3917     if (Opc) {
3918       Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
3919       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc),
3920               ResultReg);
3921       return ResultReg;
3922     }
3923   }
3924
3925   return 0;
3926 }
3927
3928 unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
3929   // Fail on dynamic allocas. At this point, getRegForValue has already
3930   // checked its CSE maps, so if we're here trying to handle a dynamic
3931   // alloca, we're not going to succeed. X86SelectAddress has a
3932   // check for dynamic allocas, because it's called directly from
3933   // various places, but targetMaterializeAlloca also needs a check
3934   // in order to avoid recursion between getRegForValue,
3935   // X86SelectAddrss, and targetMaterializeAlloca.
3936   if (!FuncInfo.StaticAllocaMap.count(C))
3937     return 0;
3938   assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
3939
3940   X86AddressMode AM;
3941   if (!X86SelectAddress(C, AM))
3942     return 0;
3943   unsigned Opc =
3944       TLI.getPointerTy(DL) == MVT::i32
3945           ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
3946           : X86::LEA64r;
3947   const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
3948   Register ResultReg = createResultReg(RC);
3949   addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
3950                          TII.get(Opc), ResultReg), AM);
3951   return ResultReg;
3952 }
3953
3954 unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
3955   MVT VT;
3956   if (!isTypeLegal(CF->getType(), VT))
3957     return 0;
3958
3959   // Get opcode and regclass for the given zero.
3960   bool HasSSE1 = Subtarget->hasSSE1();
3961   bool HasSSE2 = Subtarget->hasSSE2();
3962   bool HasAVX512 = Subtarget->hasAVX512();
3963   unsigned Opc = 0;
3964   switch (VT.SimpleTy) {
3965   default: return 0;
3966   case MVT::f16:
3967     Opc = HasAVX512 ? X86::AVX512_FsFLD0SH : X86::FsFLD0SH;
3968     break;
3969   case MVT::f32:
3970     Opc = HasAVX512 ? X86::AVX512_FsFLD0SS
3971           : HasSSE1 ? X86::FsFLD0SS
3972                     : X86::LD_Fp032;
3973     break;
3974   case MVT::f64:
3975     Opc = HasAVX512 ? X86::AVX512_FsFLD0SD
3976           : HasSSE2 ? X86::FsFLD0SD
3977                     : X86::LD_Fp064;
3978     break;
3979   case MVT::f80:
3980     // No f80 support yet.
3981     return 0;
3982   }
3983
3984   Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
3985   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg);
3986   return ResultReg;
3987 }
3988
3989
3990 bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
3991                                       const LoadInst *LI) {
3992   const Value *Ptr = LI->getPointerOperand();
3993   X86AddressMode AM;
3994   if (!X86SelectAddress(Ptr, AM))
3995     return false;
3996
3997   const X86InstrInfo &XII = (const X86InstrInfo &)TII;
3998
3999   unsigned Size = DL.getTypeAllocSize(LI->getType());
4000
4001   SmallVector<MachineOperand, 8> AddrOps;
4002   AM.getFullAddress(AddrOps);
4003
4004   MachineInstr *Result = XII.foldMemoryOperandImpl(
4005       *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, LI->getAlign(),
4006       /*AllowCommute=*/true);
4007   if (!Result)
4008     return false;
4009
4010   // The index register could be in the wrong register class.  Unfortunately,
4011   // foldMemoryOperandImpl could have commuted the instruction so its not enough
4012   // to just look at OpNo + the offset to the index reg.  We actually need to
4013   // scan the instruction to find the index reg and see if its the correct reg
4014   // class.
4015   unsigned OperandNo = 0;
4016   for (MachineInstr::mop_iterator I = Result->operands_begin(),
4017        E = Result->operands_end(); I != E; ++I, ++OperandNo) {
4018     MachineOperand &MO = *I;
4019     if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
4020       continue;
4021     // Found the index reg, now try to rewrite it.
4022     Register IndexReg = constrainOperandRegClass(Result->getDesc(),
4023                                                  MO.getReg(), OperandNo);
4024     if (IndexReg == MO.getReg())
4025       continue;
4026     MO.setReg(IndexReg);
4027   }
4028
4029   Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
4030   Result->cloneInstrSymbols(*FuncInfo.MF, *MI);
4031   MachineBasicBlock::iterator I(MI);
4032   removeDeadCode(I, std::next(I));
4033   return true;
4034 }
4035
4036 unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
4037                                         const TargetRegisterClass *RC,
4038                                         unsigned Op0, unsigned Op1,
4039                                         unsigned Op2, unsigned Op3) {
4040   const MCInstrDesc &II = TII.get(MachineInstOpcode);
4041
4042   Register ResultReg = createResultReg(RC);
4043   Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
4044   Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
4045   Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
4046   Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3);
4047
4048   if (II.getNumDefs() >= 1)
4049     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, ResultReg)
4050         .addReg(Op0)
4051         .addReg(Op1)
4052         .addReg(Op2)
4053         .addReg(Op3);
4054   else {
4055     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
4056         .addReg(Op0)
4057         .addReg(Op1)
4058         .addReg(Op2)
4059         .addReg(Op3);
4060     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
4061             ResultReg)
4062         .addReg(II.implicit_defs()[0]);
4063   }
4064   return ResultReg;
4065 }
4066
4067
4068 namespace llvm {
4069   FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
4070                                 const TargetLibraryInfo *libInfo) {
4071     return new X86FastISel(funcInfo, libInfo);
4072   }
4073 }