1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
11 //===----------------------------------------------------------------------===//
13 #include "AArch64InstrInfo.h"
14 #include "AArch64ExpandImm.h"
15 #include "AArch64MachineFunctionInfo.h"
16 #include "AArch64PointerAuth.h"
17 #include "AArch64Subtarget.h"
18 #include "MCTargetDesc/AArch64AddressingModes.h"
19 #include "MCTargetDesc/AArch64MCTargetDesc.h"
20 #include "Utils/AArch64BaseInfo.h"
21 #include "llvm/ADT/ArrayRef.h"
22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/CodeGen/LivePhysRegs.h"
25 #include "llvm/CodeGen/MachineBasicBlock.h"
26 #include "llvm/CodeGen/MachineCombinerPattern.h"
27 #include "llvm/CodeGen/MachineFrameInfo.h"
28 #include "llvm/CodeGen/MachineFunction.h"
29 #include "llvm/CodeGen/MachineInstr.h"
30 #include "llvm/CodeGen/MachineInstrBuilder.h"
31 #include "llvm/CodeGen/MachineMemOperand.h"
32 #include "llvm/CodeGen/MachineModuleInfo.h"
33 #include "llvm/CodeGen/MachineOperand.h"
34 #include "llvm/CodeGen/MachineRegisterInfo.h"
35 #include "llvm/CodeGen/RegisterScavenging.h"
36 #include "llvm/CodeGen/StackMaps.h"
37 #include "llvm/CodeGen/TargetRegisterInfo.h"
38 #include "llvm/CodeGen/TargetSubtargetInfo.h"
39 #include "llvm/IR/DebugInfoMetadata.h"
40 #include "llvm/IR/DebugLoc.h"
41 #include "llvm/IR/GlobalValue.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/MC/MCAsmInfo.h"
44 #include "llvm/MC/MCInst.h"
45 #include "llvm/MC/MCInstBuilder.h"
46 #include "llvm/MC/MCInstrDesc.h"
47 #include "llvm/Support/Casting.h"
48 #include "llvm/Support/CodeGen.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Support/LEB128.h"
52 #include "llvm/Support/MathExtras.h"
53 #include "llvm/Target/TargetMachine.h"
54 #include "llvm/Target/TargetOptions.h"
55 #include <cassert>
56 #include <cstdint>
57 #include <iterator>
58 #include <utility>
60 using namespace llvm;
62 #define GET_INSTRINFO_CTOR_DTOR
63 #include "AArch64GenInstrInfo.inc"
65 static cl::opt<unsigned> TBZDisplacementBits(
66 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
67 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
69 static cl::opt<unsigned> CBZDisplacementBits(
70 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
71 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
73 static cl::opt<unsigned>
74 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
75 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
77 static cl::opt<unsigned>
78 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
79 cl::desc("Restrict range of B instructions (DEBUG)"));
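// These debug options default to the architectural immediate widths: TB(N)Z
// uses a 14-bit, CB(N)Z and B.cc a 19-bit, and B a 26-bit signed offset, all
// scaled by the 4-byte instruction size. For illustration, that gives
// approximate ranges of +/-32KiB, +/-1MiB and +/-128MiB respectively (see
// isBranchOffsetInRange below, which divides the byte offset by 4).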
81 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
82 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
83 AArch64::CATCHRET),
84 RI(STI.getTargetTriple()), Subtarget(STI) {}
86 /// GetInstSize - Return the number of bytes of code the specified
87 /// instruction may occupy. This returns the maximum number of bytes.
88 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
89 const MachineBasicBlock &MBB = *MI.getParent();
90 const MachineFunction *MF = MBB.getParent();
91 const Function &F = MF->getFunction();
92 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
95 auto Op = MI.getOpcode();
96 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
97 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
100 // Meta-instructions emit no code.
101 if (MI.isMetaInstruction())
102 return 0;
104 // FIXME: We currently only handle pseudoinstructions that don't get expanded
105 // before the assembly printer.
106 unsigned NumBytes = 0;
107 const MCInstrDesc &Desc = MI.getDesc();
109 if (!MI.isBundle() && isTailCallReturnInst(MI)) {
110 NumBytes = Desc.getSize() ? Desc.getSize() : 4;
112 const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
113 if (!MFI->shouldSignReturnAddress(MF))
114 return NumBytes;
116 const auto &STI = MF->getSubtarget<AArch64Subtarget>();
117 auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
118 NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
119 return NumBytes;
122 // The size should preferably be set in
123 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
124 // The specific cases below handle instructions of variable size.
125 switch (Desc.getOpcode()) {
126 default:
127 if (Desc.getSize())
128 return Desc.getSize();
130 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
131 // with fixed constant size but not specified in .td file) is a normal
132 // 4-byte insn.
133 NumBytes = 4;
134 break;
135 case TargetOpcode::STACKMAP:
136 // The upper bound for a stackmap intrinsic is the full length of its shadow
137 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
138 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
139 break;
140 case TargetOpcode::PATCHPOINT:
141 // The size of the patchpoint intrinsic is the number of bytes requested
142 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
143 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
144 break;
145 case TargetOpcode::STATEPOINT:
146 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
147 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
148 // No patch bytes means a normal call inst is emitted
149 if (NumBytes == 0)
150 NumBytes = 4;
151 break;
152 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
153 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
154 // instructions are expanded to the specified number of NOPs. Otherwise,
155 // they are expanded to 36-byte XRay sleds.
156 NumBytes =
157 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
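// Note: with the default of 9 instructions this is 36 bytes, matching the
// XRay sled size handled below.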
158 break;
159 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
160 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
161 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
162 NumBytes = 36;
163 break;
164 case TargetOpcode::PATCHABLE_EVENT_CALL:
165 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
166 NumBytes = 24;
167 break;
169 case AArch64::SPACE:
170 NumBytes = MI.getOperand(1).getImm();
171 break;
172 case TargetOpcode::BUNDLE:
173 NumBytes = getInstBundleLength(MI);
174 break;
177 return NumBytes;
180 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
181 unsigned Size = 0;
182 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
183 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
184 while (++I != E && I->isInsideBundle()) {
185 assert(!I->isBundle() && "No nested bundle!");
186 Size += getInstSizeInBytes(*I);
188 return Size;
191 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
192 SmallVectorImpl<MachineOperand> &Cond) {
193 // Block ends with fall-through condbranch.
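// The Cond vector encodes the branch as follows (mirrored by
// instantiateCondBranch and insertSelect below):
//   Bcc:     Cond = { condition code }
//   CB(N)Z:  Cond = { -1, opcode, register }
//   TB(N)Z:  Cond = { -1, opcode, register, bit number }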
194 switch (LastInst->getOpcode()) {
195 default:
196 llvm_unreachable("Unknown branch instruction?");
197 case AArch64::Bcc:
198 Target = LastInst->getOperand(1).getMBB();
199 Cond.push_back(LastInst->getOperand(0));
200 break;
201 case AArch64::CBZW:
202 case AArch64::CBZX:
203 case AArch64::CBNZW:
204 case AArch64::CBNZX:
205 Target = LastInst->getOperand(1).getMBB();
206 Cond.push_back(MachineOperand::CreateImm(-1));
207 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
208 Cond.push_back(LastInst->getOperand(0));
209 break;
210 case AArch64::TBZW:
211 case AArch64::TBZX:
212 case AArch64::TBNZW:
213 case AArch64::TBNZX:
214 Target = LastInst->getOperand(2).getMBB();
215 Cond.push_back(MachineOperand::CreateImm(-1));
216 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
217 Cond.push_back(LastInst->getOperand(0));
218 Cond.push_back(LastInst->getOperand(1));
222 static unsigned getBranchDisplacementBits(unsigned Opc) {
223 switch (Opc) {
224 default:
225 llvm_unreachable("unexpected opcode!");
226 case AArch64::B:
227 return BDisplacementBits;
228 case AArch64::TBNZW:
229 case AArch64::TBZW:
230 case AArch64::TBNZX:
231 case AArch64::TBZX:
232 return TBZDisplacementBits;
233 case AArch64::CBNZW:
234 case AArch64::CBZW:
235 case AArch64::CBNZX:
236 case AArch64::CBZX:
237 return CBZDisplacementBits;
238 case AArch64::Bcc:
239 return BCCDisplacementBits;
243 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
244 int64_t BrOffset) const {
245 unsigned Bits = getBranchDisplacementBits(BranchOp);
246 assert(Bits >= 3 && "max branch displacement must be enough to jump"
247 "over conditional branch expansion");
248 return isIntN(Bits, BrOffset / 4);
251 MachineBasicBlock *
252 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
253 switch (MI.getOpcode()) {
254 default:
255 llvm_unreachable("unexpected opcode!");
256 case AArch64::B:
257 return MI.getOperand(0).getMBB();
258 case AArch64::TBZW:
259 case AArch64::TBNZW:
260 case AArch64::TBZX:
261 case AArch64::TBNZX:
262 return MI.getOperand(2).getMBB();
263 case AArch64::CBZW:
264 case AArch64::CBNZW:
265 case AArch64::CBZX:
266 case AArch64::CBNZX:
267 case AArch64::Bcc:
268 return MI.getOperand(1).getMBB();
272 void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
273 MachineBasicBlock &NewDestBB,
274 MachineBasicBlock &RestoreBB,
275 const DebugLoc &DL,
276 int64_t BrOffset,
277 RegScavenger *RS) const {
278 assert(RS && "RegScavenger required for long branching");
279 assert(MBB.empty() &&
280 "new block should be inserted for expanding unconditional branch");
281 assert(MBB.pred_size() == 1);
282 assert(RestoreBB.empty() &&
283 "restore block should be inserted for restoring clobbered registers");
285 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
286 // Offsets outside of the signed 33-bit range are not supported for ADRP +
287 // ADD.
288 if (!isInt<33>(BrOffset))
289 report_fatal_error(
290 "Branch offsets outside of the signed 33-bit range not supported");
292 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
293 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
294 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
295 .addReg(Reg)
296 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
297 .addImm(0);
298 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
301 RS->enterBasicBlockEnd(MBB);
302 // If X16 is unused, we can rely on the linker to insert a range extension
303 // thunk if NewDestBB is out of range of a single B instruction.
304 constexpr Register Reg = AArch64::X16;
305 if (!RS->isRegUsed(Reg)) {
306 insertUnconditionalBranch(MBB, &NewDestBB, DL);
307 RS->setRegUsed(Reg);
308 return;
311 // If there's a free register and it's worth inflating the code size,
312 // manually insert the indirect branch.
313 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
314 if (Scavenged != AArch64::NoRegister &&
315 MBB.getSectionID() == MBBSectionID::ColdSectionID) {
316 buildIndirectBranch(Scavenged, NewDestBB);
317 RS->setRegUsed(Scavenged);
318 return;
321 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
322 // with red zones.
323 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
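// hasRedZone() is a std::optional; if the red-zone status has not been
// computed yet, value_or(true) conservatively assumes a red zone is present.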
324 if (!AFI || AFI->hasRedZone().value_or(true))
325 report_fatal_error(
326 "Unable to insert indirect branch inside function that has red zone");
328 // Otherwise, spill X16 and defer range extension to the linker.
329 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
330 .addReg(AArch64::SP, RegState::Define)
331 .addReg(Reg)
332 .addReg(AArch64::SP)
333 .addImm(-16);
335 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
337 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
338 .addReg(AArch64::SP, RegState::Define)
339 .addReg(Reg, RegState::Define)
340 .addReg(AArch64::SP)
341 .addImm(16);
344 // Branch analysis.
345 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
346 MachineBasicBlock *&TBB,
347 MachineBasicBlock *&FBB,
348 SmallVectorImpl<MachineOperand> &Cond,
349 bool AllowModify) const {
350 // If the block has no terminators, it just falls into the block after it.
351 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
352 if (I == MBB.end())
353 return false;
355 // Skip over SpeculationBarrierEndBB terminators
356 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
357 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
358 --I;
361 if (!isUnpredicatedTerminator(*I))
362 return false;
364 // Get the last instruction in the block.
365 MachineInstr *LastInst = &*I;
367 // If there is only one terminator instruction, process it.
368 unsigned LastOpc = LastInst->getOpcode();
369 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
370 if (isUncondBranchOpcode(LastOpc)) {
371 TBB = LastInst->getOperand(0).getMBB();
372 return false;
374 if (isCondBranchOpcode(LastOpc)) {
375 // Block ends with fall-through condbranch.
376 parseCondBranch(LastInst, TBB, Cond);
377 return false;
379 return true; // Can't handle indirect branch.
382 // Get the instruction before it if it is a terminator.
383 MachineInstr *SecondLastInst = &*I;
384 unsigned SecondLastOpc = SecondLastInst->getOpcode();
386 // If AllowModify is true and the block ends with two or more unconditional
387 // branches, delete all but the first unconditional branch.
388 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
389 while (isUncondBranchOpcode(SecondLastOpc)) {
390 LastInst->eraseFromParent();
391 LastInst = SecondLastInst;
392 LastOpc = LastInst->getOpcode();
393 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
394 // Return now; the only terminator is an unconditional branch.
395 TBB = LastInst->getOperand(0).getMBB();
396 return false;
398 SecondLastInst = &*I;
399 SecondLastOpc = SecondLastInst->getOpcode();
403 // If we're allowed to modify and the block ends in an unconditional branch
404 // which could simply fall through, remove the branch. (Note: This case only
405 // matters when we can't understand the whole sequence, otherwise it's also
406 // handled by BranchFolding.cpp.)
407 if (AllowModify && isUncondBranchOpcode(LastOpc) &&
408 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
409 LastInst->eraseFromParent();
410 LastInst = SecondLastInst;
411 LastOpc = LastInst->getOpcode();
412 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
413 assert(!isUncondBranchOpcode(LastOpc) &&
414 "unreachable unconditional branches removed above");
416 if (isCondBranchOpcode(LastOpc)) {
417 // Block ends with fall-through condbranch.
418 parseCondBranch(LastInst, TBB, Cond);
419 return false;
421 return true; // Can't handle indirect branch.
423 SecondLastInst = &*I;
424 SecondLastOpc = SecondLastInst->getOpcode();
427 // If there are three terminators, we don't know what sort of block this is.
428 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
429 return true;
431 // If the block ends with a B and a Bcc, handle it.
432 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
433 parseCondBranch(SecondLastInst, TBB, Cond);
434 FBB = LastInst->getOperand(0).getMBB();
435 return false;
438 // If the block ends with two unconditional branches, handle it. The second
439 // one is not executed, so remove it.
440 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
441 TBB = SecondLastInst->getOperand(0).getMBB();
442 I = LastInst;
443 if (AllowModify)
444 I->eraseFromParent();
445 return false;
448 // ...likewise if it ends with an indirect branch followed by an unconditional
449 // branch.
450 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
451 I = LastInst;
452 if (AllowModify)
453 I->eraseFromParent();
454 return true;
457 // Otherwise, can't handle this.
458 return true;
461 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
462 MachineBranchPredicate &MBP,
463 bool AllowModify) const {
464 // For the moment, handle only a block which ends with a cb(n)zx followed by
465 // a fallthrough. Why this? Because it is a common form.
466 // TODO: Should we handle b.cc?
468 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
469 if (I == MBB.end())
470 return true;
472 // Skip over SpeculationBarrierEndBB terminators
473 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
474 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
475 --I;
478 if (!isUnpredicatedTerminator(*I))
479 return true;
481 // Get the last instruction in the block.
482 MachineInstr *LastInst = &*I;
483 unsigned LastOpc = LastInst->getOpcode();
484 if (!isCondBranchOpcode(LastOpc))
485 return true;
487 switch (LastOpc) {
488 default:
489 return true;
490 case AArch64::CBZW:
491 case AArch64::CBZX:
492 case AArch64::CBNZW:
493 case AArch64::CBNZX:
494 break;
497 MBP.TrueDest = LastInst->getOperand(1).getMBB();
498 assert(MBP.TrueDest && "expected!");
499 MBP.FalseDest = MBB.getNextNode();
501 MBP.ConditionDef = nullptr;
502 MBP.SingleUseCondition = false;
504 MBP.LHS = LastInst->getOperand(0);
505 MBP.RHS = MachineOperand::CreateImm(0);
506 MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
507 : MachineBranchPredicate::PRED_EQ;
508 return false;
511 bool AArch64InstrInfo::reverseBranchCondition(
512 SmallVectorImpl<MachineOperand> &Cond) const {
513 if (Cond[0].getImm() != -1) {
514 // Regular Bcc
515 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
516 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
517 } else {
518 // Folded compare-and-branch
519 switch (Cond[1].getImm()) {
520 default:
521 llvm_unreachable("Unknown conditional branch!");
522 case AArch64::CBZW:
523 Cond[1].setImm(AArch64::CBNZW);
524 break;
525 case AArch64::CBNZW:
526 Cond[1].setImm(AArch64::CBZW);
527 break;
528 case AArch64::CBZX:
529 Cond[1].setImm(AArch64::CBNZX);
530 break;
531 case AArch64::CBNZX:
532 Cond[1].setImm(AArch64::CBZX);
533 break;
534 case AArch64::TBZW:
535 Cond[1].setImm(AArch64::TBNZW);
536 break;
537 case AArch64::TBNZW:
538 Cond[1].setImm(AArch64::TBZW);
539 break;
540 case AArch64::TBZX:
541 Cond[1].setImm(AArch64::TBNZX);
542 break;
543 case AArch64::TBNZX:
544 Cond[1].setImm(AArch64::TBZX);
545 break;
549 return false;
552 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
553 int *BytesRemoved) const {
554 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
555 if (I == MBB.end())
556 return 0;
558 if (!isUncondBranchOpcode(I->getOpcode()) &&
559 !isCondBranchOpcode(I->getOpcode()))
560 return 0;
562 // Remove the branch.
563 I->eraseFromParent();
565 I = MBB.end();
567 if (I == MBB.begin()) {
568 if (BytesRemoved)
569 *BytesRemoved = 4;
570 return 1;
572 --I;
573 if (!isCondBranchOpcode(I->getOpcode())) {
574 if (BytesRemoved)
575 *BytesRemoved = 4;
576 return 1;
579 // Remove the branch.
580 I->eraseFromParent();
581 if (BytesRemoved)
582 *BytesRemoved = 8;
584 return 2;
587 void AArch64InstrInfo::instantiateCondBranch(
588 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
589 ArrayRef<MachineOperand> Cond) const {
590 if (Cond[0].getImm() != -1) {
591 // Regular Bcc
592 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
593 } else {
594 // Folded compare-and-branch
595 // Note that we use .add() instead of .addReg() to preserve the operand flags.
596 const MachineInstrBuilder MIB =
597 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
598 if (Cond.size() > 3)
599 MIB.addImm(Cond[3].getImm());
600 MIB.addMBB(TBB);
604 unsigned AArch64InstrInfo::insertBranch(
605 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
606 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
607 // Shouldn't be a fall through.
608 assert(TBB && "insertBranch must not be told to insert a fallthrough");
610 if (!FBB) {
611 if (Cond.empty()) // Unconditional branch?
612 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
613 else
614 instantiateCondBranch(MBB, DL, TBB, Cond);
616 if (BytesAdded)
617 *BytesAdded = 4;
619 return 1;
622 // Two-way conditional branch.
623 instantiateCondBranch(MBB, DL, TBB, Cond);
624 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
626 if (BytesAdded)
627 *BytesAdded = 8;
629 return 2;
632 // Find the original register that VReg is copied from.
633 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
634 while (Register::isVirtualRegister(VReg)) {
635 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
636 if (!DefMI->isFullCopy())
637 return VReg;
638 VReg = DefMI->getOperand(1).getReg();
640 return VReg;
643 // Determine if VReg is defined by an instruction that can be folded into a
644 // csel instruction. If so, return the folded opcode, and the replacement
645 // register.
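// e.g. if VReg is defined by '%x = ADDWri %y, 1, 0' (x = y + 1), the caller
// can use CSINCWr with %y as the replacement register instead of selecting %x.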
646 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
647 unsigned *NewVReg = nullptr) {
648 VReg = removeCopies(MRI, VReg);
649 if (!Register::isVirtualRegister(VReg))
650 return 0;
652 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
653 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
654 unsigned Opc = 0;
655 unsigned SrcOpNum = 0;
656 switch (DefMI->getOpcode()) {
657 case AArch64::ADDSXri:
658 case AArch64::ADDSWri:
659 // if NZCV is used, do not fold.
660 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
661 true) == -1)
662 return 0;
663 // fall-through to ADDXri and ADDWri.
664 [[fallthrough]];
665 case AArch64::ADDXri:
666 case AArch64::ADDWri:
667 // add x, 1 -> csinc.
668 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
669 DefMI->getOperand(3).getImm() != 0)
670 return 0;
671 SrcOpNum = 1;
672 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
673 break;
675 case AArch64::ORNXrr:
676 case AArch64::ORNWrr: {
677 // not x -> csinv, represented as orn dst, xzr, src.
678 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
679 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
680 return 0;
681 SrcOpNum = 2;
682 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
683 break;
686 case AArch64::SUBSXrr:
687 case AArch64::SUBSWrr:
688 // if NZCV is used, do not fold.
689 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
690 true) == -1)
691 return 0;
692 // fall-through to SUBXrr and SUBWrr.
693 [[fallthrough]];
694 case AArch64::SUBXrr:
695 case AArch64::SUBWrr: {
696 // neg x -> csneg, represented as sub dst, xzr, src.
697 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
698 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
699 return 0;
700 SrcOpNum = 2;
701 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
702 break;
704 default:
705 return 0;
707 assert(Opc && SrcOpNum && "Missing parameters");
709 if (NewVReg)
710 *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
711 return Opc;
714 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
715 ArrayRef<MachineOperand> Cond,
716 Register DstReg, Register TrueReg,
717 Register FalseReg, int &CondCycles,
718 int &TrueCycles,
719 int &FalseCycles) const {
720 // Check register classes.
721 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
722 const TargetRegisterClass *RC =
723 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
724 if (!RC)
725 return false;
727 // Also need to check the dest regclass, in case we're trying to optimize
728 // something like:
729 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
730 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
731 return false;
733 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
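// (A Cond vector of size 1 is a plain Bcc whose flags are already set; the
// cbz/tbz forms require materializing a SUBS/ANDS first, see insertSelect.)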
734 unsigned ExtraCondLat = Cond.size() != 1;
736 // GPRs are handled by csel.
737 // FIXME: Fold in x+1, -x, and ~x when applicable.
738 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
739 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
740 // Single-cycle csel, csinc, csinv, and csneg.
741 CondCycles = 1 + ExtraCondLat;
742 TrueCycles = FalseCycles = 1;
743 if (canFoldIntoCSel(MRI, TrueReg))
744 TrueCycles = 0;
745 else if (canFoldIntoCSel(MRI, FalseReg))
746 FalseCycles = 0;
747 return true;
750 // Scalar floating point is handled by fcsel.
751 // FIXME: Form fabs, fmin, and fmax when applicable.
752 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
753 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
754 CondCycles = 5 + ExtraCondLat;
755 TrueCycles = FalseCycles = 2;
756 return true;
759 // Can't do vectors.
760 return false;
763 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
764 MachineBasicBlock::iterator I,
765 const DebugLoc &DL, Register DstReg,
766 ArrayRef<MachineOperand> Cond,
767 Register TrueReg, Register FalseReg) const {
768 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
770 // Parse the condition code, see parseCondBranch() above.
771 AArch64CC::CondCode CC;
772 switch (Cond.size()) {
773 default:
774 llvm_unreachable("Unknown condition opcode in Cond");
775 case 1: // b.cc
776 CC = AArch64CC::CondCode(Cond[0].getImm());
777 break;
778 case 3: { // cbz/cbnz
779 // We must insert a compare against 0.
780 bool Is64Bit;
781 switch (Cond[1].getImm()) {
782 default:
783 llvm_unreachable("Unknown branch opcode in Cond");
784 case AArch64::CBZW:
785 Is64Bit = false;
786 CC = AArch64CC::EQ;
787 break;
788 case AArch64::CBZX:
789 Is64Bit = true;
790 CC = AArch64CC::EQ;
791 break;
792 case AArch64::CBNZW:
793 Is64Bit = false;
794 CC = AArch64CC::NE;
795 break;
796 case AArch64::CBNZX:
797 Is64Bit = true;
798 CC = AArch64CC::NE;
799 break;
801 Register SrcReg = Cond[2].getReg();
802 if (Is64Bit) {
803 // cmp reg, #0 is actually subs xzr, reg, #0.
804 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
805 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
806 .addReg(SrcReg)
807 .addImm(0)
808 .addImm(0);
809 } else {
810 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
811 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
812 .addReg(SrcReg)
813 .addImm(0)
814 .addImm(0);
816 break;
818 case 4: { // tbz/tbnz
819 // We must insert a tst instruction.
820 switch (Cond[1].getImm()) {
821 default:
822 llvm_unreachable("Unknown branch opcode in Cond");
823 case AArch64::TBZW:
824 case AArch64::TBZX:
825 CC = AArch64CC::EQ;
826 break;
827 case AArch64::TBNZW:
828 case AArch64::TBNZX:
829 CC = AArch64CC::NE;
830 break;
832 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
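// e.g. testing bit 3 of w0 becomes 'ands wzr, w0, #0x8', with the mask given
// as an encoded logical immediate.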
833 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
834 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
835 .addReg(Cond[2].getReg())
836 .addImm(
837 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
838 else
839 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
840 .addReg(Cond[2].getReg())
841 .addImm(
842 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
843 break;
847 unsigned Opc = 0;
848 const TargetRegisterClass *RC = nullptr;
849 bool TryFold = false;
850 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
851 RC = &AArch64::GPR64RegClass;
852 Opc = AArch64::CSELXr;
853 TryFold = true;
854 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
855 RC = &AArch64::GPR32RegClass;
856 Opc = AArch64::CSELWr;
857 TryFold = true;
858 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
859 RC = &AArch64::FPR64RegClass;
860 Opc = AArch64::FCSELDrrr;
861 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
862 RC = &AArch64::FPR32RegClass;
863 Opc = AArch64::FCSELSrrr;
865 assert(RC && "Unsupported regclass");
867 // Try folding simple instructions into the csel.
868 if (TryFold) {
869 unsigned NewVReg = 0;
870 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
871 if (FoldedOpc) {
872 // The folded opcodes csinc, csinv and csneg apply the operation to
873 // FalseReg, so we need to invert the condition.
874 CC = AArch64CC::getInvertedCondCode(CC);
875 TrueReg = FalseReg;
876 } else
877 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
879 // Fold the operation. Leave any dead instructions for DCE to clean up.
880 if (FoldedOpc) {
881 FalseReg = NewVReg;
882 Opc = FoldedOpc;
883 // This extends the live range of NewVReg.
884 MRI.clearKillFlags(NewVReg);
888 // Pull all virtual registers into the appropriate class.
889 MRI.constrainRegClass(TrueReg, RC);
890 MRI.constrainRegClass(FalseReg, RC);
892 // Insert the csel.
893 BuildMI(MBB, I, DL, get(Opc), DstReg)
894 .addReg(TrueReg)
895 .addReg(FalseReg)
896 .addImm(CC);
899 // Return true if Imm can be loaded into a register by a "cheap" sequence of
900 // instructions. For now, "cheap" means at most two instructions.
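// For illustration: 0xfffffffffffffffe expands to a single MOVN and is cheap,
// whereas an arbitrary 64-bit constant may need a MOVZ plus three MOVKs and
// would not be.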
901 static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
902 if (BitSize == 32)
903 return true;
905 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
906 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
907 SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
908 AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
910 return Is.size() <= 2;
913 // FIXME: this implementation should be micro-architecture dependent, so a
914 // micro-architecture target hook should be introduced here in the future.
915 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
916 if (Subtarget.hasExynosCheapAsMoveHandling()) {
917 if (isExynosCheapAsMove(MI))
918 return true;
919 return MI.isAsCheapAsAMove();
922 switch (MI.getOpcode()) {
923 default:
924 return MI.isAsCheapAsAMove();
926 case AArch64::ADDWrs:
927 case AArch64::ADDXrs:
928 case AArch64::SUBWrs:
929 case AArch64::SUBXrs:
930 return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
932 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
933 // ORRXri, it is as cheap as MOV.
934 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
935 case AArch64::MOVi32imm:
936 return isCheapImmediate(MI, 32);
937 case AArch64::MOVi64imm:
938 return isCheapImmediate(MI, 64);
942 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
943 switch (MI.getOpcode()) {
944 default:
945 return false;
947 case AArch64::ADDWrs:
948 case AArch64::ADDXrs:
949 case AArch64::ADDSWrs:
950 case AArch64::ADDSXrs: {
951 unsigned Imm = MI.getOperand(3).getImm();
952 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
953 if (ShiftVal == 0)
954 return true;
955 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
958 case AArch64::ADDWrx:
959 case AArch64::ADDXrx:
960 case AArch64::ADDXrx64:
961 case AArch64::ADDSWrx:
962 case AArch64::ADDSXrx:
963 case AArch64::ADDSXrx64: {
964 unsigned Imm = MI.getOperand(3).getImm();
965 switch (AArch64_AM::getArithExtendType(Imm)) {
966 default:
967 return false;
968 case AArch64_AM::UXTB:
969 case AArch64_AM::UXTH:
970 case AArch64_AM::UXTW:
971 case AArch64_AM::UXTX:
972 return AArch64_AM::getArithShiftValue(Imm) <= 4;
976 case AArch64::SUBWrs:
977 case AArch64::SUBSWrs: {
978 unsigned Imm = MI.getOperand(3).getImm();
979 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
980 return ShiftVal == 0 ||
981 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
984 case AArch64::SUBXrs:
985 case AArch64::SUBSXrs: {
986 unsigned Imm = MI.getOperand(3).getImm();
987 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
988 return ShiftVal == 0 ||
989 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
992 case AArch64::SUBWrx:
993 case AArch64::SUBXrx:
994 case AArch64::SUBXrx64:
995 case AArch64::SUBSWrx:
996 case AArch64::SUBSXrx:
997 case AArch64::SUBSXrx64: {
998 unsigned Imm = MI.getOperand(3).getImm();
999 switch (AArch64_AM::getArithExtendType(Imm)) {
1000 default:
1001 return false;
1002 case AArch64_AM::UXTB:
1003 case AArch64_AM::UXTH:
1004 case AArch64_AM::UXTW:
1005 case AArch64_AM::UXTX:
1006 return AArch64_AM::getArithShiftValue(Imm) == 0;
1010 case AArch64::LDRBBroW:
1011 case AArch64::LDRBBroX:
1012 case AArch64::LDRBroW:
1013 case AArch64::LDRBroX:
1014 case AArch64::LDRDroW:
1015 case AArch64::LDRDroX:
1016 case AArch64::LDRHHroW:
1017 case AArch64::LDRHHroX:
1018 case AArch64::LDRHroW:
1019 case AArch64::LDRHroX:
1020 case AArch64::LDRQroW:
1021 case AArch64::LDRQroX:
1022 case AArch64::LDRSBWroW:
1023 case AArch64::LDRSBWroX:
1024 case AArch64::LDRSBXroW:
1025 case AArch64::LDRSBXroX:
1026 case AArch64::LDRSHWroW:
1027 case AArch64::LDRSHWroX:
1028 case AArch64::LDRSHXroW:
1029 case AArch64::LDRSHXroX:
1030 case AArch64::LDRSWroW:
1031 case AArch64::LDRSWroX:
1032 case AArch64::LDRSroW:
1033 case AArch64::LDRSroX:
1034 case AArch64::LDRWroW:
1035 case AArch64::LDRWroX:
1036 case AArch64::LDRXroW:
1037 case AArch64::LDRXroX:
1038 case AArch64::PRFMroW:
1039 case AArch64::PRFMroX:
1040 case AArch64::STRBBroW:
1041 case AArch64::STRBBroX:
1042 case AArch64::STRBroW:
1043 case AArch64::STRBroX:
1044 case AArch64::STRDroW:
1045 case AArch64::STRDroX:
1046 case AArch64::STRHHroW:
1047 case AArch64::STRHHroX:
1048 case AArch64::STRHroW:
1049 case AArch64::STRHroX:
1050 case AArch64::STRQroW:
1051 case AArch64::STRQroX:
1052 case AArch64::STRSroW:
1053 case AArch64::STRSroX:
1054 case AArch64::STRWroW:
1055 case AArch64::STRWroX:
1056 case AArch64::STRXroW:
1057 case AArch64::STRXroX: {
1058 unsigned IsSigned = MI.getOperand(3).getImm();
1059 return !IsSigned;
1064 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1065 unsigned Opc = MI.getOpcode();
1066 switch (Opc) {
1067 default:
1068 return false;
1069 case AArch64::SEH_StackAlloc:
1070 case AArch64::SEH_SaveFPLR:
1071 case AArch64::SEH_SaveFPLR_X:
1072 case AArch64::SEH_SaveReg:
1073 case AArch64::SEH_SaveReg_X:
1074 case AArch64::SEH_SaveRegP:
1075 case AArch64::SEH_SaveRegP_X:
1076 case AArch64::SEH_SaveFReg:
1077 case AArch64::SEH_SaveFReg_X:
1078 case AArch64::SEH_SaveFRegP:
1079 case AArch64::SEH_SaveFRegP_X:
1080 case AArch64::SEH_SetFP:
1081 case AArch64::SEH_AddFP:
1082 case AArch64::SEH_Nop:
1083 case AArch64::SEH_PrologEnd:
1084 case AArch64::SEH_EpilogStart:
1085 case AArch64::SEH_EpilogEnd:
1086 case AArch64::SEH_PACSignLR:
1087 case AArch64::SEH_SaveAnyRegQP:
1088 case AArch64::SEH_SaveAnyRegQPX:
1089 return true;
1093 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1094 Register &SrcReg, Register &DstReg,
1095 unsigned &SubIdx) const {
1096 switch (MI.getOpcode()) {
1097 default:
1098 return false;
1099 case AArch64::SBFMXri: // aka sxtw
1100 case AArch64::UBFMXri: // aka uxtw
1101 // Check for the 32 -> 64 bit extension case, these instructions can do
1102 // much more.
1103 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1104 return false;
1105 // This is a signed or unsigned 32 -> 64 bit extension.
1106 SrcReg = MI.getOperand(1).getReg();
1107 DstReg = MI.getOperand(0).getReg();
1108 SubIdx = AArch64::sub_32;
1109 return true;
1113 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1114 const MachineInstr &MIa, const MachineInstr &MIb) const {
1115 const TargetRegisterInfo *TRI = &getRegisterInfo();
1116 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1117 int64_t OffsetA = 0, OffsetB = 0;
1118 TypeSize WidthA(0, false), WidthB(0, false);
1119 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1121 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1122 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1124 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1125 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1126 return false;
1128 // Retrieve the base, the offset from the base, and the width. The width
1129 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1130 // the bases are identical, and the offset of the lower memory access plus
1131 // its width does not extend beyond the offset of the higher memory access,
1132 // then the two accesses do not overlap.
1133 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1134 // are assumed to have the same scale (vscale).
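// For example, a store to [x0, #0] of width 8 and a load from [x0, #8] of
// width 4 are disjoint, since 0 + 8 <= 8.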
1135 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1136 WidthA, TRI) &&
1137 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1138 WidthB, TRI)) {
1139 if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1140 OffsetAIsScalable == OffsetBIsScalable) {
1141 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1142 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1143 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1144 if (LowWidth.isScalable() == OffsetAIsScalable &&
1145 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1146 return true;
1149 return false;
1152 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1153 const MachineBasicBlock *MBB,
1154 const MachineFunction &MF) const {
1155 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1156 return true;
1158 // Do not move an instruction that can be recognized as a branch target.
1159 if (hasBTISemantics(MI))
1160 return true;
1162 switch (MI.getOpcode()) {
1163 case AArch64::HINT:
1164 // CSDB hints are scheduling barriers.
1165 if (MI.getOperand(0).getImm() == 0x14)
1166 return true;
1167 break;
1168 case AArch64::DSB:
1169 case AArch64::ISB:
1170 // DSB and ISB also are scheduling barriers.
1171 return true;
1172 case AArch64::MSRpstatesvcrImm1:
1173 // SMSTART and SMSTOP are also scheduling barriers.
1174 return true;
1175 default:;
1177 if (isSEHInstruction(MI))
1178 return true;
1179 auto Next = std::next(MI.getIterator());
1180 return Next != MBB->end() && Next->isCFIInstruction();
1183 /// analyzeCompare - For a comparison instruction, return the source registers
1184 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1185 /// Return true if the comparison instruction can be analyzed.
1186 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1187 Register &SrcReg2, int64_t &CmpMask,
1188 int64_t &CmpValue) const {
1189 // The first operand can be a frame index where we'd normally expect a
1190 // register.
1191 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1192 if (!MI.getOperand(1).isReg())
1193 return false;
1195 switch (MI.getOpcode()) {
1196 default:
1197 break;
1198 case AArch64::PTEST_PP:
1199 case AArch64::PTEST_PP_ANY:
1200 SrcReg = MI.getOperand(0).getReg();
1201 SrcReg2 = MI.getOperand(1).getReg();
1202 // Not sure about the mask and value for now...
1203 CmpMask = ~0;
1204 CmpValue = 0;
1205 return true;
1206 case AArch64::SUBSWrr:
1207 case AArch64::SUBSWrs:
1208 case AArch64::SUBSWrx:
1209 case AArch64::SUBSXrr:
1210 case AArch64::SUBSXrs:
1211 case AArch64::SUBSXrx:
1212 case AArch64::ADDSWrr:
1213 case AArch64::ADDSWrs:
1214 case AArch64::ADDSWrx:
1215 case AArch64::ADDSXrr:
1216 case AArch64::ADDSXrs:
1217 case AArch64::ADDSXrx:
1218 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1219 SrcReg = MI.getOperand(1).getReg();
1220 SrcReg2 = MI.getOperand(2).getReg();
1221 CmpMask = ~0;
1222 CmpValue = 0;
1223 return true;
1224 case AArch64::SUBSWri:
1225 case AArch64::ADDSWri:
1226 case AArch64::SUBSXri:
1227 case AArch64::ADDSXri:
1228 SrcReg = MI.getOperand(1).getReg();
1229 SrcReg2 = 0;
1230 CmpMask = ~0;
1231 CmpValue = MI.getOperand(2).getImm();
1232 return true;
1233 case AArch64::ANDSWri:
1234 case AArch64::ANDSXri:
1235 // ANDS does not use the same encoding scheme as the other xxxS
1236 // instructions.
1237 SrcReg = MI.getOperand(1).getReg();
1238 SrcReg2 = 0;
1239 CmpMask = ~0;
1240 CmpValue = AArch64_AM::decodeLogicalImmediate(
1241 MI.getOperand(2).getImm(),
1242 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1243 return true;
1246 return false;
1249 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1250 MachineBasicBlock *MBB = Instr.getParent();
1251 assert(MBB && "Can't get MachineBasicBlock here");
1252 MachineFunction *MF = MBB->getParent();
1253 assert(MF && "Can't get MachineFunction here");
1254 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1255 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1256 MachineRegisterInfo *MRI = &MF->getRegInfo();
1258 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1259 ++OpIdx) {
1260 MachineOperand &MO = Instr.getOperand(OpIdx);
1261 const TargetRegisterClass *OpRegCstraints =
1262 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1264 // If there's no constraint, there's nothing to do.
1265 if (!OpRegCstraints)
1266 continue;
1267 // If the operand is a frame index, there's nothing to do here.
1268 // A frame index operand will resolve correctly during PEI.
1269 if (MO.isFI())
1270 continue;
1272 assert(MO.isReg() &&
1273 "Operand has register constraints without being a register!");
1275 Register Reg = MO.getReg();
1276 if (Reg.isPhysical()) {
1277 if (!OpRegCstraints->contains(Reg))
1278 return false;
1279 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1280 !MRI->constrainRegClass(Reg, OpRegCstraints))
1281 return false;
1284 return true;
1287 /// Return the opcode that does not set flags when possible - otherwise
1288 /// return the original opcode. The caller is responsible for doing the actual
1289 /// substitution and legality checking.
1290 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1291 // Don't convert all compare instructions, because for some the zero register
1292 // encoding becomes the sp register.
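// e.g. 'ADDS wzr, w1, #1' must stay ADDSWri: in ADDWri the destination
// encoding 31 would mean wsp rather than wzr.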
1293 bool MIDefinesZeroReg = false;
1294 if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1295 MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1296 MIDefinesZeroReg = true;
1298 switch (MI.getOpcode()) {
1299 default:
1300 return MI.getOpcode();
1301 case AArch64::ADDSWrr:
1302 return AArch64::ADDWrr;
1303 case AArch64::ADDSWri:
1304 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1305 case AArch64::ADDSWrs:
1306 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1307 case AArch64::ADDSWrx:
1308 return AArch64::ADDWrx;
1309 case AArch64::ADDSXrr:
1310 return AArch64::ADDXrr;
1311 case AArch64::ADDSXri:
1312 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1313 case AArch64::ADDSXrs:
1314 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1315 case AArch64::ADDSXrx:
1316 return AArch64::ADDXrx;
1317 case AArch64::SUBSWrr:
1318 return AArch64::SUBWrr;
1319 case AArch64::SUBSWri:
1320 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1321 case AArch64::SUBSWrs:
1322 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1323 case AArch64::SUBSWrx:
1324 return AArch64::SUBWrx;
1325 case AArch64::SUBSXrr:
1326 return AArch64::SUBXrr;
1327 case AArch64::SUBSXri:
1328 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1329 case AArch64::SUBSXrs:
1330 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1331 case AArch64::SUBSXrx:
1332 return AArch64::SUBXrx;
1336 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1338 /// True when condition flags are accessed (either by writing or reading)
1339 /// on the instruction trace starting at From and ending at To.
1341 /// Note: If From and To are from different blocks, it is assumed the condition
1342 /// flags are accessed on the path.
1343 static bool areCFlagsAccessedBetweenInstrs(
1344 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1345 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1346 // Early exit if To is at the beginning of the BB.
1347 if (To == To->getParent()->begin())
1348 return true;
1350 // Check whether the instructions are in the same basic block
1351 // If not, assume the condition flags might get modified somewhere.
1352 if (To->getParent() != From->getParent())
1353 return true;
1355 // From must be above To.
1356 assert(std::any_of(
1357 ++To.getReverse(), To->getParent()->rend(),
1358 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1360 // We iterate backward starting at \p To until we hit \p From.
1361 for (const MachineInstr &Instr :
1362 instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1363 if (((AccessToCheck & AK_Write) &&
1364 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1365 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1366 return true;
1368 return false;
1371 std::optional<unsigned>
1372 AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1373 MachineInstr *Pred,
1374 const MachineRegisterInfo *MRI) const {
1375 unsigned MaskOpcode = Mask->getOpcode();
1376 unsigned PredOpcode = Pred->getOpcode();
1377 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1378 bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1380 if (PredIsWhileLike) {
1381 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1382 // instruction and the condition is "any" since WHILEcc does an implicit
1383 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1384 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1385 return PredOpcode;
1387 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1388 // redundant since WHILE performs an implicit PTEST with an all active
1389 // mask.
1390 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1391 getElementSizeForOpcode(MaskOpcode) ==
1392 getElementSizeForOpcode(PredOpcode))
1393 return PredOpcode;
1395 return {};
1398 if (PredIsPTestLike) {
1399 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1400 // instruction that sets the flags as PTEST would and the condition is
1401 // "any" since PG is always a subset of the governing predicate of the
1402 // ptest-like instruction.
1403 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1404 return PredOpcode;
1406 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1407 // element size matches and either the PTEST_LIKE instruction uses
1408 // the same all active mask or the condition is "any".
1409 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1410 getElementSizeForOpcode(MaskOpcode) ==
1411 getElementSizeForOpcode(PredOpcode)) {
1412 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1413 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1414 return PredOpcode;
1417 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1418 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1419 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1420 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1421 // performed by the compare could consider fewer lanes for these element
1422 // sizes.
1424 // For example, consider
1426 // ptrue p0.b ; P0=1111-1111-1111-1111
1427 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1428 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1429 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1430 // ; ^ last active
1431 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1432 // ; ^ last active
1434 // where the compare generates a canonical all active 32-bit predicate
1435 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1436 // active flag, whereas the PTEST instruction with the same mask doesn't.
1437 // For PTEST_ANY this doesn't apply as the flags in this case would be
1438 // identical regardless of element size.
1439 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1440 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1441 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1442 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1443 return PredOpcode;
1445 return {};
1448 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1449 // opcode so the PTEST becomes redundant.
1450 switch (PredOpcode) {
1451 case AArch64::AND_PPzPP:
1452 case AArch64::BIC_PPzPP:
1453 case AArch64::EOR_PPzPP:
1454 case AArch64::NAND_PPzPP:
1455 case AArch64::NOR_PPzPP:
1456 case AArch64::ORN_PPzPP:
1457 case AArch64::ORR_PPzPP:
1458 case AArch64::BRKA_PPzP:
1459 case AArch64::BRKPA_PPzPP:
1460 case AArch64::BRKB_PPzP:
1461 case AArch64::BRKPB_PPzPP:
1462 case AArch64::RDFFR_PPz: {
1463 // Check to see if our mask is the same. If not, the resulting flag bits
1464 // may be different and we can't remove the ptest.
1465 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1466 if (Mask != PredMask)
1467 return {};
1468 break;
1470 case AArch64::BRKN_PPzP: {
1471 // BRKN uses an all active implicit mask to set flags unlike the other
1472 // flag-setting instructions.
1473 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1474 if ((MaskOpcode != AArch64::PTRUE_B) ||
1475 (Mask->getOperand(1).getImm() != 31))
1476 return {};
1477 break;
1479 case AArch64::PTRUE_B:
1480 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1481 break;
1482 default:
1483 // Bail out if we don't recognize the input
1484 return {};
1487 return convertToFlagSettingOpc(PredOpcode);
1490 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1491 /// operation that could set the flags in an identical manner.
1492 bool AArch64InstrInfo::optimizePTestInstr(
1493 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1494 const MachineRegisterInfo *MRI) const {
1495 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1496 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1497 unsigned PredOpcode = Pred->getOpcode();
1498 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1499 if (!NewOp)
1500 return false;
1502 const TargetRegisterInfo *TRI = &getRegisterInfo();
1504 // If another instruction between Pred and PTest accesses flags, don't remove
1505 // the ptest or update the earlier instruction to modify them.
1506 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1507 return false;
1509 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1510 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1511 // operand to be replaced with an equivalent instruction that also sets the
1512 // flags.
1513 PTest->eraseFromParent();
1514 if (*NewOp != PredOpcode) {
1515 Pred->setDesc(get(*NewOp));
1516 bool succeeded = UpdateOperandRegClass(*Pred);
1517 (void)succeeded;
1518 assert(succeeded && "Operands have incompatible register classes!");
1519 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1522 // Ensure that the flags def is live.
1523 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1524 unsigned i = 0, e = Pred->getNumOperands();
1525 for (; i != e; ++i) {
1526 MachineOperand &MO = Pred->getOperand(i);
1527 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1528 MO.setIsDead(false);
1529 break;
1533 return true;
1536 /// Try to optimize a compare instruction. A compare instruction is an
1537 /// instruction which produces AArch64::NZCV. It is a true compare
1538 /// instruction only when there are no uses of its destination register.
1541 /// The following steps are tried in order:
1542 /// 1. Convert CmpInstr into an unconditional version.
1543 /// 2. Remove CmpInstr if there is an instruction above it that produces a
1544 ///    needed condition code, or an instruction which can be converted into
1545 ///    such an instruction.
1546 /// Only comparison with zero is supported.
1547 bool AArch64InstrInfo::optimizeCompareInstr(
1548 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1549 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1550 assert(CmpInstr.getParent());
1551 assert(MRI);
1553 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1554 int DeadNZCVIdx =
1555 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1556 if (DeadNZCVIdx != -1) {
1557 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1558 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1559 CmpInstr.eraseFromParent();
1560 return true;
1562 unsigned Opc = CmpInstr.getOpcode();
1563 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1564 if (NewOpc == Opc)
1565 return false;
1566 const MCInstrDesc &MCID = get(NewOpc);
1567 CmpInstr.setDesc(MCID);
1568 CmpInstr.removeOperand(DeadNZCVIdx);
1569 bool succeeded = UpdateOperandRegClass(CmpInstr);
1570 (void)succeeded;
1571 assert(succeeded && "Some operands reg class are incompatible!");
1572 return true;
1575 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1576 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
1577 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1579 if (SrcReg2 != 0)
1580 return false;
1582 // CmpInstr is a Compare instruction if destination register is not used.
1583 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1584 return false;
1586 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1587 return true;
1588 return (CmpValue == 0 || CmpValue == 1) &&
1589 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1592 /// Get the opcode of the S (flag-setting) version of Instr.
1593 /// If Instr is already the S version, its opcode is returned.
1594 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
1595 /// version or we are not interested in it.
1596 static unsigned sForm(MachineInstr &Instr) {
1597 switch (Instr.getOpcode()) {
1598 default:
1599 return AArch64::INSTRUCTION_LIST_END;
1601 case AArch64::ADDSWrr:
1602 case AArch64::ADDSWri:
1603 case AArch64::ADDSXrr:
1604 case AArch64::ADDSXri:
1605 case AArch64::SUBSWrr:
1606 case AArch64::SUBSWri:
1607 case AArch64::SUBSXrr:
1608 case AArch64::SUBSXri:
1609 return Instr.getOpcode();
1611 case AArch64::ADDWrr:
1612 return AArch64::ADDSWrr;
1613 case AArch64::ADDWri:
1614 return AArch64::ADDSWri;
1615 case AArch64::ADDXrr:
1616 return AArch64::ADDSXrr;
1617 case AArch64::ADDXri:
1618 return AArch64::ADDSXri;
1619 case AArch64::ADCWr:
1620 return AArch64::ADCSWr;
1621 case AArch64::ADCXr:
1622 return AArch64::ADCSXr;
1623 case AArch64::SUBWrr:
1624 return AArch64::SUBSWrr;
1625 case AArch64::SUBWri:
1626 return AArch64::SUBSWri;
1627 case AArch64::SUBXrr:
1628 return AArch64::SUBSXrr;
1629 case AArch64::SUBXri:
1630 return AArch64::SUBSXri;
1631 case AArch64::SBCWr:
1632 return AArch64::SBCSWr;
1633 case AArch64::SBCXr:
1634 return AArch64::SBCSXr;
1635 case AArch64::ANDWri:
1636 return AArch64::ANDSWri;
1637 case AArch64::ANDXri:
1638 return AArch64::ANDSXri;
1642 /// Check if AArch64::NZCV should be alive in successors of MBB.
1643 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1644 for (auto *BB : MBB->successors())
1645 if (BB->isLiveIn(AArch64::NZCV))
1646 return true;
1647 return false;
1650 /// \returns The condition code operand index for \p Instr if it is a branch
1651 /// or select and -1 otherwise.
1652 static int
1653 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1654 switch (Instr.getOpcode()) {
1655 default:
1656 return -1;
1658 case AArch64::Bcc: {
1659 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1660 assert(Idx >= 2);
1661 return Idx - 2;
1664 case AArch64::CSINVWr:
1665 case AArch64::CSINVXr:
1666 case AArch64::CSINCWr:
1667 case AArch64::CSINCXr:
1668 case AArch64::CSELWr:
1669 case AArch64::CSELXr:
1670 case AArch64::CSNEGWr:
1671 case AArch64::CSNEGXr:
1672 case AArch64::FCSELSrrr:
1673 case AArch64::FCSELDrrr: {
1674 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1675 assert(Idx >= 1);
1676 return Idx - 1;
1681 /// Find a condition code used by the instruction.
1682 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1683 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1684 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1685 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1686 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1687 Instr.getOperand(CCIdx).getImm())
1688 : AArch64CC::Invalid;
1691 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1692 assert(CC != AArch64CC::Invalid);
1693 UsedNZCV UsedFlags;
1694 switch (CC) {
1695 default:
1696 break;
1698 case AArch64CC::EQ: // Z set
1699 case AArch64CC::NE: // Z clear
1700 UsedFlags.Z = true;
1701 break;
1703 case AArch64CC::HI: // Z clear and C set
1704 case AArch64CC::LS: // Z set or C clear
1705 UsedFlags.Z = true;
1706 [[fallthrough]];
1707 case AArch64CC::HS: // C set
1708 case AArch64CC::LO: // C clear
1709 UsedFlags.C = true;
1710 break;
1712 case AArch64CC::MI: // N set
1713 case AArch64CC::PL: // N clear
1714 UsedFlags.N = true;
1715 break;
1717 case AArch64CC::VS: // V set
1718 case AArch64CC::VC: // V clear
1719 UsedFlags.V = true;
1720 break;
1722 case AArch64CC::GT: // Z clear, N and V the same
1723 case AArch64CC::LE: // Z set, N and V differ
1724 UsedFlags.Z = true;
1725 [[fallthrough]];
1726 case AArch64CC::GE: // N and V the same
1727 case AArch64CC::LT: // N and V differ
1728 UsedFlags.N = true;
1729 UsedFlags.V = true;
1730 break;
1732 return UsedFlags;
1735 /// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
1736 /// flags are not alive in successors of the block containing \p CmpInstr and \p MI.
1737 /// \returns std::nullopt otherwise.
1739 /// Collects instructions using those flags in \p CCUseInstrs if provided.
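/// For example (illustrative), if CmpInstr is followed in its block only by
/// 'b.eq' (reads Z) and 'csel ..., lt' (reads N and V), the returned UsedNZCV
/// has Z, N and V set; an unsupported NZCV reader (or NZCV being live into a
/// successor) yields std::nullopt.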
1740 std::optional<UsedNZCV>
1741 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1742 const TargetRegisterInfo &TRI,
1743 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1744 MachineBasicBlock *CmpParent = CmpInstr.getParent();
1745 if (MI.getParent() != CmpParent)
1746 return std::nullopt;
1748 if (areCFlagsAliveInSuccessors(CmpParent))
1749 return std::nullopt;
1751 UsedNZCV NZCVUsedAfterCmp;
1752 for (MachineInstr &Instr : instructionsWithoutDebug(
1753 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1754 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1755 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1756 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1757 return std::nullopt;
1758 NZCVUsedAfterCmp |= getUsedNZCV(CC);
1759 if (CCUseInstrs)
1760 CCUseInstrs->push_back(&Instr);
1762 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1763 break;
1765 return NZCVUsedAfterCmp;
1768 static bool isADDSRegImm(unsigned Opcode) {
1769 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1772 static bool isSUBSRegImm(unsigned Opcode) {
1773 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1776 /// Check if CmpInstr can be substituted by MI.
1778 /// CmpInstr can be substituted if:
1779 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0',
1780 /// - and, MI and CmpInstr are in the same MachineBB,
1781 /// - and, condition flags are not alive in successors of the CmpInstr parent,
1782 /// - and, if MI opcode is the S form there must be no defs of flags between
1783 /// MI and CmpInstr,
1784 /// or if MI opcode is not the S form there must be neither defs nor uses of
1785 /// flags between MI and CmpInstr,
1786 /// - and, the C flag is not used after CmpInstr, and either the V flag is not
1787 /// used after CmpInstr or MI has the no-signed-wrap flag (it produces a
1788 /// poison value on signed overflow, so the V flag need not match).
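/// For instance (illustrative), 'add w8, w9, w10; cmp w8, #0; b.hs' cannot be
/// rewritten as 'adds w8, w9, w10; b.hs': ADDS sets C to the carry out of the
/// addition, while the compare with zero always produces C set.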
1789 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1790 const TargetRegisterInfo &TRI) {
1791 // NOTE: this assertion guarantees that MI.getOpcode() is an add or a
1792 // subtract that may or may not set flags.
1793 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1795 const unsigned CmpOpcode = CmpInstr.getOpcode();
1796 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1797 return false;
1799 assert((CmpInstr.getOperand(2).isImm() &&
1800 CmpInstr.getOperand(2).getImm() == 0) &&
1801 "Caller guarantees that CmpInstr compares with constant 0");
1803 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1804 if (!NZVCUsed || NZVCUsed->C)
1805 return false;
1807 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1808 // '%vreg = add ...' or '%vreg = sub ...'.
1809 // Condition flag V is used to indicate signed overflow.
1810 // 1) MI and CmpInstr set N and V to the same value.
1811 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
1812 // signed overflow occurs, so CmpInstr could still be simplified away.
1813 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
1814 return false;
1816 AccessKind AccessToCheck = AK_Write;
1817 if (sForm(MI) != MI.getOpcode())
1818 AccessToCheck = AK_All;
1819 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1822 /// Substitute an instruction comparing to zero with another instruction
1823 /// which produces the needed condition flags.
1825 /// Return true on success.
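/// A typical transformation (illustrative):
/// \code
/// sub w8, w9, w10
/// cmp w8, #0
/// b.eq
/// \endcode
/// to
/// \code
/// subs w8, w9, w10
/// b.eq
/// \endcode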
1826 bool AArch64InstrInfo::substituteCmpToZero(
1827 MachineInstr &CmpInstr, unsigned SrcReg,
1828 const MachineRegisterInfo &MRI) const {
1829 // Get the unique definition of SrcReg.
1830 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1831 if (!MI)
1832 return false;
1834 const TargetRegisterInfo &TRI = getRegisterInfo();
1836 unsigned NewOpc = sForm(*MI);
1837 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1838 return false;
1840 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1841 return false;
1843 // Update the instruction to set NZCV.
1844 MI->setDesc(get(NewOpc));
1845 CmpInstr.eraseFromParent();
1846 bool succeeded = UpdateOperandRegClass(*MI);
1847 (void)succeeded;
1848 assert(succeeded && "Some operands reg class are incompatible!");
1849 MI->addRegisterDefined(AArch64::NZCV, &TRI);
1850 return true;
1853 /// \returns True if \p CmpInstr can be removed.
1855 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1856 /// codes used in \p CCUseInstrs must be inverted.
1857 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1858 int CmpValue, const TargetRegisterInfo &TRI,
1859 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1860 bool &IsInvertCC) {
1861 assert((CmpValue == 0 || CmpValue == 1) &&
1862 "Only comparisons to 0 or 1 considered for removal!");
1864 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1865 unsigned MIOpc = MI.getOpcode();
1866 if (MIOpc == AArch64::CSINCWr) {
1867 if (MI.getOperand(1).getReg() != AArch64::WZR ||
1868 MI.getOperand(2).getReg() != AArch64::WZR)
1869 return false;
1870 } else if (MIOpc == AArch64::CSINCXr) {
1871 if (MI.getOperand(1).getReg() != AArch64::XZR ||
1872 MI.getOperand(2).getReg() != AArch64::XZR)
1873 return false;
1874 } else {
1875 return false;
1877 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1878 if (MICC == AArch64CC::Invalid)
1879 return false;
1881 // NZCV needs to be defined
1882 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
1883 return false;
1885 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1886 const unsigned CmpOpcode = CmpInstr.getOpcode();
1887 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1888 if (CmpValue && !IsSubsRegImm)
1889 return false;
1890 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1891 return false;
1893 // MI conditions allowed: eq, ne, mi, pl
1894 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1895 if (MIUsedNZCV.C || MIUsedNZCV.V)
1896 return false;
1898 std::optional<UsedNZCV> NZCVUsedAfterCmp =
1899 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
1900 // Condition flags are not used in CmpInstr's basic block successors, and only
1901 // the Z or N flags are allowed to be used after CmpInstr within its basic block.
1902 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
1903 return false;
1904 // Z or N flag used after CmpInstr must correspond to the flag used in MI
1905 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1906 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1907 return false;
1908 // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne.
1909 if (MIUsedNZCV.N && !CmpValue)
1910 return false;
1912 // There must be no defs of flags between MI and CmpInstr
1913 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1914 return false;
1916 // Condition code is inverted in the following cases:
1917 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1918 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1919 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1920 (!CmpValue && MICC == AArch64CC::NE);
1921 return true;
1924 /// Remove comparison in csinc-cmp sequence
1926 /// Examples:
1927 /// 1. \code
1928 /// csinc w9, wzr, wzr, ne
1929 /// cmp w9, #0
1930 /// b.eq
1931 /// \endcode
1932 /// to
1933 /// \code
1934 /// csinc w9, wzr, wzr, ne
1935 /// b.ne
1936 /// \endcode
1938 /// 2. \code
1939 /// csinc x2, xzr, xzr, mi
1940 /// cmp x2, #1
1941 /// b.pl
1942 /// \endcode
1943 /// to
1944 /// \code
1945 /// csinc x2, xzr, xzr, mi
1946 /// b.pl
1947 /// \endcode
1949 /// \param CmpInstr comparison instruction
1950 /// \return True when comparison removed
1951 bool AArch64InstrInfo::removeCmpToZeroOrOne(
1952 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1953 const MachineRegisterInfo &MRI) const {
1954 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1955 if (!MI)
1956 return false;
1957 const TargetRegisterInfo &TRI = getRegisterInfo();
1958 SmallVector<MachineInstr *, 4> CCUseInstrs;
1959 bool IsInvertCC = false;
1960 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1961 IsInvertCC))
1962 return false;
1963 // Make transformation
1964 CmpInstr.eraseFromParent();
1965 if (IsInvertCC) {
1966 // Invert condition codes in CmpInstr CC users
1967 for (MachineInstr *CCUseInstr : CCUseInstrs) {
1968 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
1969 assert(Idx >= 0 && "Unexpected instruction using CC.");
1970 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
1971 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1972 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1973 CCOperand.setImm(CCUse);
1976 return true;
1979 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1980 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1981 MI.getOpcode() != AArch64::CATCHRET)
1982 return false;
1984 MachineBasicBlock &MBB = *MI.getParent();
1985 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1986 auto TRI = Subtarget.getRegisterInfo();
1987 DebugLoc DL = MI.getDebugLoc();
1989 if (MI.getOpcode() == AArch64::CATCHRET) {
1990 // Skip to the first instruction before the epilog.
1991 const TargetInstrInfo *TII =
1992 MBB.getParent()->getSubtarget().getInstrInfo();
1993 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1994 auto MBBI = MachineBasicBlock::iterator(MI);
1995 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1996 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1997 FirstEpilogSEH != MBB.begin())
1998 FirstEpilogSEH = std::prev(FirstEpilogSEH);
1999 if (FirstEpilogSEH != MBB.begin())
2000 FirstEpilogSEH = std::next(FirstEpilogSEH);
2001 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2002 .addReg(AArch64::X0, RegState::Define)
2003 .addMBB(TargetMBB);
2004 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2005 .addReg(AArch64::X0, RegState::Define)
2006 .addReg(AArch64::X0)
2007 .addMBB(TargetMBB)
2008 .addImm(0);
2009 TargetMBB->setMachineBlockAddressTaken();
2010 return true;
2013 Register Reg = MI.getOperand(0).getReg();
2014 Module &M = *MBB.getParent()->getFunction().getParent();
2015 if (M.getStackProtectorGuard() == "sysreg") {
2016 const AArch64SysReg::SysReg *SrcReg =
2017 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2018 if (!SrcReg)
2019 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2021 // mrs xN, sysreg
2022 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2023 .addDef(Reg, RegState::Renamable)
2024 .addImm(SrcReg->Encoding);
2025 int Offset = M.getStackProtectorGuardOffset();
2026 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2027 // ldr xN, [xN, #offset]
2028 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2029 .addDef(Reg)
2030 .addUse(Reg, RegState::Kill)
2031 .addImm(Offset / 8);
2032 } else if (Offset >= -256 && Offset <= 255) {
2033 // ldur xN, [xN, #offset]
2034 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2035 .addDef(Reg)
2036 .addUse(Reg, RegState::Kill)
2037 .addImm(Offset);
2038 } else if (Offset >= -4095 && Offset <= 4095) {
2039 if (Offset > 0) {
2040 // add xN, xN, #offset
2041 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2042 .addDef(Reg)
2043 .addUse(Reg, RegState::Kill)
2044 .addImm(Offset)
2045 .addImm(0);
2046 } else {
2047 // sub xN, xN, #offset
2048 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2049 .addDef(Reg)
2050 .addUse(Reg, RegState::Kill)
2051 .addImm(-Offset)
2052 .addImm(0);
2054 // ldr xN, [xN]
2055 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2056 .addDef(Reg)
2057 .addUse(Reg, RegState::Kill)
2058 .addImm(0);
2059 } else {
2060 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2061 // than 32760.
2062 // It might be nice to use AArch64::MOVi32imm here, which would get
2063 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2064 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2065 // AArch64FrameLowering might help us find such a scratch register
2066 // though. If we failed to find a scratch register, we could emit a
2067 // stream of add instructions to build up the immediate. Or, we could try
2068 // to insert a AArch64::MOVi32imm before register allocation so that we
2069 // didn't need to scavenge for a scratch register.
2070 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2072 MBB.erase(MI);
2073 return true;
2076 const GlobalValue *GV =
2077 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2078 const TargetMachine &TM = MBB.getParent()->getTarget();
2079 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2080 const unsigned char MO_NC = AArch64II::MO_NC;
2082 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2083 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2084 .addGlobalAddress(GV, 0, OpFlags);
2085 if (Subtarget.isTargetILP32()) {
2086 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2087 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2088 .addDef(Reg32, RegState::Dead)
2089 .addUse(Reg, RegState::Kill)
2090 .addImm(0)
2091 .addMemOperand(*MI.memoperands_begin())
2092 .addDef(Reg, RegState::Implicit);
2093 } else {
2094 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2095 .addReg(Reg, RegState::Kill)
2096 .addImm(0)
2097 .addMemOperand(*MI.memoperands_begin());
2099 } else if (TM.getCodeModel() == CodeModel::Large) {
2100 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2101 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2102 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2103 .addImm(0);
2104 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2105 .addReg(Reg, RegState::Kill)
2106 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2107 .addImm(16);
2108 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2109 .addReg(Reg, RegState::Kill)
2110 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2111 .addImm(32);
2112 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2113 .addReg(Reg, RegState::Kill)
2114 .addGlobalAddress(GV, 0, AArch64II::MO_G3)
2115 .addImm(48);
2116 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2117 .addReg(Reg, RegState::Kill)
2118 .addImm(0)
2119 .addMemOperand(*MI.memoperands_begin());
2120 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2121 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2122 .addGlobalAddress(GV, 0, OpFlags);
2123 } else {
2124 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2125 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2126 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2127 if (Subtarget.isTargetILP32()) {
2128 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2129 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2130 .addDef(Reg32, RegState::Dead)
2131 .addUse(Reg, RegState::Kill)
2132 .addGlobalAddress(GV, 0, LoFlags)
2133 .addMemOperand(*MI.memoperands_begin())
2134 .addDef(Reg, RegState::Implicit);
2135 } else {
2136 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2137 .addReg(Reg, RegState::Kill)
2138 .addGlobalAddress(GV, 0, LoFlags)
2139 .addMemOperand(*MI.memoperands_begin());
2143 MBB.erase(MI);
2145 return true;
2148 // Return true if this instruction simply sets its single destination register
2149 // to zero. This is equivalent to a register rename of the zero-register.
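// E.g. (illustrative) 'movz w0, #0' and 'and w0, wzr, #3' both leave the
// destination equal to zero, so they can be treated as renames of wzr.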
2150 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2151 switch (MI.getOpcode()) {
2152 default:
2153 break;
2154 case AArch64::MOVZWi:
2155 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2156 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2157 assert(MI.getDesc().getNumOperands() == 3 &&
2158 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2159 return true;
2161 break;
2162 case AArch64::ANDWri: // and Rd, Rzr, #imm
2163 return MI.getOperand(1).getReg() == AArch64::WZR;
2164 case AArch64::ANDXri:
2165 return MI.getOperand(1).getReg() == AArch64::XZR;
2166 case TargetOpcode::COPY:
2167 return MI.getOperand(1).getReg() == AArch64::WZR;
2169 return false;
2172 // Return true if this instruction simply renames a general register without
2173 // modifying bits.
2174 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2175 switch (MI.getOpcode()) {
2176 default:
2177 break;
2178 case TargetOpcode::COPY: {
2179 // GPR32 copies will be lowered to ORRXrs
2180 Register DstReg = MI.getOperand(0).getReg();
2181 return (AArch64::GPR32RegClass.contains(DstReg) ||
2182 AArch64::GPR64RegClass.contains(DstReg));
2184 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2185 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2186 assert(MI.getDesc().getNumOperands() == 4 &&
2187 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2188 return true;
2190 break;
2191 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2192 if (MI.getOperand(2).getImm() == 0) {
2193 assert(MI.getDesc().getNumOperands() == 4 &&
2194 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2195 return true;
2197 break;
2199 return false;
2202 // Return true if this instruction simply renames a floating-point/vector
2203 // register without modifying bits.
2204 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2205 switch (MI.getOpcode()) {
2206 default:
2207 break;
2208 case TargetOpcode::COPY: {
2209 Register DstReg = MI.getOperand(0).getReg();
2210 return AArch64::FPR128RegClass.contains(DstReg);
2212 case AArch64::ORRv16i8:
2213 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2214 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2215 "invalid ORRv16i8 operands");
2216 return true;
2218 break;
2220 return false;
2223 Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2224 int &FrameIndex) const {
2225 switch (MI.getOpcode()) {
2226 default:
2227 break;
2228 case AArch64::LDRWui:
2229 case AArch64::LDRXui:
2230 case AArch64::LDRBui:
2231 case AArch64::LDRHui:
2232 case AArch64::LDRSui:
2233 case AArch64::LDRDui:
2234 case AArch64::LDRQui:
2235 case AArch64::LDR_PXI:
2236 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2237 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2238 FrameIndex = MI.getOperand(1).getIndex();
2239 return MI.getOperand(0).getReg();
2241 break;
2244 return 0;
2247 Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2248 int &FrameIndex) const {
2249 switch (MI.getOpcode()) {
2250 default:
2251 break;
2252 case AArch64::STRWui:
2253 case AArch64::STRXui:
2254 case AArch64::STRBui:
2255 case AArch64::STRHui:
2256 case AArch64::STRSui:
2257 case AArch64::STRDui:
2258 case AArch64::STRQui:
2259 case AArch64::STR_PXI:
2260 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2261 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2262 FrameIndex = MI.getOperand(1).getIndex();
2263 return MI.getOperand(0).getReg();
2265 break;
2267 return 0;
2270 /// Check all MachineMemOperands for a hint to suppress pairing.
2271 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2272 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2273 return MMO->getFlags() & MOSuppressPair;
2277 /// Set a flag on the first MachineMemOperand to suppress pairing.
2278 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2279 if (MI.memoperands_empty())
2280 return;
2281 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2284 /// Check all MachineMemOperands for a hint that the load/store is strided.
2285 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2286 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2287 return MMO->getFlags() & MOStridedAccess;
2291 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2292 switch (Opc) {
2293 default:
2294 return false;
2295 case AArch64::STURSi:
2296 case AArch64::STRSpre:
2297 case AArch64::STURDi:
2298 case AArch64::STRDpre:
2299 case AArch64::STURQi:
2300 case AArch64::STRQpre:
2301 case AArch64::STURBBi:
2302 case AArch64::STURHHi:
2303 case AArch64::STURWi:
2304 case AArch64::STRWpre:
2305 case AArch64::STURXi:
2306 case AArch64::STRXpre:
2307 case AArch64::LDURSi:
2308 case AArch64::LDRSpre:
2309 case AArch64::LDURDi:
2310 case AArch64::LDRDpre:
2311 case AArch64::LDURQi:
2312 case AArch64::LDRQpre:
2313 case AArch64::LDURWi:
2314 case AArch64::LDRWpre:
2315 case AArch64::LDURXi:
2316 case AArch64::LDRXpre:
2317 case AArch64::LDRSWpre:
2318 case AArch64::LDURSWi:
2319 case AArch64::LDURHHi:
2320 case AArch64::LDURBBi:
2321 case AArch64::LDURSBWi:
2322 case AArch64::LDURSHWi:
2323 return true;
2327 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2328 switch (Opc) {
2329 default: return {};
2330 case AArch64::PRFMui: return AArch64::PRFUMi;
2331 case AArch64::LDRXui: return AArch64::LDURXi;
2332 case AArch64::LDRWui: return AArch64::LDURWi;
2333 case AArch64::LDRBui: return AArch64::LDURBi;
2334 case AArch64::LDRHui: return AArch64::LDURHi;
2335 case AArch64::LDRSui: return AArch64::LDURSi;
2336 case AArch64::LDRDui: return AArch64::LDURDi;
2337 case AArch64::LDRQui: return AArch64::LDURQi;
2338 case AArch64::LDRBBui: return AArch64::LDURBBi;
2339 case AArch64::LDRHHui: return AArch64::LDURHHi;
2340 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2341 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2342 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2343 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2344 case AArch64::LDRSWui: return AArch64::LDURSWi;
2345 case AArch64::STRXui: return AArch64::STURXi;
2346 case AArch64::STRWui: return AArch64::STURWi;
2347 case AArch64::STRBui: return AArch64::STURBi;
2348 case AArch64::STRHui: return AArch64::STURHi;
2349 case AArch64::STRSui: return AArch64::STURSi;
2350 case AArch64::STRDui: return AArch64::STURDi;
2351 case AArch64::STRQui: return AArch64::STURQi;
2352 case AArch64::STRBBui: return AArch64::STURBBi;
2353 case AArch64::STRHHui: return AArch64::STURHHi;
2357 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2358 switch (Opc) {
2359 default:
2360 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2361 case AArch64::ADDG:
2362 case AArch64::LDAPURBi:
2363 case AArch64::LDAPURHi:
2364 case AArch64::LDAPURi:
2365 case AArch64::LDAPURSBWi:
2366 case AArch64::LDAPURSBXi:
2367 case AArch64::LDAPURSHWi:
2368 case AArch64::LDAPURSHXi:
2369 case AArch64::LDAPURSWi:
2370 case AArch64::LDAPURXi:
2371 case AArch64::LDR_PPXI:
2372 case AArch64::LDR_PXI:
2373 case AArch64::LDR_ZXI:
2374 case AArch64::LDR_ZZXI:
2375 case AArch64::LDR_ZZZXI:
2376 case AArch64::LDR_ZZZZXI:
2377 case AArch64::LDRBBui:
2378 case AArch64::LDRBui:
2379 case AArch64::LDRDui:
2380 case AArch64::LDRHHui:
2381 case AArch64::LDRHui:
2382 case AArch64::LDRQui:
2383 case AArch64::LDRSBWui:
2384 case AArch64::LDRSBXui:
2385 case AArch64::LDRSHWui:
2386 case AArch64::LDRSHXui:
2387 case AArch64::LDRSui:
2388 case AArch64::LDRSWui:
2389 case AArch64::LDRWui:
2390 case AArch64::LDRXui:
2391 case AArch64::LDURBBi:
2392 case AArch64::LDURBi:
2393 case AArch64::LDURDi:
2394 case AArch64::LDURHHi:
2395 case AArch64::LDURHi:
2396 case AArch64::LDURQi:
2397 case AArch64::LDURSBWi:
2398 case AArch64::LDURSBXi:
2399 case AArch64::LDURSHWi:
2400 case AArch64::LDURSHXi:
2401 case AArch64::LDURSi:
2402 case AArch64::LDURSWi:
2403 case AArch64::LDURWi:
2404 case AArch64::LDURXi:
2405 case AArch64::PRFMui:
2406 case AArch64::PRFUMi:
2407 case AArch64::ST2Gi:
2408 case AArch64::STGi:
2409 case AArch64::STLURBi:
2410 case AArch64::STLURHi:
2411 case AArch64::STLURWi:
2412 case AArch64::STLURXi:
2413 case AArch64::StoreSwiftAsyncContext:
2414 case AArch64::STR_PPXI:
2415 case AArch64::STR_PXI:
2416 case AArch64::STR_ZXI:
2417 case AArch64::STR_ZZXI:
2418 case AArch64::STR_ZZZXI:
2419 case AArch64::STR_ZZZZXI:
2420 case AArch64::STRBBui:
2421 case AArch64::STRBui:
2422 case AArch64::STRDui:
2423 case AArch64::STRHHui:
2424 case AArch64::STRHui:
2425 case AArch64::STRQui:
2426 case AArch64::STRSui:
2427 case AArch64::STRWui:
2428 case AArch64::STRXui:
2429 case AArch64::STURBBi:
2430 case AArch64::STURBi:
2431 case AArch64::STURDi:
2432 case AArch64::STURHHi:
2433 case AArch64::STURHi:
2434 case AArch64::STURQi:
2435 case AArch64::STURSi:
2436 case AArch64::STURWi:
2437 case AArch64::STURXi:
2438 case AArch64::STZ2Gi:
2439 case AArch64::STZGi:
2440 case AArch64::TAGPstack:
2441 return 2;
2442 case AArch64::LD1B_D_IMM:
2443 case AArch64::LD1B_H_IMM:
2444 case AArch64::LD1B_IMM:
2445 case AArch64::LD1B_S_IMM:
2446 case AArch64::LD1D_IMM:
2447 case AArch64::LD1H_D_IMM:
2448 case AArch64::LD1H_IMM:
2449 case AArch64::LD1H_S_IMM:
2450 case AArch64::LD1RB_D_IMM:
2451 case AArch64::LD1RB_H_IMM:
2452 case AArch64::LD1RB_IMM:
2453 case AArch64::LD1RB_S_IMM:
2454 case AArch64::LD1RD_IMM:
2455 case AArch64::LD1RH_D_IMM:
2456 case AArch64::LD1RH_IMM:
2457 case AArch64::LD1RH_S_IMM:
2458 case AArch64::LD1RSB_D_IMM:
2459 case AArch64::LD1RSB_H_IMM:
2460 case AArch64::LD1RSB_S_IMM:
2461 case AArch64::LD1RSH_D_IMM:
2462 case AArch64::LD1RSH_S_IMM:
2463 case AArch64::LD1RSW_IMM:
2464 case AArch64::LD1RW_D_IMM:
2465 case AArch64::LD1RW_IMM:
2466 case AArch64::LD1SB_D_IMM:
2467 case AArch64::LD1SB_H_IMM:
2468 case AArch64::LD1SB_S_IMM:
2469 case AArch64::LD1SH_D_IMM:
2470 case AArch64::LD1SH_S_IMM:
2471 case AArch64::LD1SW_D_IMM:
2472 case AArch64::LD1W_D_IMM:
2473 case AArch64::LD1W_IMM:
2474 case AArch64::LD2B_IMM:
2475 case AArch64::LD2D_IMM:
2476 case AArch64::LD2H_IMM:
2477 case AArch64::LD2W_IMM:
2478 case AArch64::LD3B_IMM:
2479 case AArch64::LD3D_IMM:
2480 case AArch64::LD3H_IMM:
2481 case AArch64::LD3W_IMM:
2482 case AArch64::LD4B_IMM:
2483 case AArch64::LD4D_IMM:
2484 case AArch64::LD4H_IMM:
2485 case AArch64::LD4W_IMM:
2486 case AArch64::LDG:
2487 case AArch64::LDNF1B_D_IMM:
2488 case AArch64::LDNF1B_H_IMM:
2489 case AArch64::LDNF1B_IMM:
2490 case AArch64::LDNF1B_S_IMM:
2491 case AArch64::LDNF1D_IMM:
2492 case AArch64::LDNF1H_D_IMM:
2493 case AArch64::LDNF1H_IMM:
2494 case AArch64::LDNF1H_S_IMM:
2495 case AArch64::LDNF1SB_D_IMM:
2496 case AArch64::LDNF1SB_H_IMM:
2497 case AArch64::LDNF1SB_S_IMM:
2498 case AArch64::LDNF1SH_D_IMM:
2499 case AArch64::LDNF1SH_S_IMM:
2500 case AArch64::LDNF1SW_D_IMM:
2501 case AArch64::LDNF1W_D_IMM:
2502 case AArch64::LDNF1W_IMM:
2503 case AArch64::LDNPDi:
2504 case AArch64::LDNPQi:
2505 case AArch64::LDNPSi:
2506 case AArch64::LDNPWi:
2507 case AArch64::LDNPXi:
2508 case AArch64::LDNT1B_ZRI:
2509 case AArch64::LDNT1D_ZRI:
2510 case AArch64::LDNT1H_ZRI:
2511 case AArch64::LDNT1W_ZRI:
2512 case AArch64::LDPDi:
2513 case AArch64::LDPQi:
2514 case AArch64::LDPSi:
2515 case AArch64::LDPWi:
2516 case AArch64::LDPXi:
2517 case AArch64::LDRBBpost:
2518 case AArch64::LDRBBpre:
2519 case AArch64::LDRBpost:
2520 case AArch64::LDRBpre:
2521 case AArch64::LDRDpost:
2522 case AArch64::LDRDpre:
2523 case AArch64::LDRHHpost:
2524 case AArch64::LDRHHpre:
2525 case AArch64::LDRHpost:
2526 case AArch64::LDRHpre:
2527 case AArch64::LDRQpost:
2528 case AArch64::LDRQpre:
2529 case AArch64::LDRSpost:
2530 case AArch64::LDRSpre:
2531 case AArch64::LDRWpost:
2532 case AArch64::LDRWpre:
2533 case AArch64::LDRXpost:
2534 case AArch64::LDRXpre:
2535 case AArch64::ST1B_D_IMM:
2536 case AArch64::ST1B_H_IMM:
2537 case AArch64::ST1B_IMM:
2538 case AArch64::ST1B_S_IMM:
2539 case AArch64::ST1D_IMM:
2540 case AArch64::ST1H_D_IMM:
2541 case AArch64::ST1H_IMM:
2542 case AArch64::ST1H_S_IMM:
2543 case AArch64::ST1W_D_IMM:
2544 case AArch64::ST1W_IMM:
2545 case AArch64::ST2B_IMM:
2546 case AArch64::ST2D_IMM:
2547 case AArch64::ST2H_IMM:
2548 case AArch64::ST2W_IMM:
2549 case AArch64::ST3B_IMM:
2550 case AArch64::ST3D_IMM:
2551 case AArch64::ST3H_IMM:
2552 case AArch64::ST3W_IMM:
2553 case AArch64::ST4B_IMM:
2554 case AArch64::ST4D_IMM:
2555 case AArch64::ST4H_IMM:
2556 case AArch64::ST4W_IMM:
2557 case AArch64::STGPi:
2558 case AArch64::STGPreIndex:
2559 case AArch64::STZGPreIndex:
2560 case AArch64::ST2GPreIndex:
2561 case AArch64::STZ2GPreIndex:
2562 case AArch64::STGPostIndex:
2563 case AArch64::STZGPostIndex:
2564 case AArch64::ST2GPostIndex:
2565 case AArch64::STZ2GPostIndex:
2566 case AArch64::STNPDi:
2567 case AArch64::STNPQi:
2568 case AArch64::STNPSi:
2569 case AArch64::STNPWi:
2570 case AArch64::STNPXi:
2571 case AArch64::STNT1B_ZRI:
2572 case AArch64::STNT1D_ZRI:
2573 case AArch64::STNT1H_ZRI:
2574 case AArch64::STNT1W_ZRI:
2575 case AArch64::STPDi:
2576 case AArch64::STPQi:
2577 case AArch64::STPSi:
2578 case AArch64::STPWi:
2579 case AArch64::STPXi:
2580 case AArch64::STRBBpost:
2581 case AArch64::STRBBpre:
2582 case AArch64::STRBpost:
2583 case AArch64::STRBpre:
2584 case AArch64::STRDpost:
2585 case AArch64::STRDpre:
2586 case AArch64::STRHHpost:
2587 case AArch64::STRHHpre:
2588 case AArch64::STRHpost:
2589 case AArch64::STRHpre:
2590 case AArch64::STRQpost:
2591 case AArch64::STRQpre:
2592 case AArch64::STRSpost:
2593 case AArch64::STRSpre:
2594 case AArch64::STRWpost:
2595 case AArch64::STRWpre:
2596 case AArch64::STRXpost:
2597 case AArch64::STRXpre:
2598 return 3;
2599 case AArch64::LDPDpost:
2600 case AArch64::LDPDpre:
2601 case AArch64::LDPQpost:
2602 case AArch64::LDPQpre:
2603 case AArch64::LDPSpost:
2604 case AArch64::LDPSpre:
2605 case AArch64::LDPWpost:
2606 case AArch64::LDPWpre:
2607 case AArch64::LDPXpost:
2608 case AArch64::LDPXpre:
2609 case AArch64::STGPpre:
2610 case AArch64::STGPpost:
2611 case AArch64::STPDpost:
2612 case AArch64::STPDpre:
2613 case AArch64::STPQpost:
2614 case AArch64::STPQpre:
2615 case AArch64::STPSpost:
2616 case AArch64::STPSpre:
2617 case AArch64::STPWpost:
2618 case AArch64::STPWpre:
2619 case AArch64::STPXpost:
2620 case AArch64::STPXpre:
2621 return 4;
2625 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2626 switch (MI.getOpcode()) {
2627 default:
2628 return false;
2629 // Scaled instructions.
2630 case AArch64::STRSui:
2631 case AArch64::STRDui:
2632 case AArch64::STRQui:
2633 case AArch64::STRXui:
2634 case AArch64::STRWui:
2635 case AArch64::LDRSui:
2636 case AArch64::LDRDui:
2637 case AArch64::LDRQui:
2638 case AArch64::LDRXui:
2639 case AArch64::LDRWui:
2640 case AArch64::LDRSWui:
2641 // Unscaled instructions.
2642 case AArch64::STURSi:
2643 case AArch64::STRSpre:
2644 case AArch64::STURDi:
2645 case AArch64::STRDpre:
2646 case AArch64::STURQi:
2647 case AArch64::STRQpre:
2648 case AArch64::STURWi:
2649 case AArch64::STRWpre:
2650 case AArch64::STURXi:
2651 case AArch64::STRXpre:
2652 case AArch64::LDURSi:
2653 case AArch64::LDRSpre:
2654 case AArch64::LDURDi:
2655 case AArch64::LDRDpre:
2656 case AArch64::LDURQi:
2657 case AArch64::LDRQpre:
2658 case AArch64::LDURWi:
2659 case AArch64::LDRWpre:
2660 case AArch64::LDURXi:
2661 case AArch64::LDRXpre:
2662 case AArch64::LDURSWi:
2663 case AArch64::LDRSWpre:
2664 return true;
2668 bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
2669 switch (MI.getOpcode()) {
2670 default:
2671 assert((!MI.isCall() || !MI.isReturn()) &&
2672 "Unexpected instruction - was a new tail call opcode introduced?");
2673 return false;
2674 case AArch64::TCRETURNdi:
2675 case AArch64::TCRETURNri:
2676 case AArch64::TCRETURNrix16x17:
2677 case AArch64::TCRETURNrix17:
2678 case AArch64::TCRETURNrinotx16:
2679 case AArch64::TCRETURNriALL:
2680 case AArch64::AUTH_TCRETURN:
2681 case AArch64::AUTH_TCRETURN_BTI:
2682 return true;
2686 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2687 switch (Opc) {
2688 default:
2689 llvm_unreachable("Opcode has no flag setting equivalent!");
2690 // 32-bit cases:
2691 case AArch64::ADDWri:
2692 return AArch64::ADDSWri;
2693 case AArch64::ADDWrr:
2694 return AArch64::ADDSWrr;
2695 case AArch64::ADDWrs:
2696 return AArch64::ADDSWrs;
2697 case AArch64::ADDWrx:
2698 return AArch64::ADDSWrx;
2699 case AArch64::ANDWri:
2700 return AArch64::ANDSWri;
2701 case AArch64::ANDWrr:
2702 return AArch64::ANDSWrr;
2703 case AArch64::ANDWrs:
2704 return AArch64::ANDSWrs;
2705 case AArch64::BICWrr:
2706 return AArch64::BICSWrr;
2707 case AArch64::BICWrs:
2708 return AArch64::BICSWrs;
2709 case AArch64::SUBWri:
2710 return AArch64::SUBSWri;
2711 case AArch64::SUBWrr:
2712 return AArch64::SUBSWrr;
2713 case AArch64::SUBWrs:
2714 return AArch64::SUBSWrs;
2715 case AArch64::SUBWrx:
2716 return AArch64::SUBSWrx;
2717 // 64-bit cases:
2718 case AArch64::ADDXri:
2719 return AArch64::ADDSXri;
2720 case AArch64::ADDXrr:
2721 return AArch64::ADDSXrr;
2722 case AArch64::ADDXrs:
2723 return AArch64::ADDSXrs;
2724 case AArch64::ADDXrx:
2725 return AArch64::ADDSXrx;
2726 case AArch64::ANDXri:
2727 return AArch64::ANDSXri;
2728 case AArch64::ANDXrr:
2729 return AArch64::ANDSXrr;
2730 case AArch64::ANDXrs:
2731 return AArch64::ANDSXrs;
2732 case AArch64::BICXrr:
2733 return AArch64::BICSXrr;
2734 case AArch64::BICXrs:
2735 return AArch64::BICSXrs;
2736 case AArch64::SUBXri:
2737 return AArch64::SUBSXri;
2738 case AArch64::SUBXrr:
2739 return AArch64::SUBSXrr;
2740 case AArch64::SUBXrs:
2741 return AArch64::SUBSXrs;
2742 case AArch64::SUBXrx:
2743 return AArch64::SUBSXrx;
2744 // SVE instructions:
2745 case AArch64::AND_PPzPP:
2746 return AArch64::ANDS_PPzPP;
2747 case AArch64::BIC_PPzPP:
2748 return AArch64::BICS_PPzPP;
2749 case AArch64::EOR_PPzPP:
2750 return AArch64::EORS_PPzPP;
2751 case AArch64::NAND_PPzPP:
2752 return AArch64::NANDS_PPzPP;
2753 case AArch64::NOR_PPzPP:
2754 return AArch64::NORS_PPzPP;
2755 case AArch64::ORN_PPzPP:
2756 return AArch64::ORNS_PPzPP;
2757 case AArch64::ORR_PPzPP:
2758 return AArch64::ORRS_PPzPP;
2759 case AArch64::BRKA_PPzP:
2760 return AArch64::BRKAS_PPzP;
2761 case AArch64::BRKPA_PPzPP:
2762 return AArch64::BRKPAS_PPzPP;
2763 case AArch64::BRKB_PPzP:
2764 return AArch64::BRKBS_PPzP;
2765 case AArch64::BRKPB_PPzPP:
2766 return AArch64::BRKPBS_PPzPP;
2767 case AArch64::BRKN_PPzP:
2768 return AArch64::BRKNS_PPzP;
2769 case AArch64::RDFFR_PPz:
2770 return AArch64::RDFFRS_PPz;
2771 case AArch64::PTRUE_B:
2772 return AArch64::PTRUES_B;
2776 // Is this a candidate for ld/st merging or pairing? For example, we don't
2777 // touch volatiles or load/stores that have a hint to avoid pair formation.
2778 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2780 bool IsPreLdSt = isPreLdSt(MI);
2782 // If this is a volatile load/store, don't mess with it.
2783 if (MI.hasOrderedMemoryRef())
2784 return false;
2786 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2787 // For Pre-inc LD/ST, the operand is shifted by one.
2788 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2789 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2790 "Expected a reg or frame index operand.");
2792 // For Pre-indexed addressing quadword instructions, the third operand is the
2793 // immediate value.
2794 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2796 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2797 return false;
2799 // Can't merge/pair if the instruction modifies the base register.
2800 // e.g., ldr x0, [x0]
2801 // This case will never occur with an FI base.
2802 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
2803 // STR<S,D,Q,W,X>pre, it can be merged.
2804 // For example:
2805 // ldr q0, [x11, #32]!
2806 // ldr q1, [x11, #16]
2807 // to
2808 // ldp q0, q1, [x11, #32]!
2809 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2810 Register BaseReg = MI.getOperand(1).getReg();
2811 const TargetRegisterInfo *TRI = &getRegisterInfo();
2812 if (MI.modifiesRegister(BaseReg, TRI))
2813 return false;
2816 // Check if this load/store has a hint to avoid pair formation.
2817 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2818 if (isLdStPairSuppressed(MI))
2819 return false;
2821 // Do not pair any callee-save store/reload instructions in the
2822 // prologue/epilogue if the CFI information encoded the operations as separate
2823 // instructions, as that will cause the size of the actual prologue to mismatch
2824 // with the prologue size recorded in the Windows CFI.
2825 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2826 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2827 MI.getMF()->getFunction().needsUnwindTableEntry();
2828 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2829 MI.getFlag(MachineInstr::FrameDestroy)))
2830 return false;
2832 // On some CPUs quad load/store pairs are slower than two single load/stores.
2833 if (Subtarget.isPaired128Slow()) {
2834 switch (MI.getOpcode()) {
2835 default:
2836 break;
2837 case AArch64::LDURQi:
2838 case AArch64::STURQi:
2839 case AArch64::LDRQui:
2840 case AArch64::STRQui:
2841 return false;
2845 return true;
2848 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2849 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2850 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
2851 const TargetRegisterInfo *TRI) const {
2852 if (!LdSt.mayLoadOrStore())
2853 return false;
2855 const MachineOperand *BaseOp;
2856 TypeSize WidthN(0, false);
2857 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2858 WidthN, TRI))
2859 return false;
2860 // The maximum vscale is 16 under AArch64, return the maximal extent for the
2861 // vector.
2862 Width = LocationSize::precise(WidthN);
2863 BaseOps.push_back(BaseOp);
2864 return true;
2867 std::optional<ExtAddrMode>
2868 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2869 const TargetRegisterInfo *TRI) const {
2870 const MachineOperand *Base; // Filled with the base operand of MI.
2871 int64_t Offset; // Filled with the offset of MI.
2872 bool OffsetIsScalable;
2873 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2874 return std::nullopt;
2876 if (!Base->isReg())
2877 return std::nullopt;
2878 ExtAddrMode AM;
2879 AM.BaseReg = Base->getReg();
2880 AM.Displacement = Offset;
2881 AM.ScaledReg = 0;
2882 AM.Scale = 0;
2883 return AM;
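// Check whether AddrI, an address-computing instruction defining Reg, can be
// folded into the addressing mode of the memory instruction MemI that uses
// Reg. A minimal sketch of the simplest case (illustrative):
// add x8, x9, #16
// ldr x0, [x8, #8]
// ->
// ldr x0, [x9, #24]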
2886 bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
2887 Register Reg,
2888 const MachineInstr &AddrI,
2889 ExtAddrMode &AM) const {
2890 // Filter out instructions into which we cannot fold.
2891 unsigned NumBytes;
2892 int64_t OffsetScale = 1;
2893 switch (MemI.getOpcode()) {
2894 default:
2895 return false;
2897 case AArch64::LDURQi:
2898 case AArch64::STURQi:
2899 NumBytes = 16;
2900 break;
2902 case AArch64::LDURDi:
2903 case AArch64::STURDi:
2904 case AArch64::LDURXi:
2905 case AArch64::STURXi:
2906 NumBytes = 8;
2907 break;
2909 case AArch64::LDURWi:
2910 case AArch64::LDURSWi:
2911 case AArch64::STURWi:
2912 NumBytes = 4;
2913 break;
2915 case AArch64::LDURHi:
2916 case AArch64::STURHi:
2917 case AArch64::LDURHHi:
2918 case AArch64::STURHHi:
2919 case AArch64::LDURSHXi:
2920 case AArch64::LDURSHWi:
2921 NumBytes = 2;
2922 break;
2924 case AArch64::LDRBroX:
2925 case AArch64::LDRBBroX:
2926 case AArch64::LDRSBXroX:
2927 case AArch64::LDRSBWroX:
2928 case AArch64::STRBroX:
2929 case AArch64::STRBBroX:
2930 case AArch64::LDURBi:
2931 case AArch64::LDURBBi:
2932 case AArch64::LDURSBXi:
2933 case AArch64::LDURSBWi:
2934 case AArch64::STURBi:
2935 case AArch64::STURBBi:
2936 case AArch64::LDRBui:
2937 case AArch64::LDRBBui:
2938 case AArch64::LDRSBXui:
2939 case AArch64::LDRSBWui:
2940 case AArch64::STRBui:
2941 case AArch64::STRBBui:
2942 NumBytes = 1;
2943 break;
2945 case AArch64::LDRQroX:
2946 case AArch64::STRQroX:
2947 case AArch64::LDRQui:
2948 case AArch64::STRQui:
2949 NumBytes = 16;
2950 OffsetScale = 16;
2951 break;
2953 case AArch64::LDRDroX:
2954 case AArch64::STRDroX:
2955 case AArch64::LDRXroX:
2956 case AArch64::STRXroX:
2957 case AArch64::LDRDui:
2958 case AArch64::STRDui:
2959 case AArch64::LDRXui:
2960 case AArch64::STRXui:
2961 NumBytes = 8;
2962 OffsetScale = 8;
2963 break;
2965 case AArch64::LDRWroX:
2966 case AArch64::LDRSWroX:
2967 case AArch64::STRWroX:
2968 case AArch64::LDRWui:
2969 case AArch64::LDRSWui:
2970 case AArch64::STRWui:
2971 NumBytes = 4;
2972 OffsetScale = 4;
2973 break;
2975 case AArch64::LDRHroX:
2976 case AArch64::STRHroX:
2977 case AArch64::LDRHHroX:
2978 case AArch64::STRHHroX:
2979 case AArch64::LDRSHXroX:
2980 case AArch64::LDRSHWroX:
2981 case AArch64::LDRHui:
2982 case AArch64::STRHui:
2983 case AArch64::LDRHHui:
2984 case AArch64::STRHHui:
2985 case AArch64::LDRSHXui:
2986 case AArch64::LDRSHWui:
2987 NumBytes = 2;
2988 OffsetScale = 2;
2989 break;
2992 // Check the fold operand is not the loaded/stored value.
2993 const MachineOperand &BaseRegOp = MemI.getOperand(0);
2994 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
2995 return false;
2997 // Handle memory instructions with a [Reg, Reg] addressing mode.
2998 if (MemI.getOperand(2).isReg()) {
2999 // Bail if the addressing mode already includes extension of the offset
3000 // register.
3001 if (MemI.getOperand(3).getImm())
3002 return false;
3004 // Check if we actually have a scaled offset.
3005 if (MemI.getOperand(4).getImm() == 0)
3006 OffsetScale = 1;
3008 // If the address instruction is folded into the base register, then the
3009 // addressing mode must not have a scale. Then we can swap the base and the
3010 // scaled registers.
3011 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3012 return false;
3014 switch (AddrI.getOpcode()) {
3015 default:
3016 return false;
3018 case AArch64::SBFMXri:
3019 // sxtw Xa, Wm
3020 // ldr Xd, [Xn, Xa, lsl #N]
3021 // ->
3022 // ldr Xd, [Xn, Wm, sxtw #N]
3023 if (AddrI.getOperand(2).getImm() != 0 ||
3024 AddrI.getOperand(3).getImm() != 31)
3025 return false;
3027 AM.BaseReg = MemI.getOperand(1).getReg();
3028 if (AM.BaseReg == Reg)
3029 AM.BaseReg = MemI.getOperand(2).getReg();
3030 AM.ScaledReg = AddrI.getOperand(1).getReg();
3031 AM.Scale = OffsetScale;
3032 AM.Displacement = 0;
3033 AM.Form = ExtAddrMode::Formula::SExtScaledReg;
3034 return true;
3036 case TargetOpcode::SUBREG_TO_REG: {
3037 // mov Wa, Wm
3038 // ldr Xd, [Xn, Xa, lsl #N]
3039 // ->
3040 // ldr Xd, [Xn, Wm, uxtw #N]
3042 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3043 if (AddrI.getOperand(1).getImm() != 0 ||
3044 AddrI.getOperand(3).getImm() != AArch64::sub_32)
3045 return false;
3047 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3048 Register OffsetReg = AddrI.getOperand(2).getReg();
3049 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3050 return false;
3052 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3053 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3054 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3055 DefMI.getOperand(3).getImm() != 0)
3056 return false;
3058 AM.BaseReg = MemI.getOperand(1).getReg();
3059 if (AM.BaseReg == Reg)
3060 AM.BaseReg = MemI.getOperand(2).getReg();
3061 AM.ScaledReg = DefMI.getOperand(2).getReg();
3062 AM.Scale = OffsetScale;
3063 AM.Displacement = 0;
3064 AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
3065 return true;
3070 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3072 // Check we are not breaking a potential conversion to an LDP.
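// E.g. (illustrative) for an 8-byte access the LDP form only encodes offsets
// in [-512, 504]; folding 'add x1, x1, #1024' into 'ldr x0, [x1]' would move a
// previously pairable offset out of that range, so the fold is rejected.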
3073 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3074 int64_t NewOffset) -> bool {
3075 int64_t MinOffset, MaxOffset;
3076 switch (NumBytes) {
3077 default:
3078 return true;
3079 case 4:
3080 MinOffset = -256;
3081 MaxOffset = 252;
3082 break;
3083 case 8:
3084 MinOffset = -512;
3085 MaxOffset = 504;
3086 break;
3087 case 16:
3088 MinOffset = -1024;
3089 MaxOffset = 1008;
3090 break;
3092 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3093 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3095 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3096 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3097 int64_t NewOffset = OldOffset + Disp;
3098 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3099 return false;
3100 // If the old offset would fit into an LDP, but the new offset wouldn't,
3101 // bail out.
3102 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3103 return false;
3104 AM.BaseReg = AddrI.getOperand(1).getReg();
3105 AM.ScaledReg = 0;
3106 AM.Scale = 0;
3107 AM.Displacement = NewOffset;
3108 AM.Form = ExtAddrMode::Formula::Basic;
3109 return true;
3112 auto canFoldAddRegIntoAddrMode =
3113 [&](int64_t Scale,
3114 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
3115 if (MemI.getOperand(2).getImm() != 0)
3116 return false;
3117 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3118 return false;
3119 AM.BaseReg = AddrI.getOperand(1).getReg();
3120 AM.ScaledReg = AddrI.getOperand(2).getReg();
3121 AM.Scale = Scale;
3122 AM.Displacement = 0;
3123 AM.Form = Form;
3124 return true;
3127 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3128 unsigned Opcode = MemI.getOpcode();
3129 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3130 Subtarget.isSTRQroSlow();
3133 int64_t Disp = 0;
3134 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3135 switch (AddrI.getOpcode()) {
3136 default:
3137 return false;
3139 case AArch64::ADDXri:
3140 // add Xa, Xn, #N
3141 // ldr Xd, [Xa, #M]
3142 // ->
3143 // ldr Xd, [Xn, #N'+M]
3144 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3145 return canFoldAddSubImmIntoAddrMode(Disp);
3147 case AArch64::SUBXri:
3148 // sub Xa, Xn, #N
3149 // ldr Xd, [Xa, #M]
3150 // ->
3151 // ldr Xd, [Xn, #N'+M]
3152 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3153 return canFoldAddSubImmIntoAddrMode(-Disp);
3155 case AArch64::ADDXrs: {
3156 // add Xa, Xn, Xm, lsl #N
3157 // ldr Xd, [Xa]
3158 // ->
3159 // ldr Xd, [Xn, Xm, lsl #N]
3161 // Don't fold the add if the result would be slower, unless optimising for
3162 // size.
3163 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3164 if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL)
3165 return false;
3166 Shift = AArch64_AM::getShiftValue(Shift);
3167 if (!OptSize) {
3168 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3169 return false;
3170 if (avoidSlowSTRQ(MemI))
3171 return false;
3173 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3176 case AArch64::ADDXrr:
3177 // add Xa, Xn, Xm
3178 // ldr Xd, [Xa]
3179 // ->
3180 // ldr Xd, [Xn, Xm, lsl #0]
3182 // Don't fold the add if the result would be slower, unless optimising for
3183 // size.
3184 if (!OptSize && avoidSlowSTRQ(MemI))
3185 return false;
3186 return canFoldAddRegIntoAddrMode(1);
3188 case AArch64::ADDXrx:
3189 // add Xa, Xn, Wm, {s,u}xtw #N
3190 // ldr Xd, [Xa]
3191 // ->
3192 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3194 // Don't fold the add if the result would be slower, unless optimising for
3195 // size.
3196 if (!OptSize && avoidSlowSTRQ(MemI))
3197 return false;
3199 // Can fold only sign-/zero-extend of a word.
3200 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3201 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3202 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3203 return false;
3205 return canFoldAddRegIntoAddrMode(
3206 1ULL << AArch64_AM::getArithShiftValue(Imm),
3207 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3208 : ExtAddrMode::Formula::ZExtScaledReg);
3212 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3213 // return the opcode of an instruction performing the same operation, but using
3214 // the [Reg, Reg] addressing mode.
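// E.g. (illustrative) LDRXui ('ldr x0, [x1, #8]') maps to LDRXroX
// ('ldr x0, [x1, x2, lsl #3]').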
3215 static unsigned regOffsetOpcode(unsigned Opcode) {
3216 switch (Opcode) {
3217 default:
3218 llvm_unreachable("Address folding not implemented for instruction");
3220 case AArch64::LDURQi:
3221 case AArch64::LDRQui:
3222 return AArch64::LDRQroX;
3223 case AArch64::STURQi:
3224 case AArch64::STRQui:
3225 return AArch64::STRQroX;
3226 case AArch64::LDURDi:
3227 case AArch64::LDRDui:
3228 return AArch64::LDRDroX;
3229 case AArch64::STURDi:
3230 case AArch64::STRDui:
3231 return AArch64::STRDroX;
3232 case AArch64::LDURXi:
3233 case AArch64::LDRXui:
3234 return AArch64::LDRXroX;
3235 case AArch64::STURXi:
3236 case AArch64::STRXui:
3237 return AArch64::STRXroX;
3238 case AArch64::LDURWi:
3239 case AArch64::LDRWui:
3240 return AArch64::LDRWroX;
3241 case AArch64::LDURSWi:
3242 case AArch64::LDRSWui:
3243 return AArch64::LDRSWroX;
3244 case AArch64::STURWi:
3245 case AArch64::STRWui:
3246 return AArch64::STRWroX;
3247 case AArch64::LDURHi:
3248 case AArch64::LDRHui:
3249 return AArch64::LDRHroX;
3250 case AArch64::STURHi:
3251 case AArch64::STRHui:
3252 return AArch64::STRHroX;
3253 case AArch64::LDURHHi:
3254 case AArch64::LDRHHui:
3255 return AArch64::LDRHHroX;
3256 case AArch64::STURHHi:
3257 case AArch64::STRHHui:
3258 return AArch64::STRHHroX;
3259 case AArch64::LDURSHXi:
3260 case AArch64::LDRSHXui:
3261 return AArch64::LDRSHXroX;
3262 case AArch64::LDURSHWi:
3263 case AArch64::LDRSHWui:
3264 return AArch64::LDRSHWroX;
3265 case AArch64::LDURBi:
3266 case AArch64::LDRBui:
3267 return AArch64::LDRBroX;
3268 case AArch64::LDURBBi:
3269 case AArch64::LDRBBui:
3270 return AArch64::LDRBBroX;
3271 case AArch64::LDURSBXi:
3272 case AArch64::LDRSBXui:
3273 return AArch64::LDRSBXroX;
3274 case AArch64::LDURSBWi:
3275 case AArch64::LDRSBWui:
3276 return AArch64::LDRSBWroX;
3277 case AArch64::STURBi:
3278 case AArch64::STRBui:
3279 return AArch64::STRBroX;
3280 case AArch64::STURBBi:
3281 case AArch64::STRBBui:
3282 return AArch64::STRBBroX;
3286 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3287 // the opcode of an instruction performing the same operation, but using the
3288 // [Reg, #Imm] addressing mode with scaled offset.
3289 unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3290 switch (Opcode) {
3291 default:
3292 llvm_unreachable("Address folding not implemented for instruction");
3294 case AArch64::LDURQi:
3295 Scale = 16;
3296 return AArch64::LDRQui;
3297 case AArch64::STURQi:
3298 Scale = 16;
3299 return AArch64::STRQui;
3300 case AArch64::LDURDi:
3301 Scale = 8;
3302 return AArch64::LDRDui;
3303 case AArch64::STURDi:
3304 Scale = 8;
3305 return AArch64::STRDui;
3306 case AArch64::LDURXi:
3307 Scale = 8;
3308 return AArch64::LDRXui;
3309 case AArch64::STURXi:
3310 Scale = 8;
3311 return AArch64::STRXui;
3312 case AArch64::LDURWi:
3313 Scale = 4;
3314 return AArch64::LDRWui;
3315 case AArch64::LDURSWi:
3316 Scale = 4;
3317 return AArch64::LDRSWui;
3318 case AArch64::STURWi:
3319 Scale = 4;
3320 return AArch64::STRWui;
3321 case AArch64::LDURHi:
3322 Scale = 2;
3323 return AArch64::LDRHui;
3324 case AArch64::STURHi:
3325 Scale = 2;
3326 return AArch64::STRHui;
3327 case AArch64::LDURHHi:
3328 Scale = 2;
3329 return AArch64::LDRHHui;
3330 case AArch64::STURHHi:
3331 Scale = 2;
3332 return AArch64::STRHHui;
3333 case AArch64::LDURSHXi:
3334 Scale = 2;
3335 return AArch64::LDRSHXui;
3336 case AArch64::LDURSHWi:
3337 Scale = 2;
3338 return AArch64::LDRSHWui;
3339 case AArch64::LDURBi:
3340 Scale = 1;
3341 return AArch64::LDRBui;
3342 case AArch64::LDURBBi:
3343 Scale = 1;
3344 return AArch64::LDRBBui;
3345 case AArch64::LDURSBXi:
3346 Scale = 1;
3347 return AArch64::LDRSBXui;
3348 case AArch64::LDURSBWi:
3349 Scale = 1;
3350 return AArch64::LDRSBWui;
3351 case AArch64::STURBi:
3352 Scale = 1;
3353 return AArch64::STRBui;
3354 case AArch64::STURBBi:
3355 Scale = 1;
3356 return AArch64::STRBBui;
3357 case AArch64::LDRQui:
3358 case AArch64::STRQui:
3359 Scale = 16;
3360 return Opcode;
3361 case AArch64::LDRDui:
3362 case AArch64::STRDui:
3363 case AArch64::LDRXui:
3364 case AArch64::STRXui:
3365 Scale = 8;
3366 return Opcode;
3367 case AArch64::LDRWui:
3368 case AArch64::LDRSWui:
3369 case AArch64::STRWui:
3370 Scale = 4;
3371 return Opcode;
3372 case AArch64::LDRHui:
3373 case AArch64::STRHui:
3374 case AArch64::LDRHHui:
3375 case AArch64::STRHHui:
3376 case AArch64::LDRSHXui:
3377 case AArch64::LDRSHWui:
3378 Scale = 2;
3379 return Opcode;
3380 case AArch64::LDRBui:
3381 case AArch64::LDRBBui:
3382 case AArch64::LDRSBXui:
3383 case AArch64::LDRSBWui:
3384 case AArch64::STRBui:
3385 case AArch64::STRBBui:
3386 Scale = 1;
3387 return Opcode;
3391 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3392 // the opcode of an instruction performing the same operation, but using the
3393 // [Reg, #Imm] addressing mode with unscaled offset.
3394 unsigned unscaledOffsetOpcode(unsigned Opcode) {
3395 switch (Opcode) {
3396 default:
3397 llvm_unreachable("Address folding not implemented for instruction");
3399 case AArch64::LDURQi:
3400 case AArch64::STURQi:
3401 case AArch64::LDURDi:
3402 case AArch64::STURDi:
3403 case AArch64::LDURXi:
3404 case AArch64::STURXi:
3405 case AArch64::LDURWi:
3406 case AArch64::LDURSWi:
3407 case AArch64::STURWi:
3408 case AArch64::LDURHi:
3409 case AArch64::STURHi:
3410 case AArch64::LDURHHi:
3411 case AArch64::STURHHi:
3412 case AArch64::LDURSHXi:
3413 case AArch64::LDURSHWi:
3414 case AArch64::LDURBi:
3415 case AArch64::STURBi:
3416 case AArch64::LDURBBi:
3417 case AArch64::STURBBi:
3418 case AArch64::LDURSBWi:
3419 case AArch64::LDURSBXi:
3420 return Opcode;
3421 case AArch64::LDRQui:
3422 return AArch64::LDURQi;
3423 case AArch64::STRQui:
3424 return AArch64::STURQi;
3425 case AArch64::LDRDui:
3426 return AArch64::LDURDi;
3427 case AArch64::STRDui:
3428 return AArch64::STURDi;
3429 case AArch64::LDRXui:
3430 return AArch64::LDURXi;
3431 case AArch64::STRXui:
3432 return AArch64::STURXi;
3433 case AArch64::LDRWui:
3434 return AArch64::LDURWi;
3435 case AArch64::LDRSWui:
3436 return AArch64::LDURSWi;
3437 case AArch64::STRWui:
3438 return AArch64::STURWi;
3439 case AArch64::LDRHui:
3440 return AArch64::LDURHi;
3441 case AArch64::STRHui:
3442 return AArch64::STURHi;
3443 case AArch64::LDRHHui:
3444 return AArch64::LDURHHi;
3445 case AArch64::STRHHui:
3446 return AArch64::STURHHi;
3447 case AArch64::LDRSHXui:
3448 return AArch64::LDURSHXi;
3449 case AArch64::LDRSHWui:
3450 return AArch64::LDURSHWi;
3451 case AArch64::LDRBBui:
3452 return AArch64::LDURBBi;
3453 case AArch64::LDRBui:
3454 return AArch64::LDURBi;
3455 case AArch64::STRBBui:
3456 return AArch64::STURBBi;
3457 case AArch64::STRBui:
3458 return AArch64::STURBi;
3459 case AArch64::LDRSBWui:
3460 return AArch64::LDURSBWi;
3461 case AArch64::LDRSBXui:
3462 return AArch64::LDURSBXi;
3466 // Given the opcode of a memory load/store instruction, return the opcode of an
3467 // instruction performing the same operation, but using
3468 // the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3469 // offset register.
3470 static unsigned offsetExtendOpcode(unsigned Opcode) {
3471 switch (Opcode) {
3472 default:
3473 llvm_unreachable("Address folding not implemented for instruction");
3475 case AArch64::LDRQroX:
3476 case AArch64::LDURQi:
3477 case AArch64::LDRQui:
3478 return AArch64::LDRQroW;
3479 case AArch64::STRQroX:
3480 case AArch64::STURQi:
3481 case AArch64::STRQui:
3482 return AArch64::STRQroW;
3483 case AArch64::LDRDroX:
3484 case AArch64::LDURDi:
3485 case AArch64::LDRDui:
3486 return AArch64::LDRDroW;
3487 case AArch64::STRDroX:
3488 case AArch64::STURDi:
3489 case AArch64::STRDui:
3490 return AArch64::STRDroW;
3491 case AArch64::LDRXroX:
3492 case AArch64::LDURXi:
3493 case AArch64::LDRXui:
3494 return AArch64::LDRXroW;
3495 case AArch64::STRXroX:
3496 case AArch64::STURXi:
3497 case AArch64::STRXui:
3498 return AArch64::STRXroW;
3499 case AArch64::LDRWroX:
3500 case AArch64::LDURWi:
3501 case AArch64::LDRWui:
3502 return AArch64::LDRWroW;
3503 case AArch64::LDRSWroX:
3504 case AArch64::LDURSWi:
3505 case AArch64::LDRSWui:
3506 return AArch64::LDRSWroW;
3507 case AArch64::STRWroX:
3508 case AArch64::STURWi:
3509 case AArch64::STRWui:
3510 return AArch64::STRWroW;
3511 case AArch64::LDRHroX:
3512 case AArch64::LDURHi:
3513 case AArch64::LDRHui:
3514 return AArch64::LDRHroW;
3515 case AArch64::STRHroX:
3516 case AArch64::STURHi:
3517 case AArch64::STRHui:
3518 return AArch64::STRHroW;
3519 case AArch64::LDRHHroX:
3520 case AArch64::LDURHHi:
3521 case AArch64::LDRHHui:
3522 return AArch64::LDRHHroW;
3523 case AArch64::STRHHroX:
3524 case AArch64::STURHHi:
3525 case AArch64::STRHHui:
3526 return AArch64::STRHHroW;
3527 case AArch64::LDRSHXroX:
3528 case AArch64::LDURSHXi:
3529 case AArch64::LDRSHXui:
3530 return AArch64::LDRSHXroW;
3531 case AArch64::LDRSHWroX:
3532 case AArch64::LDURSHWi:
3533 case AArch64::LDRSHWui:
3534 return AArch64::LDRSHWroW;
3535 case AArch64::LDRBroX:
3536 case AArch64::LDURBi:
3537 case AArch64::LDRBui:
3538 return AArch64::LDRBroW;
3539 case AArch64::LDRBBroX:
3540 case AArch64::LDURBBi:
3541 case AArch64::LDRBBui:
3542 return AArch64::LDRBBroW;
3543 case AArch64::LDRSBXroX:
3544 case AArch64::LDURSBXi:
3545 case AArch64::LDRSBXui:
3546 return AArch64::LDRSBXroW;
3547 case AArch64::LDRSBWroX:
3548 case AArch64::LDURSBWi:
3549 case AArch64::LDRSBWui:
3550 return AArch64::LDRSBWroW;
3551 case AArch64::STRBroX:
3552 case AArch64::STURBi:
3553 case AArch64::STRBui:
3554 return AArch64::STRBroW;
3555 case AArch64::STRBBroX:
3556 case AArch64::STURBBi:
3557 case AArch64::STRBBui:
3558 return AArch64::STRBBroW;
3562 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
3563 const ExtAddrMode &AM) const {
3565 const DebugLoc &DL = MemI.getDebugLoc();
3566 MachineBasicBlock &MBB = *MemI.getParent();
3567 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
3569 if (AM.Form == ExtAddrMode::Formula::Basic) {
3570 if (AM.ScaledReg) {
3571 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
3572 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
3573 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3574 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3575 .addReg(MemI.getOperand(0).getReg(),
3576 MemI.mayLoad() ? RegState::Define : 0)
3577 .addReg(AM.BaseReg)
3578 .addReg(AM.ScaledReg)
3579 .addImm(0)
3580 .addImm(AM.Scale > 1)
3581 .setMemRefs(MemI.memoperands())
3582 .setMIFlags(MemI.getFlags());
3583 return B.getInstr();
3586 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
3587 "Addressing mode not supported for folding");
3589 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
3590 unsigned Scale = 1;
3591 unsigned Opcode = MemI.getOpcode();
3592 if (isInt<9>(AM.Displacement))
3593 Opcode = unscaledOffsetOpcode(Opcode);
3594 else
3595 Opcode = scaledOffsetOpcode(Opcode, Scale);
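// For example, a displacement of -8 fits in a signed 9-bit immediate and
// selects the unscaled LDUR form (Scale stays 1), whereas a displacement of
// 4096 keeps the scaled LDR form and is divided by the access size below.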
3597 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3598 .addReg(MemI.getOperand(0).getReg(),
3599 MemI.mayLoad() ? RegState::Define : 0)
3600 .addReg(AM.BaseReg)
3601 .addImm(AM.Displacement / Scale)
3602 .setMemRefs(MemI.memoperands())
3603 .setMIFlags(MemI.getFlags());
3604 return B.getInstr();
3607 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
3608 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
3609 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
3610 assert(AM.ScaledReg && !AM.Displacement &&
3611 "Address offset can be a register or an immediate, but not both");
3612 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
3613 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3614 // Make sure the offset register is in the correct register class.
3615 Register OffsetReg = AM.ScaledReg;
3616 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
3617 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
3618 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3619 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
3620 .addReg(AM.ScaledReg, 0, AArch64::sub_32);
3622 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3623 .addReg(MemI.getOperand(0).getReg(),
3624 MemI.mayLoad() ? RegState::Define : 0)
3625 .addReg(AM.BaseReg)
3626 .addReg(OffsetReg)
3627 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
3628 .addImm(AM.Scale != 1)
3629 .setMemRefs(MemI.memoperands())
3630 .setMIFlags(MemI.getFlags());
3632 return B.getInstr();
3635 llvm_unreachable(
3636 "Function must not be called with an addressing mode it can't handle");
3639 /// Return true if the opcode is a post-index ld/st instruction, which really
3640 /// loads from or stores to base+0.
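/// For example, LDRXpost (`ldr x0, [x1], #8`) reads from [x1] and only then
/// increments x1 by 8, so its effective memory offset is 0.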
3641 static bool isPostIndexLdStOpcode(unsigned Opcode) {
3642 switch (Opcode) {
3643 default:
3644 return false;
3645 case AArch64::LD1Fourv16b_POST:
3646 case AArch64::LD1Fourv1d_POST:
3647 case AArch64::LD1Fourv2d_POST:
3648 case AArch64::LD1Fourv2s_POST:
3649 case AArch64::LD1Fourv4h_POST:
3650 case AArch64::LD1Fourv4s_POST:
3651 case AArch64::LD1Fourv8b_POST:
3652 case AArch64::LD1Fourv8h_POST:
3653 case AArch64::LD1Onev16b_POST:
3654 case AArch64::LD1Onev1d_POST:
3655 case AArch64::LD1Onev2d_POST:
3656 case AArch64::LD1Onev2s_POST:
3657 case AArch64::LD1Onev4h_POST:
3658 case AArch64::LD1Onev4s_POST:
3659 case AArch64::LD1Onev8b_POST:
3660 case AArch64::LD1Onev8h_POST:
3661 case AArch64::LD1Rv16b_POST:
3662 case AArch64::LD1Rv1d_POST:
3663 case AArch64::LD1Rv2d_POST:
3664 case AArch64::LD1Rv2s_POST:
3665 case AArch64::LD1Rv4h_POST:
3666 case AArch64::LD1Rv4s_POST:
3667 case AArch64::LD1Rv8b_POST:
3668 case AArch64::LD1Rv8h_POST:
3669 case AArch64::LD1Threev16b_POST:
3670 case AArch64::LD1Threev1d_POST:
3671 case AArch64::LD1Threev2d_POST:
3672 case AArch64::LD1Threev2s_POST:
3673 case AArch64::LD1Threev4h_POST:
3674 case AArch64::LD1Threev4s_POST:
3675 case AArch64::LD1Threev8b_POST:
3676 case AArch64::LD1Threev8h_POST:
3677 case AArch64::LD1Twov16b_POST:
3678 case AArch64::LD1Twov1d_POST:
3679 case AArch64::LD1Twov2d_POST:
3680 case AArch64::LD1Twov2s_POST:
3681 case AArch64::LD1Twov4h_POST:
3682 case AArch64::LD1Twov4s_POST:
3683 case AArch64::LD1Twov8b_POST:
3684 case AArch64::LD1Twov8h_POST:
3685 case AArch64::LD1i16_POST:
3686 case AArch64::LD1i32_POST:
3687 case AArch64::LD1i64_POST:
3688 case AArch64::LD1i8_POST:
3689 case AArch64::LD2Rv16b_POST:
3690 case AArch64::LD2Rv1d_POST:
3691 case AArch64::LD2Rv2d_POST:
3692 case AArch64::LD2Rv2s_POST:
3693 case AArch64::LD2Rv4h_POST:
3694 case AArch64::LD2Rv4s_POST:
3695 case AArch64::LD2Rv8b_POST:
3696 case AArch64::LD2Rv8h_POST:
3697 case AArch64::LD2Twov16b_POST:
3698 case AArch64::LD2Twov2d_POST:
3699 case AArch64::LD2Twov2s_POST:
3700 case AArch64::LD2Twov4h_POST:
3701 case AArch64::LD2Twov4s_POST:
3702 case AArch64::LD2Twov8b_POST:
3703 case AArch64::LD2Twov8h_POST:
3704 case AArch64::LD2i16_POST:
3705 case AArch64::LD2i32_POST:
3706 case AArch64::LD2i64_POST:
3707 case AArch64::LD2i8_POST:
3708 case AArch64::LD3Rv16b_POST:
3709 case AArch64::LD3Rv1d_POST:
3710 case AArch64::LD3Rv2d_POST:
3711 case AArch64::LD3Rv2s_POST:
3712 case AArch64::LD3Rv4h_POST:
3713 case AArch64::LD3Rv4s_POST:
3714 case AArch64::LD3Rv8b_POST:
3715 case AArch64::LD3Rv8h_POST:
3716 case AArch64::LD3Threev16b_POST:
3717 case AArch64::LD3Threev2d_POST:
3718 case AArch64::LD3Threev2s_POST:
3719 case AArch64::LD3Threev4h_POST:
3720 case AArch64::LD3Threev4s_POST:
3721 case AArch64::LD3Threev8b_POST:
3722 case AArch64::LD3Threev8h_POST:
3723 case AArch64::LD3i16_POST:
3724 case AArch64::LD3i32_POST:
3725 case AArch64::LD3i64_POST:
3726 case AArch64::LD3i8_POST:
3727 case AArch64::LD4Fourv16b_POST:
3728 case AArch64::LD4Fourv2d_POST:
3729 case AArch64::LD4Fourv2s_POST:
3730 case AArch64::LD4Fourv4h_POST:
3731 case AArch64::LD4Fourv4s_POST:
3732 case AArch64::LD4Fourv8b_POST:
3733 case AArch64::LD4Fourv8h_POST:
3734 case AArch64::LD4Rv16b_POST:
3735 case AArch64::LD4Rv1d_POST:
3736 case AArch64::LD4Rv2d_POST:
3737 case AArch64::LD4Rv2s_POST:
3738 case AArch64::LD4Rv4h_POST:
3739 case AArch64::LD4Rv4s_POST:
3740 case AArch64::LD4Rv8b_POST:
3741 case AArch64::LD4Rv8h_POST:
3742 case AArch64::LD4i16_POST:
3743 case AArch64::LD4i32_POST:
3744 case AArch64::LD4i64_POST:
3745 case AArch64::LD4i8_POST:
3746 case AArch64::LDAPRWpost:
3747 case AArch64::LDAPRXpost:
3748 case AArch64::LDIAPPWpost:
3749 case AArch64::LDIAPPXpost:
3750 case AArch64::LDPDpost:
3751 case AArch64::LDPQpost:
3752 case AArch64::LDPSWpost:
3753 case AArch64::LDPSpost:
3754 case AArch64::LDPWpost:
3755 case AArch64::LDPXpost:
3756 case AArch64::LDRBBpost:
3757 case AArch64::LDRBpost:
3758 case AArch64::LDRDpost:
3759 case AArch64::LDRHHpost:
3760 case AArch64::LDRHpost:
3761 case AArch64::LDRQpost:
3762 case AArch64::LDRSBWpost:
3763 case AArch64::LDRSBXpost:
3764 case AArch64::LDRSHWpost:
3765 case AArch64::LDRSHXpost:
3766 case AArch64::LDRSWpost:
3767 case AArch64::LDRSpost:
3768 case AArch64::LDRWpost:
3769 case AArch64::LDRXpost:
3770 case AArch64::ST1Fourv16b_POST:
3771 case AArch64::ST1Fourv1d_POST:
3772 case AArch64::ST1Fourv2d_POST:
3773 case AArch64::ST1Fourv2s_POST:
3774 case AArch64::ST1Fourv4h_POST:
3775 case AArch64::ST1Fourv4s_POST:
3776 case AArch64::ST1Fourv8b_POST:
3777 case AArch64::ST1Fourv8h_POST:
3778 case AArch64::ST1Onev16b_POST:
3779 case AArch64::ST1Onev1d_POST:
3780 case AArch64::ST1Onev2d_POST:
3781 case AArch64::ST1Onev2s_POST:
3782 case AArch64::ST1Onev4h_POST:
3783 case AArch64::ST1Onev4s_POST:
3784 case AArch64::ST1Onev8b_POST:
3785 case AArch64::ST1Onev8h_POST:
3786 case AArch64::ST1Threev16b_POST:
3787 case AArch64::ST1Threev1d_POST:
3788 case AArch64::ST1Threev2d_POST:
3789 case AArch64::ST1Threev2s_POST:
3790 case AArch64::ST1Threev4h_POST:
3791 case AArch64::ST1Threev4s_POST:
3792 case AArch64::ST1Threev8b_POST:
3793 case AArch64::ST1Threev8h_POST:
3794 case AArch64::ST1Twov16b_POST:
3795 case AArch64::ST1Twov1d_POST:
3796 case AArch64::ST1Twov2d_POST:
3797 case AArch64::ST1Twov2s_POST:
3798 case AArch64::ST1Twov4h_POST:
3799 case AArch64::ST1Twov4s_POST:
3800 case AArch64::ST1Twov8b_POST:
3801 case AArch64::ST1Twov8h_POST:
3802 case AArch64::ST1i16_POST:
3803 case AArch64::ST1i32_POST:
3804 case AArch64::ST1i64_POST:
3805 case AArch64::ST1i8_POST:
3806 case AArch64::ST2GPostIndex:
3807 case AArch64::ST2Twov16b_POST:
3808 case AArch64::ST2Twov2d_POST:
3809 case AArch64::ST2Twov2s_POST:
3810 case AArch64::ST2Twov4h_POST:
3811 case AArch64::ST2Twov4s_POST:
3812 case AArch64::ST2Twov8b_POST:
3813 case AArch64::ST2Twov8h_POST:
3814 case AArch64::ST2i16_POST:
3815 case AArch64::ST2i32_POST:
3816 case AArch64::ST2i64_POST:
3817 case AArch64::ST2i8_POST:
3818 case AArch64::ST3Threev16b_POST:
3819 case AArch64::ST3Threev2d_POST:
3820 case AArch64::ST3Threev2s_POST:
3821 case AArch64::ST3Threev4h_POST:
3822 case AArch64::ST3Threev4s_POST:
3823 case AArch64::ST3Threev8b_POST:
3824 case AArch64::ST3Threev8h_POST:
3825 case AArch64::ST3i16_POST:
3826 case AArch64::ST3i32_POST:
3827 case AArch64::ST3i64_POST:
3828 case AArch64::ST3i8_POST:
3829 case AArch64::ST4Fourv16b_POST:
3830 case AArch64::ST4Fourv2d_POST:
3831 case AArch64::ST4Fourv2s_POST:
3832 case AArch64::ST4Fourv4h_POST:
3833 case AArch64::ST4Fourv4s_POST:
3834 case AArch64::ST4Fourv8b_POST:
3835 case AArch64::ST4Fourv8h_POST:
3836 case AArch64::ST4i16_POST:
3837 case AArch64::ST4i32_POST:
3838 case AArch64::ST4i64_POST:
3839 case AArch64::ST4i8_POST:
3840 case AArch64::STGPostIndex:
3841 case AArch64::STGPpost:
3842 case AArch64::STPDpost:
3843 case AArch64::STPQpost:
3844 case AArch64::STPSpost:
3845 case AArch64::STPWpost:
3846 case AArch64::STPXpost:
3847 case AArch64::STRBBpost:
3848 case AArch64::STRBpost:
3849 case AArch64::STRDpost:
3850 case AArch64::STRHHpost:
3851 case AArch64::STRHpost:
3852 case AArch64::STRQpost:
3853 case AArch64::STRSpost:
3854 case AArch64::STRWpost:
3855 case AArch64::STRXpost:
3856 case AArch64::STZ2GPostIndex:
3857 case AArch64::STZGPostIndex:
3858 return true;
3862 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
3863 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
3864 bool &OffsetIsScalable, TypeSize &Width,
3865 const TargetRegisterInfo *TRI) const {
3866 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3867 // Handle only loads/stores with base register followed by immediate offset.
3868 if (LdSt.getNumExplicitOperands() == 3) {
3869 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
3870 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
3871 !LdSt.getOperand(2).isImm())
3872 return false;
3873 } else if (LdSt.getNumExplicitOperands() == 4) {
3874 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
3875 if (!LdSt.getOperand(1).isReg() ||
3876 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
3877 !LdSt.getOperand(3).isImm())
3878 return false;
3879 } else
3880 return false;
3882 // Get the scaling factor for the instruction and set the width of the
3883 // memory access.
3884 TypeSize Scale(0U, false);
3885 int64_t Dummy1, Dummy2;
3887 // If this returns false, then it's an instruction we don't want to handle.
3888 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
3889 return false;
3891 // Compute the offset. The offset is the immediate operand multiplied by the
3892 // scaling factor; unscaled instructions have a scaling factor of 1.
3893 // Post-index instructions are a special case and have an offset of 0.
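// For example, LDRXui with an immediate operand of 2 (Scale = 8) yields
// Offset = 16, while LDURXi with an immediate of -8 (Scale = 1) yields -8.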
3894 if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
3895 BaseOp = &LdSt.getOperand(2);
3896 Offset = 0;
3897 } else if (LdSt.getNumExplicitOperands() == 3) {
3898 BaseOp = &LdSt.getOperand(1);
3899 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
3900 } else {
3901 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
3902 BaseOp = &LdSt.getOperand(2);
3903 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
3905 OffsetIsScalable = Scale.isScalable();
3907 return BaseOp->isReg() || BaseOp->isFI();
3910 MachineOperand &
3911 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
3912 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3913 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
3914 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
3915 return OfsOp;
3918 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
3919 TypeSize &Width, int64_t &MinOffset,
3920 int64_t &MaxOffset) {
3921 switch (Opcode) {
3922 // Not a memory operation, or not one we want to handle.
3923 default:
3924 Scale = TypeSize::getFixed(0);
3925 Width = TypeSize::getFixed(0);
3926 MinOffset = MaxOffset = 0;
3927 return false;
3928 // LDR / STR
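// Scaled unsigned-offset forms: the immediate is in units of the access size,
// so e.g. LDRQui covers byte offsets 0 to 16 * 4095.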
3929 case AArch64::LDRQui:
3930 case AArch64::STRQui:
3931 Scale = TypeSize::getFixed(16);
3932 Width = TypeSize::getFixed(16);
3933 MinOffset = 0;
3934 MaxOffset = 4095;
3935 break;
3936 case AArch64::LDRXui:
3937 case AArch64::LDRDui:
3938 case AArch64::STRXui:
3939 case AArch64::STRDui:
3940 case AArch64::PRFMui:
3941 Scale = TypeSize::getFixed(8);
3942 Width = TypeSize::getFixed(8);
3943 MinOffset = 0;
3944 MaxOffset = 4095;
3945 break;
3946 case AArch64::LDRWui:
3947 case AArch64::LDRSui:
3948 case AArch64::LDRSWui:
3949 case AArch64::STRWui:
3950 case AArch64::STRSui:
3951 Scale = TypeSize::getFixed(4);
3952 Width = TypeSize::getFixed(4);
3953 MinOffset = 0;
3954 MaxOffset = 4095;
3955 break;
3956 case AArch64::LDRHui:
3957 case AArch64::LDRHHui:
3958 case AArch64::LDRSHWui:
3959 case AArch64::LDRSHXui:
3960 case AArch64::STRHui:
3961 case AArch64::STRHHui:
3962 Scale = TypeSize::getFixed(2);
3963 Width = TypeSize::getFixed(2);
3964 MinOffset = 0;
3965 MaxOffset = 4095;
3966 break;
3967 case AArch64::LDRBui:
3968 case AArch64::LDRBBui:
3969 case AArch64::LDRSBWui:
3970 case AArch64::LDRSBXui:
3971 case AArch64::STRBui:
3972 case AArch64::STRBBui:
3973 Scale = TypeSize::getFixed(1);
3974 Width = TypeSize::getFixed(1);
3975 MinOffset = 0;
3976 MaxOffset = 4095;
3977 break;
3978 // post/pre inc
3979 case AArch64::STRQpre:
3980 case AArch64::LDRQpost:
3981 Scale = TypeSize::getFixed(1);
3982 Width = TypeSize::getFixed(16);
3983 MinOffset = -256;
3984 MaxOffset = 255;
3985 break;
3986 case AArch64::LDRDpost:
3987 case AArch64::LDRDpre:
3988 case AArch64::LDRXpost:
3989 case AArch64::LDRXpre:
3990 case AArch64::STRDpost:
3991 case AArch64::STRDpre:
3992 case AArch64::STRXpost:
3993 case AArch64::STRXpre:
3994 Scale = TypeSize::getFixed(1);
3995 Width = TypeSize::getFixed(8);
3996 MinOffset = -256;
3997 MaxOffset = 255;
3998 break;
3999 case AArch64::STRWpost:
4000 case AArch64::STRWpre:
4001 case AArch64::LDRWpost:
4002 case AArch64::LDRWpre:
4003 case AArch64::STRSpost:
4004 case AArch64::STRSpre:
4005 case AArch64::LDRSpost:
4006 case AArch64::LDRSpre:
4007 Scale = TypeSize::getFixed(1);
4008 Width = TypeSize::getFixed(4);
4009 MinOffset = -256;
4010 MaxOffset = 255;
4011 break;
4012 case AArch64::LDRHpost:
4013 case AArch64::LDRHpre:
4014 case AArch64::STRHpost:
4015 case AArch64::STRHpre:
4016 case AArch64::LDRHHpost:
4017 case AArch64::LDRHHpre:
4018 case AArch64::STRHHpost:
4019 case AArch64::STRHHpre:
4020 Scale = TypeSize::getFixed(1);
4021 Width = TypeSize::getFixed(2);
4022 MinOffset = -256;
4023 MaxOffset = 255;
4024 break;
4025 case AArch64::LDRBpost:
4026 case AArch64::LDRBpre:
4027 case AArch64::STRBpost:
4028 case AArch64::STRBpre:
4029 case AArch64::LDRBBpost:
4030 case AArch64::LDRBBpre:
4031 case AArch64::STRBBpost:
4032 case AArch64::STRBBpre:
4033 Scale = TypeSize::getFixed(1);
4034 Width = TypeSize::getFixed(1);
4035 MinOffset = -256;
4036 MaxOffset = 255;
4037 break;
4038 // Unscaled
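// Unscaled forms take a signed 9-bit byte offset ([-256, 255]) regardless of
// the access size.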
4039 case AArch64::LDURQi:
4040 case AArch64::STURQi:
4041 Scale = TypeSize::getFixed(1);
4042 Width = TypeSize::getFixed(16);
4043 MinOffset = -256;
4044 MaxOffset = 255;
4045 break;
4046 case AArch64::LDURXi:
4047 case AArch64::LDURDi:
4048 case AArch64::LDAPURXi:
4049 case AArch64::STURXi:
4050 case AArch64::STURDi:
4051 case AArch64::STLURXi:
4052 case AArch64::PRFUMi:
4053 Scale = TypeSize::getFixed(1);
4054 Width = TypeSize::getFixed(8);
4055 MinOffset = -256;
4056 MaxOffset = 255;
4057 break;
4058 case AArch64::LDURWi:
4059 case AArch64::LDURSi:
4060 case AArch64::LDURSWi:
4061 case AArch64::LDAPURi:
4062 case AArch64::LDAPURSWi:
4063 case AArch64::STURWi:
4064 case AArch64::STURSi:
4065 case AArch64::STLURWi:
4066 Scale = TypeSize::getFixed(1);
4067 Width = TypeSize::getFixed(4);
4068 MinOffset = -256;
4069 MaxOffset = 255;
4070 break;
4071 case AArch64::LDURHi:
4072 case AArch64::LDURHHi:
4073 case AArch64::LDURSHXi:
4074 case AArch64::LDURSHWi:
4075 case AArch64::LDAPURHi:
4076 case AArch64::LDAPURSHWi:
4077 case AArch64::LDAPURSHXi:
4078 case AArch64::STURHi:
4079 case AArch64::STURHHi:
4080 case AArch64::STLURHi:
4081 Scale = TypeSize::getFixed(1);
4082 Width = TypeSize::getFixed(2);
4083 MinOffset = -256;
4084 MaxOffset = 255;
4085 break;
4086 case AArch64::LDURBi:
4087 case AArch64::LDURBBi:
4088 case AArch64::LDURSBXi:
4089 case AArch64::LDURSBWi:
4090 case AArch64::LDAPURBi:
4091 case AArch64::LDAPURSBWi:
4092 case AArch64::LDAPURSBXi:
4093 case AArch64::STURBi:
4094 case AArch64::STURBBi:
4095 case AArch64::STLURBi:
4096 Scale = TypeSize::getFixed(1);
4097 Width = TypeSize::getFixed(1);
4098 MinOffset = -256;
4099 MaxOffset = 255;
4100 break;
4101 // LDP / STP (including pre/post inc)
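// Paired forms use a signed 7-bit immediate scaled by the register size,
// e.g. LDPQi spans byte offsets -1024 to 1008 in steps of 16.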
4102 case AArch64::LDPQi:
4103 case AArch64::LDNPQi:
4104 case AArch64::STPQi:
4105 case AArch64::STNPQi:
4106 case AArch64::LDPQpost:
4107 case AArch64::LDPQpre:
4108 case AArch64::STPQpost:
4109 case AArch64::STPQpre:
4110 Scale = TypeSize::getFixed(16);
4111 Width = TypeSize::getFixed(16 * 2);
4112 MinOffset = -64;
4113 MaxOffset = 63;
4114 break;
4115 case AArch64::LDPXi:
4116 case AArch64::LDPDi:
4117 case AArch64::LDNPXi:
4118 case AArch64::LDNPDi:
4119 case AArch64::STPXi:
4120 case AArch64::STPDi:
4121 case AArch64::STNPXi:
4122 case AArch64::STNPDi:
4123 case AArch64::LDPDpost:
4124 case AArch64::LDPDpre:
4125 case AArch64::LDPXpost:
4126 case AArch64::LDPXpre:
4127 case AArch64::STPDpost:
4128 case AArch64::STPDpre:
4129 case AArch64::STPXpost:
4130 case AArch64::STPXpre:
4131 Scale = TypeSize::getFixed(8);
4132 Width = TypeSize::getFixed(8 * 2);
4133 MinOffset = -64;
4134 MaxOffset = 63;
4135 break;
4136 case AArch64::LDPWi:
4137 case AArch64::LDPSi:
4138 case AArch64::LDNPWi:
4139 case AArch64::LDNPSi:
4140 case AArch64::STPWi:
4141 case AArch64::STPSi:
4142 case AArch64::STNPWi:
4143 case AArch64::STNPSi:
4144 case AArch64::LDPSpost:
4145 case AArch64::LDPSpre:
4146 case AArch64::LDPWpost:
4147 case AArch64::LDPWpre:
4148 case AArch64::STPSpost:
4149 case AArch64::STPSpre:
4150 case AArch64::STPWpost:
4151 case AArch64::STPWpre:
4152 Scale = TypeSize::getFixed(4);
4153 Width = TypeSize::getFixed(4 * 2);
4154 MinOffset = -64;
4155 MaxOffset = 63;
4156 break;
4157 case AArch64::StoreSwiftAsyncContext:
4158 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4159 Scale = TypeSize::getFixed(1);
4160 Width = TypeSize::getFixed(8);
4161 MinOffset = 0;
4162 MaxOffset = 4095;
4163 break;
4164 case AArch64::ADDG:
4165 Scale = TypeSize::getFixed(16);
4166 Width = TypeSize::getFixed(0);
4167 MinOffset = 0;
4168 MaxOffset = 63;
4169 break;
4170 case AArch64::TAGPstack:
4171 Scale = TypeSize::getFixed(16);
4172 Width = TypeSize::getFixed(0);
4173 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4174 // of 63 (not 64!).
4175 MinOffset = -63;
4176 MaxOffset = 63;
4177 break;
4178 case AArch64::LDG:
4179 case AArch64::STGi:
4180 case AArch64::STGPreIndex:
4181 case AArch64::STGPostIndex:
4182 case AArch64::STZGi:
4183 case AArch64::STZGPreIndex:
4184 case AArch64::STZGPostIndex:
4185 Scale = TypeSize::getFixed(16);
4186 Width = TypeSize::getFixed(16);
4187 MinOffset = -256;
4188 MaxOffset = 255;
4189 break;
4190 // SVE
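// SVE vector/predicate fill-spill and structured load/store offsets are
// expressed in multiples of the vector length (`#imm, MUL VL`), hence the
// scalable Scale and Width values.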
4191 case AArch64::STR_ZZZZXI:
4192 case AArch64::LDR_ZZZZXI:
4193 Scale = TypeSize::getScalable(16);
4194 Width = TypeSize::getScalable(16 * 4);
4195 MinOffset = -256;
4196 MaxOffset = 252;
4197 break;
4198 case AArch64::STR_ZZZXI:
4199 case AArch64::LDR_ZZZXI:
4200 Scale = TypeSize::getScalable(16);
4201 Width = TypeSize::getScalable(16 * 3);
4202 MinOffset = -256;
4203 MaxOffset = 253;
4204 break;
4205 case AArch64::STR_ZZXI:
4206 case AArch64::LDR_ZZXI:
4207 Scale = TypeSize::getScalable(16);
4208 Width = TypeSize::getScalable(16 * 2);
4209 MinOffset = -256;
4210 MaxOffset = 254;
4211 break;
4212 case AArch64::LDR_PXI:
4213 case AArch64::STR_PXI:
4214 Scale = TypeSize::getScalable(2);
4215 Width = TypeSize::getScalable(2);
4216 MinOffset = -256;
4217 MaxOffset = 255;
4218 break;
4219 case AArch64::LDR_PPXI:
4220 case AArch64::STR_PPXI:
4221 Scale = TypeSize::getScalable(2);
4222 Width = TypeSize::getScalable(2 * 2);
4223 MinOffset = -256;
4224 MaxOffset = 254;
4225 break;
4226 case AArch64::LDR_ZXI:
4227 case AArch64::STR_ZXI:
4228 Scale = TypeSize::getScalable(16);
4229 Width = TypeSize::getScalable(16);
4230 MinOffset = -256;
4231 MaxOffset = 255;
4232 break;
4233 case AArch64::LD1B_IMM:
4234 case AArch64::LD1H_IMM:
4235 case AArch64::LD1W_IMM:
4236 case AArch64::LD1D_IMM:
4237 case AArch64::LDNT1B_ZRI:
4238 case AArch64::LDNT1H_ZRI:
4239 case AArch64::LDNT1W_ZRI:
4240 case AArch64::LDNT1D_ZRI:
4241 case AArch64::ST1B_IMM:
4242 case AArch64::ST1H_IMM:
4243 case AArch64::ST1W_IMM:
4244 case AArch64::ST1D_IMM:
4245 case AArch64::STNT1B_ZRI:
4246 case AArch64::STNT1H_ZRI:
4247 case AArch64::STNT1W_ZRI:
4248 case AArch64::STNT1D_ZRI:
4249 case AArch64::LDNF1B_IMM:
4250 case AArch64::LDNF1H_IMM:
4251 case AArch64::LDNF1W_IMM:
4252 case AArch64::LDNF1D_IMM:
4253 // A full vector's worth of data
4254 // Width = mbytes * elements
4255 Scale = TypeSize::getScalable(16);
4256 Width = TypeSize::getScalable(16);
4257 MinOffset = -8;
4258 MaxOffset = 7;
4259 break;
4260 case AArch64::LD2B_IMM:
4261 case AArch64::LD2H_IMM:
4262 case AArch64::LD2W_IMM:
4263 case AArch64::LD2D_IMM:
4264 case AArch64::ST2B_IMM:
4265 case AArch64::ST2H_IMM:
4266 case AArch64::ST2W_IMM:
4267 case AArch64::ST2D_IMM:
4268 Scale = TypeSize::getScalable(32);
4269 Width = TypeSize::getScalable(16 * 2);
4270 MinOffset = -8;
4271 MaxOffset = 7;
4272 break;
4273 case AArch64::LD3B_IMM:
4274 case AArch64::LD3H_IMM:
4275 case AArch64::LD3W_IMM:
4276 case AArch64::LD3D_IMM:
4277 case AArch64::ST3B_IMM:
4278 case AArch64::ST3H_IMM:
4279 case AArch64::ST3W_IMM:
4280 case AArch64::ST3D_IMM:
4281 Scale = TypeSize::getScalable(48);
4282 Width = TypeSize::getScalable(16 * 3);
4283 MinOffset = -8;
4284 MaxOffset = 7;
4285 break;
4286 case AArch64::LD4B_IMM:
4287 case AArch64::LD4H_IMM:
4288 case AArch64::LD4W_IMM:
4289 case AArch64::LD4D_IMM:
4290 case AArch64::ST4B_IMM:
4291 case AArch64::ST4H_IMM:
4292 case AArch64::ST4W_IMM:
4293 case AArch64::ST4D_IMM:
4294 Scale = TypeSize::getScalable(64);
4295 Width = TypeSize::getScalable(16 * 4);
4296 MinOffset = -8;
4297 MaxOffset = 7;
4298 break;
4299 case AArch64::LD1B_H_IMM:
4300 case AArch64::LD1SB_H_IMM:
4301 case AArch64::LD1H_S_IMM:
4302 case AArch64::LD1SH_S_IMM:
4303 case AArch64::LD1W_D_IMM:
4304 case AArch64::LD1SW_D_IMM:
4305 case AArch64::ST1B_H_IMM:
4306 case AArch64::ST1H_S_IMM:
4307 case AArch64::ST1W_D_IMM:
4308 case AArch64::LDNF1B_H_IMM:
4309 case AArch64::LDNF1SB_H_IMM:
4310 case AArch64::LDNF1H_S_IMM:
4311 case AArch64::LDNF1SH_S_IMM:
4312 case AArch64::LDNF1W_D_IMM:
4313 case AArch64::LDNF1SW_D_IMM:
4314 // A half vector's worth of data
4315 // Width = mbytes * elements
4316 Scale = TypeSize::getScalable(8);
4317 Width = TypeSize::getScalable(8);
4318 MinOffset = -8;
4319 MaxOffset = 7;
4320 break;
4321 case AArch64::LD1B_S_IMM:
4322 case AArch64::LD1SB_S_IMM:
4323 case AArch64::LD1H_D_IMM:
4324 case AArch64::LD1SH_D_IMM:
4325 case AArch64::ST1B_S_IMM:
4326 case AArch64::ST1H_D_IMM:
4327 case AArch64::LDNF1B_S_IMM:
4328 case AArch64::LDNF1SB_S_IMM:
4329 case AArch64::LDNF1H_D_IMM:
4330 case AArch64::LDNF1SH_D_IMM:
4331 // A quarter vector's worth of data
4332 // Width = mbytes * elements
4333 Scale = TypeSize::getScalable(4);
4334 Width = TypeSize::getScalable(4);
4335 MinOffset = -8;
4336 MaxOffset = 7;
4337 break;
4338 case AArch64::LD1B_D_IMM:
4339 case AArch64::LD1SB_D_IMM:
4340 case AArch64::ST1B_D_IMM:
4341 case AArch64::LDNF1B_D_IMM:
4342 case AArch64::LDNF1SB_D_IMM:
4343 // An eighth vector's worth of data
4344 // Width = mbytes * elements
4345 Scale = TypeSize::getScalable(2);
4346 Width = TypeSize::getScalable(2);
4347 MinOffset = -8;
4348 MaxOffset = 7;
4349 break;
4350 case AArch64::ST2Gi:
4351 case AArch64::ST2GPreIndex:
4352 case AArch64::ST2GPostIndex:
4353 case AArch64::STZ2Gi:
4354 case AArch64::STZ2GPreIndex:
4355 case AArch64::STZ2GPostIndex:
4356 Scale = TypeSize::getFixed(16);
4357 Width = TypeSize::getFixed(32);
4358 MinOffset = -256;
4359 MaxOffset = 255;
4360 break;
4361 case AArch64::STGPi:
4362 case AArch64::STGPpost:
4363 case AArch64::STGPpre:
4364 Scale = TypeSize::getFixed(16);
4365 Width = TypeSize::getFixed(16);
4366 MinOffset = -64;
4367 MaxOffset = 63;
4368 break;
4369 case AArch64::LD1RB_IMM:
4370 case AArch64::LD1RB_H_IMM:
4371 case AArch64::LD1RB_S_IMM:
4372 case AArch64::LD1RB_D_IMM:
4373 case AArch64::LD1RSB_H_IMM:
4374 case AArch64::LD1RSB_S_IMM:
4375 case AArch64::LD1RSB_D_IMM:
4376 Scale = TypeSize::getFixed(1);
4377 Width = TypeSize::getFixed(1);
4378 MinOffset = 0;
4379 MaxOffset = 63;
4380 break;
4381 case AArch64::LD1RH_IMM:
4382 case AArch64::LD1RH_S_IMM:
4383 case AArch64::LD1RH_D_IMM:
4384 case AArch64::LD1RSH_S_IMM:
4385 case AArch64::LD1RSH_D_IMM:
4386 Scale = TypeSize::getFixed(2);
4387 Width = TypeSize::getFixed(2);
4388 MinOffset = 0;
4389 MaxOffset = 63;
4390 break;
4391 case AArch64::LD1RW_IMM:
4392 case AArch64::LD1RW_D_IMM:
4393 case AArch64::LD1RSW_IMM:
4394 Scale = TypeSize::getFixed(4);
4395 Width = TypeSize::getFixed(4);
4396 MinOffset = 0;
4397 MaxOffset = 63;
4398 break;
4399 case AArch64::LD1RD_IMM:
4400 Scale = TypeSize::getFixed(8);
4401 Width = TypeSize::getFixed(8);
4402 MinOffset = 0;
4403 MaxOffset = 63;
4404 break;
4407 return true;
4410 // Scaling factor for unscaled load or store.
4411 int AArch64InstrInfo::getMemScale(unsigned Opc) {
4412 switch (Opc) {
4413 default:
4414 llvm_unreachable("Opcode has unknown scale!");
4415 case AArch64::LDRBBui:
4416 case AArch64::LDURBBi:
4417 case AArch64::LDRSBWui:
4418 case AArch64::LDURSBWi:
4419 case AArch64::STRBBui:
4420 case AArch64::STURBBi:
4421 return 1;
4422 case AArch64::LDRHHui:
4423 case AArch64::LDURHHi:
4424 case AArch64::LDRSHWui:
4425 case AArch64::LDURSHWi:
4426 case AArch64::STRHHui:
4427 case AArch64::STURHHi:
4428 return 2;
4429 case AArch64::LDRSui:
4430 case AArch64::LDURSi:
4431 case AArch64::LDRSpre:
4432 case AArch64::LDRSWui:
4433 case AArch64::LDURSWi:
4434 case AArch64::LDRSWpre:
4435 case AArch64::LDRWpre:
4436 case AArch64::LDRWui:
4437 case AArch64::LDURWi:
4438 case AArch64::STRSui:
4439 case AArch64::STURSi:
4440 case AArch64::STRSpre:
4441 case AArch64::STRWui:
4442 case AArch64::STURWi:
4443 case AArch64::STRWpre:
4444 case AArch64::LDPSi:
4445 case AArch64::LDPSWi:
4446 case AArch64::LDPWi:
4447 case AArch64::STPSi:
4448 case AArch64::STPWi:
4449 return 4;
4450 case AArch64::LDRDui:
4451 case AArch64::LDURDi:
4452 case AArch64::LDRDpre:
4453 case AArch64::LDRXui:
4454 case AArch64::LDURXi:
4455 case AArch64::LDRXpre:
4456 case AArch64::STRDui:
4457 case AArch64::STURDi:
4458 case AArch64::STRDpre:
4459 case AArch64::STRXui:
4460 case AArch64::STURXi:
4461 case AArch64::STRXpre:
4462 case AArch64::LDPDi:
4463 case AArch64::LDPXi:
4464 case AArch64::STPDi:
4465 case AArch64::STPXi:
4466 return 8;
4467 case AArch64::LDRQui:
4468 case AArch64::LDURQi:
4469 case AArch64::STRQui:
4470 case AArch64::STURQi:
4471 case AArch64::STRQpre:
4472 case AArch64::LDPQi:
4473 case AArch64::LDRQpre:
4474 case AArch64::STPQi:
4475 case AArch64::STGi:
4476 case AArch64::STZGi:
4477 case AArch64::ST2Gi:
4478 case AArch64::STZ2Gi:
4479 case AArch64::STGPi:
4480 return 16;
4484 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
4485 switch (MI.getOpcode()) {
4486 default:
4487 return false;
4488 case AArch64::LDRWpre:
4489 case AArch64::LDRXpre:
4490 case AArch64::LDRSWpre:
4491 case AArch64::LDRSpre:
4492 case AArch64::LDRDpre:
4493 case AArch64::LDRQpre:
4494 return true;
4498 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
4499 switch (MI.getOpcode()) {
4500 default:
4501 return false;
4502 case AArch64::STRWpre:
4503 case AArch64::STRXpre:
4504 case AArch64::STRSpre:
4505 case AArch64::STRDpre:
4506 case AArch64::STRQpre:
4507 return true;
4511 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
4512 return isPreLd(MI) || isPreSt(MI);
4515 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
4516 switch (MI.getOpcode()) {
4517 default:
4518 return false;
4519 case AArch64::LDPSi:
4520 case AArch64::LDPSWi:
4521 case AArch64::LDPDi:
4522 case AArch64::LDPQi:
4523 case AArch64::LDPWi:
4524 case AArch64::LDPXi:
4525 case AArch64::STPSi:
4526 case AArch64::STPDi:
4527 case AArch64::STPQi:
4528 case AArch64::STPWi:
4529 case AArch64::STPXi:
4530 case AArch64::STGPi:
4531 return true;
4535 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
4536 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
4537 unsigned Idx =
4538 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
4539 : 1;
4540 return MI.getOperand(Idx);
4543 const MachineOperand &
4544 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
4545 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
4546 unsigned Idx =
4547 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
4548 : 2;
4549 return MI.getOperand(Idx);
4552 const MachineOperand &
4553 AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
4554 switch (MI.getOpcode()) {
4555 default:
4556 llvm_unreachable("Unexpected opcode");
4557 case AArch64::LDRBroX:
4558 case AArch64::LDRBBroX:
4559 case AArch64::LDRSBXroX:
4560 case AArch64::LDRSBWroX:
4561 case AArch64::LDRHroX:
4562 case AArch64::LDRHHroX:
4563 case AArch64::LDRSHXroX:
4564 case AArch64::LDRSHWroX:
4565 case AArch64::LDRWroX:
4566 case AArch64::LDRSroX:
4567 case AArch64::LDRSWroX:
4568 case AArch64::LDRDroX:
4569 case AArch64::LDRXroX:
4570 case AArch64::LDRQroX:
4571 return MI.getOperand(4);
4575 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
4576 Register Reg) {
4577 if (MI.getParent() == nullptr)
4578 return nullptr;
4579 const MachineFunction *MF = MI.getParent()->getParent();
4580 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
4583 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
4584 auto IsHFPR = [&](const MachineOperand &Op) {
4585 if (!Op.isReg())
4586 return false;
4587 auto Reg = Op.getReg();
4588 if (Reg.isPhysical())
4589 return AArch64::FPR16RegClass.contains(Reg);
4590 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4591 return TRC == &AArch64::FPR16RegClass ||
4592 TRC == &AArch64::FPR16_loRegClass;
4594 return llvm::any_of(MI.operands(), IsHFPR);
4597 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
4598 auto IsQFPR = [&](const MachineOperand &Op) {
4599 if (!Op.isReg())
4600 return false;
4601 auto Reg = Op.getReg();
4602 if (Reg.isPhysical())
4603 return AArch64::FPR128RegClass.contains(Reg);
4604 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4605 return TRC == &AArch64::FPR128RegClass ||
4606 TRC == &AArch64::FPR128_loRegClass;
4608 return llvm::any_of(MI.operands(), IsQFPR);
4611 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
4612 switch (MI.getOpcode()) {
4613 case AArch64::BRK:
4614 case AArch64::HLT:
4615 case AArch64::PACIASP:
4616 case AArch64::PACIBSP:
4617 // Implicit BTI behavior.
4618 return true;
4619 case AArch64::PAUTH_PROLOGUE:
4620 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
4621 return true;
4622 case AArch64::HINT: {
4623 unsigned Imm = MI.getOperand(0).getImm();
4624 // Explicit BTI instruction.
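// (HINT #32/#34/#36/#38 encode BTI, BTI c, BTI j and BTI jc respectively.)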
4625 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
4626 return true;
4627 // PACI(A|B)SP instructions.
4628 if (Imm == 25 || Imm == 27)
4629 return true;
4630 return false;
4632 default:
4633 return false;
4637 bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
4638 if (Reg == 0)
4639 return false;
4640 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
4641 return AArch64::FPR128RegClass.contains(Reg) ||
4642 AArch64::FPR64RegClass.contains(Reg) ||
4643 AArch64::FPR32RegClass.contains(Reg) ||
4644 AArch64::FPR16RegClass.contains(Reg) ||
4645 AArch64::FPR8RegClass.contains(Reg);
4648 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
4649 auto IsFPR = [&](const MachineOperand &Op) {
4650 if (!Op.isReg())
4651 return false;
4652 auto Reg = Op.getReg();
4653 if (Reg.isPhysical())
4654 return isFpOrNEON(Reg);
4656 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4657 return TRC == &AArch64::FPR128RegClass ||
4658 TRC == &AArch64::FPR128_loRegClass ||
4659 TRC == &AArch64::FPR64RegClass ||
4660 TRC == &AArch64::FPR64_loRegClass ||
4661 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
4662 TRC == &AArch64::FPR8RegClass;
4664 return llvm::any_of(MI.operands(), IsFPR);
4667 // Scale the unscaled offset. Returns false if the unscaled offset can't be
4668 // scaled.
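// For example, an LDURXi byte offset of 24 becomes an element offset of 3,
// while an offset of 20 is rejected because it is not a multiple of 8.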
4669 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
4670 int Scale = AArch64InstrInfo::getMemScale(Opc);
4672 // If the byte-offset isn't a multiple of the stride, we can't scale this
4673 // offset.
4674 if (Offset % Scale != 0)
4675 return false;
4677 // Convert the byte-offset used by unscaled into an "element" offset used
4678 // by the scaled pair load/store instructions.
4679 Offset /= Scale;
4680 return true;
4683 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
4684 if (FirstOpc == SecondOpc)
4685 return true;
4686 // We can also pair sign-ext and zero-ext instructions.
4687 switch (FirstOpc) {
4688 default:
4689 return false;
4690 case AArch64::STRSui:
4691 case AArch64::STURSi:
4692 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
4693 case AArch64::STRDui:
4694 case AArch64::STURDi:
4695 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
4696 case AArch64::STRQui:
4697 case AArch64::STURQi:
4698 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
4699 case AArch64::STRWui:
4700 case AArch64::STURWi:
4701 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
4702 case AArch64::STRXui:
4703 case AArch64::STURXi:
4704 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
4705 case AArch64::LDRSui:
4706 case AArch64::LDURSi:
4707 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
4708 case AArch64::LDRDui:
4709 case AArch64::LDURDi:
4710 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
4711 case AArch64::LDRQui:
4712 case AArch64::LDURQi:
4713 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
4714 case AArch64::LDRWui:
4715 case AArch64::LDURWi:
4716 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
4717 case AArch64::LDRSWui:
4718 case AArch64::LDURSWi:
4719 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
4720 case AArch64::LDRXui:
4721 case AArch64::LDURXi:
4722 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
4724 // These instructions can't be paired based on their opcodes.
4725 return false;
4728 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
4729 int64_t Offset1, unsigned Opcode1, int FI2,
4730 int64_t Offset2, unsigned Opcode2) {
4731 // Accesses through fixed stack object frame indices may access a different
4732 // fixed stack slot. Check that the object offsets + offsets match.
4733 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
4734 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
4735 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
4736 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
4737 // Convert to scaled object offsets.
4738 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
4739 if (ObjectOffset1 % Scale1 != 0)
4740 return false;
4741 ObjectOffset1 /= Scale1;
4742 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
4743 if (ObjectOffset2 % Scale2 != 0)
4744 return false;
4745 ObjectOffset2 /= Scale2;
4746 ObjectOffset1 += Offset1;
4747 ObjectOffset2 += Offset2;
4748 return ObjectOffset1 + 1 == ObjectOffset2;
4751 return FI1 == FI2;
4754 /// Detect opportunities for ldp/stp formation.
4756 /// Only called for LdSt for which getMemOperandWithOffset returns true.
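/// For example, `ldr x0, [x1, #8]` followed by `ldr x2, [x1, #16]` has scaled
/// offsets 1 and 2 and, barring volatiles or pairing hints, is reported as
/// clusterable, making it a candidate for a later `ldp x0, x2, [x1, #8]`.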
4757 bool AArch64InstrInfo::shouldClusterMemOps(
4758 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
4759 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
4760 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
4761 unsigned NumBytes) const {
4762 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
4763 const MachineOperand &BaseOp1 = *BaseOps1.front();
4764 const MachineOperand &BaseOp2 = *BaseOps2.front();
4765 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
4766 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
4767 if (BaseOp1.getType() != BaseOp2.getType())
4768 return false;
4770 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
4771 "Only base registers and frame indices are supported.");
4773 // Check for both base regs and base FI.
4774 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
4775 return false;
4777 // Only cluster up to a single pair.
4778 if (ClusterSize > 2)
4779 return false;
4781 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
4782 return false;
4784 // Can we pair these instructions based on their opcodes?
4785 unsigned FirstOpc = FirstLdSt.getOpcode();
4786 unsigned SecondOpc = SecondLdSt.getOpcode();
4787 if (!canPairLdStOpc(FirstOpc, SecondOpc))
4788 return false;
4790 // Can't merge volatiles or load/stores that have a hint to avoid pair
4791 // formation, for example.
4792 if (!isCandidateToMergeOrPair(FirstLdSt) ||
4793 !isCandidateToMergeOrPair(SecondLdSt))
4794 return false;
4796 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
4797 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
4798 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
4799 return false;
4801 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
4802 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
4803 return false;
4805 // Pairwise instructions have a 7-bit signed offset field.
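// For LDPXi, for instance, this corresponds to byte offsets from -512 to 504.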
4806 if (Offset1 > 63 || Offset1 < -64)
4807 return false;
4809 // The caller should already have ordered FirstLdSt/SecondLdSt by offset,
4810 // except when the base operands are non-equal frame indices.
4811 if (BaseOp1.isFI()) {
4812 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
4813 "Caller should have ordered offsets.");
4815 const MachineFrameInfo &MFI =
4816 FirstLdSt.getParent()->getParent()->getFrameInfo();
4817 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
4818 BaseOp2.getIndex(), Offset2, SecondOpc);
4821 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
4823 return Offset1 + 1 == Offset2;
4826 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
4827 MCRegister Reg, unsigned SubIdx,
4828 unsigned State,
4829 const TargetRegisterInfo *TRI) {
4830 if (!SubIdx)
4831 return MIB.addReg(Reg, State);
4833 if (Reg.isPhysical())
4834 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
4835 return MIB.addReg(Reg, State, SubIdx);
4838 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
4839 unsigned NumRegs) {
4840 // We really want the positive remainder mod 32 here, which happens to be
4841 // easily obtainable with a mask.
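// For example, copying the tuple {D30, D31} to {D31, D0}: (31 - 30) & 0x1f is
// 1, which is < 2, so the caller copies the sub-registers in reverse order to
// avoid overwriting D31 before it has been read.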
4842 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
4845 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
4846 MachineBasicBlock::iterator I,
4847 const DebugLoc &DL, MCRegister DestReg,
4848 MCRegister SrcReg, bool KillSrc,
4849 unsigned Opcode,
4850 ArrayRef<unsigned> Indices) const {
4851 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
4852 const TargetRegisterInfo *TRI = &getRegisterInfo();
4853 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4854 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4855 unsigned NumRegs = Indices.size();
4857 int SubReg = 0, End = NumRegs, Incr = 1;
4858 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
4859 SubReg = NumRegs - 1;
4860 End = -1;
4861 Incr = -1;
4864 for (; SubReg != End; SubReg += Incr) {
4865 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4866 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4867 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
4868 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4872 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
4873 MachineBasicBlock::iterator I,
4874 const DebugLoc &DL, MCRegister DestReg,
4875 MCRegister SrcReg, bool KillSrc,
4876 unsigned Opcode, unsigned ZeroReg,
4877 llvm::ArrayRef<unsigned> Indices) const {
4878 const TargetRegisterInfo *TRI = &getRegisterInfo();
4879 unsigned NumRegs = Indices.size();
4881 #ifndef NDEBUG
4882 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4883 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4884 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
4885 "GPR reg sequences should not be able to overlap");
4886 #endif
4888 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
4889 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4890 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4891 MIB.addReg(ZeroReg);
4892 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4893 MIB.addImm(0);
4897 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
4898 MachineBasicBlock::iterator I,
4899 const DebugLoc &DL, MCRegister DestReg,
4900 MCRegister SrcReg, bool KillSrc,
4901 bool RenamableDest,
4902 bool RenamableSrc) const {
4903 if (AArch64::GPR32spRegClass.contains(DestReg) &&
4904 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
4905 const TargetRegisterInfo *TRI = &getRegisterInfo();
4907 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
4908 // If either operand is WSP, expand to ADD #0.
4909 if (Subtarget.hasZeroCycleRegMove()) {
4910 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
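// In that case the copy is emitted on the matching 64-bit super-registers,
// e.g. a copy of w1 into wsp becomes `add sp, x1, #0`.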
4911 MCRegister DestRegX = TRI->getMatchingSuperReg(
4912 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4913 MCRegister SrcRegX = TRI->getMatchingSuperReg(
4914 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4915 // This instruction is reading and writing X registers. This may upset
4916 // the register scavenger and machine verifier, so we need to indicate
4917 // that we are reading an undefined value from SrcRegX, but a proper
4918 // value from SrcReg.
4919 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
4920 .addReg(SrcRegX, RegState::Undef)
4921 .addImm(0)
4922 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
4923 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4924 } else {
4925 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
4926 .addReg(SrcReg, getKillRegState(KillSrc))
4927 .addImm(0)
4928 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4930 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
4931 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
4932 .addImm(0)
4933 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4934 } else {
4935 if (Subtarget.hasZeroCycleRegMove()) {
4936 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
4937 MCRegister DestRegX = TRI->getMatchingSuperReg(
4938 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4939 MCRegister SrcRegX = TRI->getMatchingSuperReg(
4940 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4941 // This instruction is reading and writing X registers. This may upset
4942 // the register scavenger and machine verifier, so we need to indicate
4943 // that we are reading an undefined value from SrcRegX, but a proper
4944 // value from SrcReg.
4945 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
4946 .addReg(AArch64::XZR)
4947 .addReg(SrcRegX, RegState::Undef)
4948 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4949 } else {
4950 // Otherwise, expand to ORR WZR.
4951 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
4952 .addReg(AArch64::WZR)
4953 .addReg(SrcReg, getKillRegState(KillSrc));
4956 return;
4959 // Copy a Predicate register by ORRing with itself.
4960 if (AArch64::PPRRegClass.contains(DestReg) &&
4961 AArch64::PPRRegClass.contains(SrcReg)) {
4962 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4963 "Unexpected SVE register.");
4964 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
4965 .addReg(SrcReg) // Pg
4966 .addReg(SrcReg)
4967 .addReg(SrcReg, getKillRegState(KillSrc));
4968 return;
4971 // Copy a predicate-as-counter register by ORRing with itself as if it
4972 // were a regular predicate (mask) register.
4973 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
4974 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
4975 if (DestIsPNR || SrcIsPNR) {
4976 auto ToPPR = [](MCRegister R) -> MCRegister {
4977 return (R - AArch64::PN0) + AArch64::P0;
4979 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg;
4980 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg;
4982 if (PPRSrcReg != PPRDestReg) {
4983 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
4984 .addReg(PPRSrcReg) // Pg
4985 .addReg(PPRSrcReg)
4986 .addReg(PPRSrcReg, getKillRegState(KillSrc));
4987 if (DestIsPNR)
4988 NewMI.addDef(DestReg, RegState::Implicit);
4990 return;
4993 // Copy a Z register by ORRing with itself.
4994 if (AArch64::ZPRRegClass.contains(DestReg) &&
4995 AArch64::ZPRRegClass.contains(SrcReg)) {
4996 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4997 "Unexpected SVE register.");
4998 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
4999 .addReg(SrcReg)
5000 .addReg(SrcReg, getKillRegState(KillSrc));
5001 return;
5004 // Copy a Z register pair by copying the individual sub-registers.
5005 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
5006 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
5007 (AArch64::ZPR2RegClass.contains(SrcReg) ||
5008 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
5009 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5010 "Unexpected SVE register.");
5011 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
5012 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5013 Indices);
5014 return;
5017 // Copy a Z register triple by copying the individual sub-registers.
5018 if (AArch64::ZPR3RegClass.contains(DestReg) &&
5019 AArch64::ZPR3RegClass.contains(SrcReg)) {
5020 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5021 "Unexpected SVE register.");
5022 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5023 AArch64::zsub2};
5024 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5025 Indices);
5026 return;
5029 // Copy a Z register quad by copying the individual sub-registers.
5030 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
5031 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
5032 (AArch64::ZPR4RegClass.contains(SrcReg) ||
5033 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
5034 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5035 "Unexpected SVE register.");
5036 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5037 AArch64::zsub2, AArch64::zsub3};
5038 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5039 Indices);
5040 return;
5043 if (AArch64::GPR64spRegClass.contains(DestReg) &&
5044 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
5045 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
5046 // If either operand is SP, expand to ADD #0.
5047 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
5048 .addReg(SrcReg, getKillRegState(KillSrc))
5049 .addImm(0)
5050 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5051 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
5052 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
5053 .addImm(0)
5054 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5055 } else {
5056 // Otherwise, expand to ORR XZR.
5057 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5058 .addReg(AArch64::XZR)
5059 .addReg(SrcReg, getKillRegState(KillSrc));
5061 return;
5064 // Copy a DDDD register quad by copying the individual sub-registers.
5065 if (AArch64::DDDDRegClass.contains(DestReg) &&
5066 AArch64::DDDDRegClass.contains(SrcReg)) {
5067 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5068 AArch64::dsub2, AArch64::dsub3};
5069 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5070 Indices);
5071 return;
5074 // Copy a DDD register triple by copying the individual sub-registers.
5075 if (AArch64::DDDRegClass.contains(DestReg) &&
5076 AArch64::DDDRegClass.contains(SrcReg)) {
5077 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5078 AArch64::dsub2};
5079 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5080 Indices);
5081 return;
5084 // Copy a DD register pair by copying the individual sub-registers.
5085 if (AArch64::DDRegClass.contains(DestReg) &&
5086 AArch64::DDRegClass.contains(SrcReg)) {
5087 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
5088 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5089 Indices);
5090 return;
5093 // Copy a QQQQ register quad by copying the individual sub-registers.
5094 if (AArch64::QQQQRegClass.contains(DestReg) &&
5095 AArch64::QQQQRegClass.contains(SrcReg)) {
5096 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5097 AArch64::qsub2, AArch64::qsub3};
5098 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5099 Indices);
5100 return;
5103 // Copy a QQQ register triple by copying the individual sub-registers.
5104 if (AArch64::QQQRegClass.contains(DestReg) &&
5105 AArch64::QQQRegClass.contains(SrcReg)) {
5106 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5107 AArch64::qsub2};
5108 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5109 Indices);
5110 return;
5113 // Copy a QQ register pair by copying the individual sub-registers.
5114 if (AArch64::QQRegClass.contains(DestReg) &&
5115 AArch64::QQRegClass.contains(SrcReg)) {
5116 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
5117 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5118 Indices);
5119 return;
5122 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
5123 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
5124 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
5125 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
5126 AArch64::XZR, Indices);
5127 return;
5130 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
5131 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
5132 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
5133 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
5134 AArch64::WZR, Indices);
5135 return;
5138 if (AArch64::FPR128RegClass.contains(DestReg) &&
5139 AArch64::FPR128RegClass.contains(SrcReg)) {
5140 if (Subtarget.isSVEorStreamingSVEAvailable() &&
5141 !Subtarget.isNeonAvailable())
5142 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
5143 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
5144 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
5145 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
5146 else if (Subtarget.isNeonAvailable())
5147 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
5148 .addReg(SrcReg)
5149 .addReg(SrcReg, getKillRegState(KillSrc));
5150 else {
5151 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
5152 .addReg(AArch64::SP, RegState::Define)
5153 .addReg(SrcReg, getKillRegState(KillSrc))
5154 .addReg(AArch64::SP)
5155 .addImm(-16);
5156 BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
5157 .addReg(AArch64::SP, RegState::Define)
5158 .addReg(DestReg, RegState::Define)
5159 .addReg(AArch64::SP)
5160 .addImm(16);
5162 return;
5165 if (AArch64::FPR64RegClass.contains(DestReg) &&
5166 AArch64::FPR64RegClass.contains(SrcReg)) {
5167 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
5168 .addReg(SrcReg, getKillRegState(KillSrc));
5169 return;
5172 if (AArch64::FPR32RegClass.contains(DestReg) &&
5173 AArch64::FPR32RegClass.contains(SrcReg)) {
5174 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5175 .addReg(SrcReg, getKillRegState(KillSrc));
5176 return;
5179 if (AArch64::FPR16RegClass.contains(DestReg) &&
5180 AArch64::FPR16RegClass.contains(SrcReg)) {
5181 DestReg =
5182 RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
5183 SrcReg =
5184 RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
5185 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5186 .addReg(SrcReg, getKillRegState(KillSrc));
5187 return;
5190 if (AArch64::FPR8RegClass.contains(DestReg) &&
5191 AArch64::FPR8RegClass.contains(SrcReg)) {
5192 DestReg =
5193 RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
5194 SrcReg =
5195 RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
5196 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5197 .addReg(SrcReg, getKillRegState(KillSrc));
5198 return;
5201 // Copies between GPR64 and FPR64.
5202 if (AArch64::FPR64RegClass.contains(DestReg) &&
5203 AArch64::GPR64RegClass.contains(SrcReg)) {
5204 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
5205 .addReg(SrcReg, getKillRegState(KillSrc));
5206 return;
5208 if (AArch64::GPR64RegClass.contains(DestReg) &&
5209 AArch64::FPR64RegClass.contains(SrcReg)) {
5210 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
5211 .addReg(SrcReg, getKillRegState(KillSrc));
5212 return;
5214 // Copies between GPR32 and FPR32.
5215 if (AArch64::FPR32RegClass.contains(DestReg) &&
5216 AArch64::GPR32RegClass.contains(SrcReg)) {
5217 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
5218 .addReg(SrcReg, getKillRegState(KillSrc));
5219 return;
5221 if (AArch64::GPR32RegClass.contains(DestReg) &&
5222 AArch64::FPR32RegClass.contains(SrcReg)) {
5223 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
5224 .addReg(SrcReg, getKillRegState(KillSrc));
5225 return;
5228 if (DestReg == AArch64::NZCV) {
5229 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
5230 BuildMI(MBB, I, DL, get(AArch64::MSR))
5231 .addImm(AArch64SysReg::NZCV)
5232 .addReg(SrcReg, getKillRegState(KillSrc))
5233 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
5234 return;
5237 if (SrcReg == AArch64::NZCV) {
5238 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
5239 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
5240 .addImm(AArch64SysReg::NZCV)
5241 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
5242 return;
5245 #ifndef NDEBUG
5246 const TargetRegisterInfo &TRI = getRegisterInfo();
5247 errs() << TRI.getRegAsmName(DestReg) << " = COPY "
5248 << TRI.getRegAsmName(SrcReg) << "\n";
5249 #endif
5250 llvm_unreachable("unimplemented reg-to-reg copy");
5253 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
5254 MachineBasicBlock &MBB,
5255 MachineBasicBlock::iterator InsertBefore,
5256 const MCInstrDesc &MCID,
5257 Register SrcReg, bool IsKill,
5258 unsigned SubIdx0, unsigned SubIdx1, int FI,
5259 MachineMemOperand *MMO) {
5260 Register SrcReg0 = SrcReg;
5261 Register SrcReg1 = SrcReg;
5262 if (SrcReg.isPhysical()) {
5263 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
5264 SubIdx0 = 0;
5265 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
5266 SubIdx1 = 0;
5268 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
5269 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
5270 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
5271 .addFrameIndex(FI)
5272 .addImm(0)
5273 .addMemOperand(MMO);
5276 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
5277 MachineBasicBlock::iterator MBBI,
5278 Register SrcReg, bool isKill, int FI,
5279 const TargetRegisterClass *RC,
5280 const TargetRegisterInfo *TRI,
5281 Register VReg,
5282 MachineInstr::MIFlag Flags) const {
5283 MachineFunction &MF = *MBB.getParent();
5284 MachineFrameInfo &MFI = MF.getFrameInfo();
5286 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
5287 MachineMemOperand *MMO =
5288 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
5289 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
5290 unsigned Opc = 0;
5291 bool Offset = true;
5292 MCRegister PNRReg = MCRegister::NoRegister;
5293 unsigned StackID = TargetStackID::Default;
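// For example, a GPR64 spill below selects STRXui, an FPR128 spill selects
// STRQui, and a ZPR spill selects STR_ZXI and additionally marks the frame
// index as a ScalableVector stack object.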
5294 switch (TRI->getSpillSize(*RC)) {
5295 case 1:
5296 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
5297 Opc = AArch64::STRBui;
5298 break;
5299 case 2: {
5300 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
5301 Opc = AArch64::STRHui;
5302 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
5303 AArch64::PPRRegClass.hasSubClassEq(RC)) {
5304 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5305 "Unexpected register store without SVE store instructions");
5306 Opc = AArch64::STR_PXI;
5307 StackID = TargetStackID::ScalableVector;
5309 break;
5311 case 4:
5312 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5313 Opc = AArch64::STRWui;
5314 if (SrcReg.isVirtual())
5315 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
5316 else
5317 assert(SrcReg != AArch64::WSP);
5318 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5319 Opc = AArch64::STRSui;
5320 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5321 Opc = AArch64::STR_PPXI;
5322 StackID = TargetStackID::ScalableVector;
5324 break;
5325 case 8:
5326 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5327 Opc = AArch64::STRXui;
5328 if (SrcReg.isVirtual())
5329 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5330 else
5331 assert(SrcReg != AArch64::SP);
5332 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5333 Opc = AArch64::STRDui;
5334 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5335 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
5336 get(AArch64::STPWi), SrcReg, isKill,
5337 AArch64::sube32, AArch64::subo32, FI, MMO);
5338 return;
5340 break;
5341 case 16:
5342 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5343 Opc = AArch64::STRQui;
5344 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5345 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5346 Opc = AArch64::ST1Twov1d;
5347 Offset = false;
5348 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5349 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
5350 get(AArch64::STPXi), SrcReg, isKill,
5351 AArch64::sube64, AArch64::subo64, FI, MMO);
5352 return;
5353 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5354 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5355 "Unexpected register store without SVE store instructions");
5356 Opc = AArch64::STR_ZXI;
5357 StackID = TargetStackID::ScalableVector;
5359 break;
5360 case 24:
5361 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5362 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5363 Opc = AArch64::ST1Threev1d;
5364 Offset = false;
5366 break;
5367 case 32:
5368 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5369 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5370 Opc = AArch64::ST1Fourv1d;
5371 Offset = false;
5372 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5373 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5374 Opc = AArch64::ST1Twov2d;
5375 Offset = false;
5376 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
5377 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5378 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5379 "Unexpected register store without SVE store instructions");
5380 Opc = AArch64::STR_ZZXI;
5381 StackID = TargetStackID::ScalableVector;
5383 break;
5384 case 48:
5385 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5386 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5387 Opc = AArch64::ST1Threev2d;
5388 Offset = false;
5389 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5390 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5391 "Unexpected register store without SVE store instructions");
5392 Opc = AArch64::STR_ZZZXI;
5393 StackID = TargetStackID::ScalableVector;
5395 break;
5396 case 64:
5397 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5398 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5399 Opc = AArch64::ST1Fourv2d;
5400 Offset = false;
5401 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
5402 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5403 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5404 "Unexpected register store without SVE store instructions");
5405 Opc = AArch64::STR_ZZZZXI;
5406 StackID = TargetStackID::ScalableVector;
5408 break;
5410 assert(Opc && "Unknown register class");
5411 MFI.setStackID(FI, StackID);
5413 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5414 .addReg(SrcReg, getKillRegState(isKill))
5415 .addFrameIndex(FI);
5417 if (Offset)
5418 MI.addImm(0);
5419 if (PNRReg.isValid())
5420 MI.addDef(PNRReg, RegState::Implicit);
5421 MI.addMemOperand(MMO);
5424 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
5425 MachineBasicBlock &MBB,
5426 MachineBasicBlock::iterator InsertBefore,
5427 const MCInstrDesc &MCID,
5428 Register DestReg, unsigned SubIdx0,
5429 unsigned SubIdx1, int FI,
5430 MachineMemOperand *MMO) {
5431 Register DestReg0 = DestReg;
5432 Register DestReg1 = DestReg;
5433 bool IsUndef = true;
5434 if (DestReg.isPhysical()) {
5435 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
5436 SubIdx0 = 0;
5437 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
5438 SubIdx1 = 0;
5439 IsUndef = false;
5441 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
5442 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
5443 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
5444 .addFrameIndex(FI)
5445 .addImm(0)
5446 .addMemOperand(MMO);
5449 void AArch64InstrInfo::loadRegFromStackSlot(
5450 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
5451 int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
5452 Register VReg, MachineInstr::MIFlag Flags) const {
5453 MachineFunction &MF = *MBB.getParent();
5454 MachineFrameInfo &MFI = MF.getFrameInfo();
5455 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
5456 MachineMemOperand *MMO =
5457 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
5458 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
5460 unsigned Opc = 0;
5461 bool Offset = true;
5462 unsigned StackID = TargetStackID::Default;
5463 Register PNRReg = MCRegister::NoRegister;
5464 switch (TRI->getSpillSize(*RC)) {
5465 case 1:
5466 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
5467 Opc = AArch64::LDRBui;
5468 break;
5469 case 2: {
5470 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
5471 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
5472 Opc = AArch64::LDRHui;
5473 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
5474 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5475 "Unexpected register load without SVE load instructions");
5476 if (IsPNR)
5477 PNRReg = DestReg;
5478 Opc = AArch64::LDR_PXI;
5479 StackID = TargetStackID::ScalableVector;
5481 break;
5483 case 4:
5484 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5485 Opc = AArch64::LDRWui;
5486 if (DestReg.isVirtual())
5487 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
5488 else
5489 assert(DestReg != AArch64::WSP);
5490 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5491 Opc = AArch64::LDRSui;
5492 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5493 Opc = AArch64::LDR_PPXI;
5494 StackID = TargetStackID::ScalableVector;
5496 break;
5497 case 8:
5498 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5499 Opc = AArch64::LDRXui;
5500 if (DestReg.isVirtual())
5501 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
5502 else
5503 assert(DestReg != AArch64::SP);
5504 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5505 Opc = AArch64::LDRDui;
5506 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5507 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5508 get(AArch64::LDPWi), DestReg, AArch64::sube32,
5509 AArch64::subo32, FI, MMO);
5510 return;
5512 break;
5513 case 16:
5514 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5515 Opc = AArch64::LDRQui;
5516 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5517 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5518 Opc = AArch64::LD1Twov1d;
5519 Offset = false;
5520 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5521 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5522 get(AArch64::LDPXi), DestReg, AArch64::sube64,
5523 AArch64::subo64, FI, MMO);
5524 return;
5525 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5526 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5527 "Unexpected register load without SVE load instructions");
5528 Opc = AArch64::LDR_ZXI;
5529 StackID = TargetStackID::ScalableVector;
5531 break;
5532 case 24:
5533 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5534 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5535 Opc = AArch64::LD1Threev1d;
5536 Offset = false;
5538 break;
5539 case 32:
5540 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5541 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5542 Opc = AArch64::LD1Fourv1d;
5543 Offset = false;
5544 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5545 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5546 Opc = AArch64::LD1Twov2d;
5547 Offset = false;
5548 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
5549 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5550 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5551 "Unexpected register load without SVE load instructions");
5552 Opc = AArch64::LDR_ZZXI;
5553 StackID = TargetStackID::ScalableVector;
5555 break;
5556 case 48:
5557 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5558 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5559 Opc = AArch64::LD1Threev2d;
5560 Offset = false;
5561 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5562 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5563 "Unexpected register load without SVE load instructions");
5564 Opc = AArch64::LDR_ZZZXI;
5565 StackID = TargetStackID::ScalableVector;
5567 break;
5568 case 64:
5569 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5570 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5571 Opc = AArch64::LD1Fourv2d;
5572 Offset = false;
5573 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
5574 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5575 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5576 "Unexpected register load without SVE load instructions");
5577 Opc = AArch64::LDR_ZZZZXI;
5578 StackID = TargetStackID::ScalableVector;
5580 break;
5583 assert(Opc && "Unknown register class");
5584 MFI.setStackID(FI, StackID);
5586 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5587 .addReg(DestReg, getDefRegState(true))
5588 .addFrameIndex(FI);
5589 if (Offset)
5590 MI.addImm(0);
5591 if (PNRReg.isValid() && !PNRReg.isVirtual())
5592 MI.addDef(PNRReg, RegState::Implicit);
5593 MI.addMemOperand(MMO);
5596 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
5597 const MachineInstr &UseMI,
5598 const TargetRegisterInfo *TRI) {
5599 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
5600 UseMI.getIterator()),
5601 [TRI](const MachineInstr &I) {
5602 return I.modifiesRegister(AArch64::NZCV, TRI) ||
5603 I.readsRegister(AArch64::NZCV, TRI);
5607 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5608 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
5609 // The smallest scalable element supported by scaled SVE addressing
5610 // modes is a predicate, which is 2 scalable bytes in size. So the scalable
5611 // byte offset must always be a multiple of 2.
5612 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5614 // VGSized offsets are divided by '2', because the VG register is the
5615 // number of 64-bit granules as opposed to 128-bit vector chunks,
5616 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
5617 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
5618 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
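// For example, a StackOffset of { Fixed = 16, Scalable = 8 }, i.e.
// 16 + 8 * vscale bytes, decomposes into ByteSized = 16 and VGSized = 4:
// with VG = 2 * vscale, an offset of 4 * VG covers exactly 8 * vscale bytes.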
5619 ByteSized = Offset.getFixed();
5620 VGSized = Offset.getScalable() / 2;
5623 /// Returns the offset in parts to which this frame offset can be
5624 /// decomposed for the purpose of describing a frame offset.
5625 /// For non-scalable offsets this is simply its byte size.
5626 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5627 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
5628 int64_t &NumDataVectors) {
5629 // The smallest scalable element supported by scaled SVE addressing
5630 // modes is a predicate, which is 2 scalable bytes in size. So the scalable
5631 // byte offset must always be a multiple of 2.
5632 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5634 NumBytes = Offset.getFixed();
5635 NumDataVectors = 0;
5636 NumPredicateVectors = Offset.getScalable() / 2;
5637 // This method is used to get the offsets to adjust the frame offset.
5638 // If the function requires ADDPL to be used and needs more than two ADDPL
5639 // instructions, part of the offset is folded into NumDataVectors so that it
5640 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
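// For example, a purely scalable offset of 160 bytes initially gives
// NumPredicateVectors = 80; since 80 % 8 == 0 it is folded into
// NumDataVectors = 10 and NumPredicateVectors = 0, so a single ADDVL can be
// used instead of several ADDPLs.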
5641 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
5642 NumPredicateVectors > 62) {
5643 NumDataVectors = NumPredicateVectors / 8;
5644 NumPredicateVectors -= NumDataVectors * 8;
5648 // Convenience function to create a DWARF expression for
5649 // Expr + NumBytes + NumVGScaledBytes * AArch64::VG
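// For instance, NumBytes = -16 and NumVGScaledBytes = -8 append the DWARF
// operations
//   DW_OP_consts -16, DW_OP_plus,
//   DW_OP_consts -8, DW_OP_bregx VG 0, DW_OP_mul, DW_OP_plus
// and the comment text " - 16 - 8 * VG".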
5650 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
5651 int NumVGScaledBytes, unsigned VG,
5652 llvm::raw_string_ostream &Comment) {
5653 uint8_t buffer[16];
5655 if (NumBytes) {
5656 Expr.push_back(dwarf::DW_OP_consts);
5657 Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
5658 Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5659 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
5662 if (NumVGScaledBytes) {
5663 Expr.push_back((uint8_t)dwarf::DW_OP_consts);
5664 Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
5666 Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
5667 Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
5668 Expr.push_back(0);
5670 Expr.push_back((uint8_t)dwarf::DW_OP_mul);
5671 Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5673 Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
5674 << std::abs(NumVGScaledBytes) << " * VG";
5678 // Creates an MCCFIInstruction:
5679 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
5680 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
5681 unsigned Reg,
5682 const StackOffset &Offset) {
5683 int64_t NumBytes, NumVGScaledBytes;
5684 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
5685 NumVGScaledBytes);
5686 std::string CommentBuffer;
5687 llvm::raw_string_ostream Comment(CommentBuffer);
5689 if (Reg == AArch64::SP)
5690 Comment << "sp";
5691 else if (Reg == AArch64::FP)
5692 Comment << "fp";
5693 else
5694 Comment << printReg(Reg, &TRI);
5696 // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
5697 SmallString<64> Expr;
5698 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5699 Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
5700 Expr.push_back(0);
5701 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
5702 TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5704 // Wrap this into DW_CFA_def_cfa.
5705 SmallString<64> DefCfaExpr;
5706 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
5707 uint8_t buffer[16];
5708 DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
5709 DefCfaExpr.append(Expr.str());
5710 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
5711 Comment.str());
5714 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
5715 unsigned FrameReg, unsigned Reg,
5716 const StackOffset &Offset,
5717 bool LastAdjustmentWasScalable) {
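// To sketch the three outcomes below: a purely fixed offset of 32 with
// FrameReg == Reg (and no preceding scalable adjustment) yields
// ".cfi_def_cfa_offset 32"; the same offset against another Reg such as FP
// yields ".cfi_def_cfa w29, 32"; any scalable component instead produces the
// DW_CFA_def_cfa_expression escape built by createDefCFAExpression.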
5718 if (Offset.getScalable())
5719 return createDefCFAExpression(TRI, Reg, Offset);
5721 if (FrameReg == Reg && !LastAdjustmentWasScalable)
5722 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
5724 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5725 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
5728 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
5729 unsigned Reg,
5730 const StackOffset &OffsetFromDefCFA) {
5731 int64_t NumBytes, NumVGScaledBytes;
5732 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5733 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
5735 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5737 // Non-scalable offsets can use DW_CFA_offset directly.
5738 if (!NumVGScaledBytes)
5739 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
5741 std::string CommentBuffer;
5742 llvm::raw_string_ostream Comment(CommentBuffer);
5743 Comment << printReg(Reg, &TRI) << " @ cfa";
5745 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
5746 SmallString<64> OffsetExpr;
5747 appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
5748 TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5750 // Wrap this into DW_CFA_expression
5751 SmallString<64> CfaExpr;
5752 CfaExpr.push_back(dwarf::DW_CFA_expression);
5753 uint8_t buffer[16];
5754 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
5755 CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
5756 CfaExpr.append(OffsetExpr.str());
5758 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
5759 Comment.str());
5762 // Helper function to emit a frame offset adjustment from a given
5763 // pointer (SrcReg) into DestReg. The function is explicit in that the
5764 // caller must supply the exact add/sub opcode to use.
5765 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
5766 MachineBasicBlock::iterator MBBI,
5767 const DebugLoc &DL, unsigned DestReg,
5768 unsigned SrcReg, int64_t Offset, unsigned Opc,
5769 const TargetInstrInfo *TII,
5770 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
5771 bool *HasWinCFI, bool EmitCFAOffset,
5772 StackOffset CFAOffset, unsigned FrameReg) {
5773 int Sign = 1;
5774 unsigned MaxEncoding, ShiftSize;
5775 switch (Opc) {
5776 case AArch64::ADDXri:
5777 case AArch64::ADDSXri:
5778 case AArch64::SUBXri:
5779 case AArch64::SUBSXri:
5780 MaxEncoding = 0xfff;
5781 ShiftSize = 12;
5782 break;
5783 case AArch64::ADDVL_XXI:
5784 case AArch64::ADDPL_XXI:
5785 case AArch64::ADDSVL_XXI:
5786 case AArch64::ADDSPL_XXI:
5787 MaxEncoding = 31;
5788 ShiftSize = 0;
5789 if (Offset < 0) {
5790 MaxEncoding = 32;
5791 Sign = -1;
5792 Offset = -Offset;
5794 break;
5795 default:
5796 llvm_unreachable("Unsupported opcode");
5799 // `Offset` can be in bytes or in "scalable bytes".
5800 int VScale = 1;
5801 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
5802 VScale = 16;
5803 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
5804 VScale = 2;
5806 // FIXME: If the offset won't fit in 24 bits, compute the offset into a
5807 // scratch register. If DestReg is a virtual register, use it as the
5808 // scratch register; otherwise, create a new virtual register (to be
5809 // replaced by the scavenger at the end of PEI). That case can be optimized
5810 // slightly if DestReg is SP, which is always 16-byte aligned, so the scratch
5811 // register can be loaded with offset%8 and the add/sub can use an extending
5812 // instruction with LSL#3.
5813 // Currently the function handles any offsets but generates a poor sequence
5814 // of code.
5815 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
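// As a worked example for the ADDXri path: Offset = 0x12345 is emitted as
//   ADD DestReg, SrcReg,  #0x12, LSL #12   ; consumes 0x12000
//   ADD DestReg, DestReg, #0x345
// since each iteration of the loop below encodes at most
// MaxEncoding << ShiftSize of the remaining offset, accumulating into
// DestReg (or into a scratch vreg when DestReg is XZR).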
5817 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
5818 Register TmpReg = DestReg;
5819 if (TmpReg == AArch64::XZR)
5820 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
5821 &AArch64::GPR64RegClass);
5822 do {
5823 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
5824 unsigned LocalShiftSize = 0;
5825 if (ThisVal > MaxEncoding) {
5826 ThisVal = ThisVal >> ShiftSize;
5827 LocalShiftSize = ShiftSize;
5829 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
5830 "Encoding cannot handle value that big");
5832 Offset -= ThisVal << LocalShiftSize;
5833 if (Offset == 0)
5834 TmpReg = DestReg;
5835 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
5836 .addReg(SrcReg)
5837 .addImm(Sign * (int)ThisVal);
5838 if (ShiftSize)
5839 MBI = MBI.addImm(
5840 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
5841 MBI = MBI.setMIFlag(Flag);
5843 auto Change =
5844 VScale == 1
5845 ? StackOffset::getFixed(ThisVal << LocalShiftSize)
5846 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
5847 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
5848 CFAOffset += Change;
5849 else
5850 CFAOffset -= Change;
5851 if (EmitCFAOffset && DestReg == TmpReg) {
5852 MachineFunction &MF = *MBB.getParent();
5853 const TargetSubtargetInfo &STI = MF.getSubtarget();
5854 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
5856 unsigned CFIIndex = MF.addFrameInst(
5857 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
5858 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
5859 .addCFIIndex(CFIIndex)
5860 .setMIFlags(Flag);
5863 if (NeedsWinCFI) {
5864 assert(Sign == 1 && "SEH directives should always have a positive sign");
5865 int Imm = (int)(ThisVal << LocalShiftSize);
5866 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
5867 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
5868 if (HasWinCFI)
5869 *HasWinCFI = true;
5870 if (Imm == 0)
5871 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
5872 else
5873 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
5874 .addImm(Imm)
5875 .setMIFlag(Flag);
5876 assert(Offset == 0 && "Expected remaining offset to be zero to "
5877 "emit a single SEH directive");
5878 } else if (DestReg == AArch64::SP) {
5879 if (HasWinCFI)
5880 *HasWinCFI = true;
5881 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
5882 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
5883 .addImm(Imm)
5884 .setMIFlag(Flag);
5888 SrcReg = TmpReg;
5889 } while (Offset);
5892 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
5893 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
5894 unsigned DestReg, unsigned SrcReg,
5895 StackOffset Offset, const TargetInstrInfo *TII,
5896 MachineInstr::MIFlag Flag, bool SetNZCV,
5897 bool NeedsWinCFI, bool *HasWinCFI,
5898 bool EmitCFAOffset, StackOffset CFAOffset,
5899 unsigned FrameReg) {
5900 // If a function is marked as arm_locally_streaming, then the runtime value of
5901 // vscale in the prologue/epilogue is different from the runtime value of vscale
5902 // in the function's body. To avoid having to consider multiple vscales,
5903 // we can use `addsvl` to allocate any scalable stack-slots, which under
5904 // most circumstances will be only locals, not callee-save slots.
5905 const Function &F = MBB.getParent()->getFunction();
5906 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
5908 int64_t Bytes, NumPredicateVectors, NumDataVectors;
5909 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5910 Offset, Bytes, NumPredicateVectors, NumDataVectors);
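// For example, an Offset of { Fixed = 32, Scalable = 48 } decomposes into
// Bytes = 32 and NumDataVectors = 3 and is emitted below as
//   ADD   DestReg, SrcReg,  #32
//   ADDVL DestReg, DestReg, #3
// (ADDSVL replaces ADDVL when UseSVL is set).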
5912 // First emit non-scalable frame offsets, or a simple 'mov'.
5913 if (Bytes || (!Offset && SrcReg != DestReg)) {
5914 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
5915 "SP increment/decrement not 8-byte aligned");
5916 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
5917 if (Bytes < 0) {
5918 Bytes = -Bytes;
5919 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
5921 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
5922 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
5923 FrameReg);
5924 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
5925 ? StackOffset::getFixed(-Bytes)
5926 : StackOffset::getFixed(Bytes);
5927 SrcReg = DestReg;
5928 FrameReg = DestReg;
5931 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
5932 "SetNZCV not supported with SVE vectors");
5933 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
5934 "WinCFI not supported with SVE vectors");
5936 if (NumDataVectors) {
5937 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
5938 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
5939 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5940 CFAOffset, FrameReg);
5941 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
5942 SrcReg = DestReg;
5945 if (NumPredicateVectors) {
5946 assert(DestReg != AArch64::SP && "Unaligned access to SP");
5947 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
5948 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
5949 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5950 CFAOffset, FrameReg);
5954 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
5955 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
5956 MachineBasicBlock::iterator InsertPt, int FrameIndex,
5957 LiveIntervals *LIS, VirtRegMap *VRM) const {
5958 // This is a bit of a hack. Consider this instruction:
5960 // %0 = COPY %sp; GPR64all:%0
5962 // We explicitly chose GPR64all for the virtual register so such a copy might
5963 // be eliminated by RegisterCoalescer. However, that may not be possible, and
5964 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
5965 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
5967 // To prevent that, we are going to constrain the %0 register class here.
5968 if (MI.isFullCopy()) {
5969 Register DstReg = MI.getOperand(0).getReg();
5970 Register SrcReg = MI.getOperand(1).getReg();
5971 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
5972 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
5973 return nullptr;
5975 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
5976 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5977 return nullptr;
5979 // Nothing can be folded with a copy from/to NZCV.
5980 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
5981 return nullptr;
5984 // Handle the case where a copy is being spilled or filled but the source
5985 // and destination register class don't match. For example:
5987 // %0 = COPY %xzr; GPR64common:%0
5989 // In this case we can still safely fold away the COPY and generate the
5990 // following spill code:
5992 // STRXui %xzr, %stack.0
5994 // This also eliminates spilled cross register class COPYs (e.g. between x and
5995 // d regs) of the same size. For example:
5997 // %0 = COPY %1; GPR64:%0, FPR64:%1
5999 // will be filled as
6001 // LDRDui %0, fi<#0>
6003 // instead of
6005 // LDRXui %Temp, fi<#0>
6006 // %0 = FMOV %Temp
6008 if (MI.isCopy() && Ops.size() == 1 &&
6009 // Make sure we're only folding the explicit COPY defs/uses.
6010 (Ops[0] == 0 || Ops[0] == 1)) {
6011 bool IsSpill = Ops[0] == 0;
6012 bool IsFill = !IsSpill;
6013 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
6014 const MachineRegisterInfo &MRI = MF.getRegInfo();
6015 MachineBasicBlock &MBB = *MI.getParent();
6016 const MachineOperand &DstMO = MI.getOperand(0);
6017 const MachineOperand &SrcMO = MI.getOperand(1);
6018 Register DstReg = DstMO.getReg();
6019 Register SrcReg = SrcMO.getReg();
6020 // This is slightly expensive to compute for physical regs since
6021 // getMinimalPhysRegClass is slow.
6022 auto getRegClass = [&](unsigned Reg) {
6023 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
6024 : TRI.getMinimalPhysRegClass(Reg);
6027 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
6028 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
6029 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
6030 "Mismatched register size in non subreg COPY");
6031 if (IsSpill)
6032 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
6033 getRegClass(SrcReg), &TRI, Register());
6034 else
6035 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
6036 getRegClass(DstReg), &TRI, Register());
6037 return &*--InsertPt;
6040 // Handle cases like spilling def of:
6042 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
6044 // where the physical register source can be widened and stored to the full
6045 // virtual reg destination stack slot, in this case producing:
6047 // STRXui %xzr, %stack.0
6049 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
6050 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
6051 assert(SrcMO.getSubReg() == 0 &&
6052 "Unexpected subreg on physical register");
6053 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
6054 FrameIndex, &AArch64::GPR64RegClass, &TRI,
6055 Register());
6056 return &*--InsertPt;
6059 // Handle cases like filling use of:
6061 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
6063 // where we can load the full virtual reg source stack slot into the subreg
6064 // destination, in this case producing:
6066 // LDRWui %0:sub_32<def,read-undef>, %stack.0
6068 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
6069 const TargetRegisterClass *FillRC;
6070 switch (DstMO.getSubReg()) {
6071 default:
6072 FillRC = nullptr;
6073 break;
6074 case AArch64::sub_32:
6075 FillRC = &AArch64::GPR32RegClass;
6076 break;
6077 case AArch64::ssub:
6078 FillRC = &AArch64::FPR32RegClass;
6079 break;
6080 case AArch64::dsub:
6081 FillRC = &AArch64::FPR64RegClass;
6082 break;
6085 if (FillRC) {
6086 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
6087 TRI.getRegSizeInBits(*FillRC) &&
6088 "Mismatched regclass size on folded subreg COPY");
6089 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
6090 Register());
6091 MachineInstr &LoadMI = *--InsertPt;
6092 MachineOperand &LoadDst = LoadMI.getOperand(0);
6093 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
6094 LoadDst.setSubReg(DstMO.getSubReg());
6095 LoadDst.setIsUndef();
6096 return &LoadMI;
6101 // Cannot fold.
6102 return nullptr;
6105 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
6106 StackOffset &SOffset,
6107 bool *OutUseUnscaledOp,
6108 unsigned *OutUnscaledOp,
6109 int64_t *EmittableOffset) {
6110 // Set output values in case of early exit.
6111 if (EmittableOffset)
6112 *EmittableOffset = 0;
6113 if (OutUseUnscaledOp)
6114 *OutUseUnscaledOp = false;
6115 if (OutUnscaledOp)
6116 *OutUnscaledOp = 0;
6118 // Exit early for structured vector spills/fills as they can't take an
6119 // immediate offset.
6120 switch (MI.getOpcode()) {
6121 default:
6122 break;
6123 case AArch64::LD1Rv1d:
6124 case AArch64::LD1Rv2s:
6125 case AArch64::LD1Rv2d:
6126 case AArch64::LD1Rv4h:
6127 case AArch64::LD1Rv4s:
6128 case AArch64::LD1Rv8b:
6129 case AArch64::LD1Rv8h:
6130 case AArch64::LD1Rv16b:
6131 case AArch64::LD1Twov2d:
6132 case AArch64::LD1Threev2d:
6133 case AArch64::LD1Fourv2d:
6134 case AArch64::LD1Twov1d:
6135 case AArch64::LD1Threev1d:
6136 case AArch64::LD1Fourv1d:
6137 case AArch64::ST1Twov2d:
6138 case AArch64::ST1Threev2d:
6139 case AArch64::ST1Fourv2d:
6140 case AArch64::ST1Twov1d:
6141 case AArch64::ST1Threev1d:
6142 case AArch64::ST1Fourv1d:
6143 case AArch64::ST1i8:
6144 case AArch64::ST1i16:
6145 case AArch64::ST1i32:
6146 case AArch64::ST1i64:
6147 case AArch64::IRG:
6148 case AArch64::IRGstack:
6149 case AArch64::STGloop:
6150 case AArch64::STZGloop:
6151 return AArch64FrameOffsetCannotUpdate;
6154 // Get the min/max offset and the scale.
6155 TypeSize ScaleValue(0U, false), Width(0U, false);
6156 int64_t MinOff, MaxOff;
6157 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
6158 MaxOff))
6159 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6161 // Construct the complete offset.
6162 bool IsMulVL = ScaleValue.isScalable();
6163 unsigned Scale = ScaleValue.getKnownMinValue();
6164 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
6166 const MachineOperand &ImmOpnd =
6167 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
6168 Offset += ImmOpnd.getImm() * Scale;
6170 // If the offset doesn't match the scale, we rewrite the instruction to
6171 // use the unscaled instruction instead; likewise if the offset is negative
6172 // and an unscaled op is available.
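// To illustrate the rescaling: for STRXui (scale 8, unsigned 12-bit
// immediate) with an incoming fixed offset of 20 and an encoded immediate of
// 1, the combined byte offset is 28. That is not a multiple of 8, so the
// unscaled STURXi (scale 1, range [-256, 255]) is chosen, the emittable
// offset becomes 28, and the residual SOffset is zero, i.e. the result is
// AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal.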
6173 std::optional<unsigned> UnscaledOp =
6174 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
6175 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
6176 if (useUnscaledOp &&
6177 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
6178 MaxOff))
6179 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6181 Scale = ScaleValue.getKnownMinValue();
6182 assert(IsMulVL == ScaleValue.isScalable() &&
6183 "Unscaled opcode has different value for scalable");
6185 int64_t Remainder = Offset % Scale;
6186 assert(!(Remainder && useUnscaledOp) &&
6187 "Cannot have remainder when using unscaled op");
6189 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
6190 int64_t NewOffset = Offset / Scale;
6191 if (MinOff <= NewOffset && NewOffset <= MaxOff)
6192 Offset = Remainder;
6193 else {
6194 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
6195 Offset = Offset - (NewOffset * Scale);
6198 if (EmittableOffset)
6199 *EmittableOffset = NewOffset;
6200 if (OutUseUnscaledOp)
6201 *OutUseUnscaledOp = useUnscaledOp;
6202 if (OutUnscaledOp && UnscaledOp)
6203 *OutUnscaledOp = *UnscaledOp;
6205 if (IsMulVL)
6206 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
6207 else
6208 SOffset = StackOffset::get(Offset, SOffset.getScalable());
6209 return AArch64FrameOffsetCanUpdate |
6210 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
6213 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
6214 unsigned FrameReg, StackOffset &Offset,
6215 const AArch64InstrInfo *TII) {
6216 unsigned Opcode = MI.getOpcode();
6217 unsigned ImmIdx = FrameRegIdx + 1;
6219 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
6220 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
6221 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
6222 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
6223 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
6224 MI.eraseFromParent();
6225 Offset = StackOffset();
6226 return true;
6229 int64_t NewOffset;
6230 unsigned UnscaledOp;
6231 bool UseUnscaledOp;
6232 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
6233 &UnscaledOp, &NewOffset);
6234 if (Status & AArch64FrameOffsetCanUpdate) {
6235 if (Status & AArch64FrameOffsetIsLegal)
6236 // Replace the FrameIndex with FrameReg.
6237 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
6238 if (UseUnscaledOp)
6239 MI.setDesc(TII->get(UnscaledOp));
6241 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
6242 return !Offset;
6245 return false;
6248 void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
6249 MachineBasicBlock::iterator MI) const {
6250 DebugLoc DL;
6251 BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
6254 MCInst AArch64InstrInfo::getNop() const {
6255 return MCInstBuilder(AArch64::HINT).addImm(0);
6258 // AArch64 supports MachineCombiner.
6259 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
6261 // True when Opc sets flag
6262 static bool isCombineInstrSettingFlag(unsigned Opc) {
6263 switch (Opc) {
6264 case AArch64::ADDSWrr:
6265 case AArch64::ADDSWri:
6266 case AArch64::ADDSXrr:
6267 case AArch64::ADDSXri:
6268 case AArch64::SUBSWrr:
6269 case AArch64::SUBSXrr:
6270 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
6271 case AArch64::SUBSWri:
6272 case AArch64::SUBSXri:
6273 return true;
6274 default:
6275 break;
6277 return false;
6280 // 32b Opcodes that can be combined with a MUL
6281 static bool isCombineInstrCandidate32(unsigned Opc) {
6282 switch (Opc) {
6283 case AArch64::ADDWrr:
6284 case AArch64::ADDWri:
6285 case AArch64::SUBWrr:
6286 case AArch64::ADDSWrr:
6287 case AArch64::ADDSWri:
6288 case AArch64::SUBSWrr:
6289 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
6290 case AArch64::SUBWri:
6291 case AArch64::SUBSWri:
6292 return true;
6293 default:
6294 break;
6296 return false;
6299 // 64b Opcodes that can be combined with a MUL
6300 static bool isCombineInstrCandidate64(unsigned Opc) {
6301 switch (Opc) {
6302 case AArch64::ADDXrr:
6303 case AArch64::ADDXri:
6304 case AArch64::SUBXrr:
6305 case AArch64::ADDSXrr:
6306 case AArch64::ADDSXri:
6307 case AArch64::SUBSXrr:
6308 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
6309 case AArch64::SUBXri:
6310 case AArch64::SUBSXri:
6311 case AArch64::ADDv8i8:
6312 case AArch64::ADDv16i8:
6313 case AArch64::ADDv4i16:
6314 case AArch64::ADDv8i16:
6315 case AArch64::ADDv2i32:
6316 case AArch64::ADDv4i32:
6317 case AArch64::SUBv8i8:
6318 case AArch64::SUBv16i8:
6319 case AArch64::SUBv4i16:
6320 case AArch64::SUBv8i16:
6321 case AArch64::SUBv2i32:
6322 case AArch64::SUBv4i32:
6323 return true;
6324 default:
6325 break;
6327 return false;
6330 // FP Opcodes that can be combined with a FMUL.
6331 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
6332 switch (Inst.getOpcode()) {
6333 default:
6334 break;
6335 case AArch64::FADDHrr:
6336 case AArch64::FADDSrr:
6337 case AArch64::FADDDrr:
6338 case AArch64::FADDv4f16:
6339 case AArch64::FADDv8f16:
6340 case AArch64::FADDv2f32:
6341 case AArch64::FADDv2f64:
6342 case AArch64::FADDv4f32:
6343 case AArch64::FSUBHrr:
6344 case AArch64::FSUBSrr:
6345 case AArch64::FSUBDrr:
6346 case AArch64::FSUBv4f16:
6347 case AArch64::FSUBv8f16:
6348 case AArch64::FSUBv2f32:
6349 case AArch64::FSUBv2f64:
6350 case AArch64::FSUBv4f32:
6351 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
6352 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
6353 // the target options or if FADD/FSUB has the contract fast-math flag.
6354 return Options.UnsafeFPMath ||
6355 Options.AllowFPOpFusion == FPOpFusion::Fast ||
6356 Inst.getFlag(MachineInstr::FmContract);
6357 return true;
6359 return false;
6362 // Opcodes that can be combined with a MUL
6363 static bool isCombineInstrCandidate(unsigned Opc) {
6364 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
6368 // Utility routine that checks if \param MO is defined by an
6369 // \param CombineOpc instruction in the basic block \param MBB
6370 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
6371 unsigned CombineOpc, unsigned ZeroReg = 0,
6372 bool CheckZeroReg = false) {
6373 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6374 MachineInstr *MI = nullptr;
6376 if (MO.isReg() && MO.getReg().isVirtual())
6377 MI = MRI.getUniqueVRegDef(MO.getReg());
6378 // And it needs to be in the trace (otherwise, it won't have a depth).
6379 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
6380 return false;
6381 // Must only be used by the user we combine with.
6382 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
6383 return false;
6385 if (CheckZeroReg) {
6386 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
6387 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
6388 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
6389 // The third input reg must be zero.
6390 if (MI->getOperand(3).getReg() != ZeroReg)
6391 return false;
6394 if (isCombineInstrSettingFlag(CombineOpc) &&
6395 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
6396 return false;
6398 return true;
6402 // Is \param MO defined by an integer multiply and can be combined?
6403 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
6404 unsigned MulOpc, unsigned ZeroReg) {
6405 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
6409 // Is \param MO defined by a floating-point multiply and can be combined?
6410 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
6411 unsigned MulOpc) {
6412 return canCombine(MBB, MO, MulOpc);
6415 // TODO: There are many more machine instruction opcodes to match:
6416 // 1. Other data types (integer, vectors)
6417 // 2. Other math / logic operations (xor, or)
6418 // 3. Other forms of the same operation (intrinsics and other variants)
6419 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
6420 bool Invert) const {
6421 if (Invert)
6422 return false;
6423 switch (Inst.getOpcode()) {
6424 // == Floating-point types ==
6425 // -- Floating-point instructions --
6426 case AArch64::FADDHrr:
6427 case AArch64::FADDSrr:
6428 case AArch64::FADDDrr:
6429 case AArch64::FMULHrr:
6430 case AArch64::FMULSrr:
6431 case AArch64::FMULDrr:
6432 case AArch64::FMULX16:
6433 case AArch64::FMULX32:
6434 case AArch64::FMULX64:
6435 // -- Advanced SIMD instructions --
6436 case AArch64::FADDv4f16:
6437 case AArch64::FADDv8f16:
6438 case AArch64::FADDv2f32:
6439 case AArch64::FADDv4f32:
6440 case AArch64::FADDv2f64:
6441 case AArch64::FMULv4f16:
6442 case AArch64::FMULv8f16:
6443 case AArch64::FMULv2f32:
6444 case AArch64::FMULv4f32:
6445 case AArch64::FMULv2f64:
6446 case AArch64::FMULXv4f16:
6447 case AArch64::FMULXv8f16:
6448 case AArch64::FMULXv2f32:
6449 case AArch64::FMULXv4f32:
6450 case AArch64::FMULXv2f64:
6451 // -- SVE instructions --
6452 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
6453 // in the SVE instruction set (though there are predicated ones).
6454 case AArch64::FADD_ZZZ_H:
6455 case AArch64::FADD_ZZZ_S:
6456 case AArch64::FADD_ZZZ_D:
6457 case AArch64::FMUL_ZZZ_H:
6458 case AArch64::FMUL_ZZZ_S:
6459 case AArch64::FMUL_ZZZ_D:
6460 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
6461 (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
6462 Inst.getFlag(MachineInstr::MIFlag::FmNsz));
6464 // == Integer types ==
6465 // -- Base instructions --
6466 // Opcodes MULWrr and MULXrr don't exist because
6467 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
6468 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
6469 // The machine-combiner does not support three-source-operand machine
6470 // instructions, so we cannot reassociate MULs.
6471 case AArch64::ADDWrr:
6472 case AArch64::ADDXrr:
6473 case AArch64::ANDWrr:
6474 case AArch64::ANDXrr:
6475 case AArch64::ORRWrr:
6476 case AArch64::ORRXrr:
6477 case AArch64::EORWrr:
6478 case AArch64::EORXrr:
6479 case AArch64::EONWrr:
6480 case AArch64::EONXrr:
6481 // -- Advanced SIMD instructions --
6482 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
6483 // in the Advanced SIMD instruction set.
6484 case AArch64::ADDv8i8:
6485 case AArch64::ADDv16i8:
6486 case AArch64::ADDv4i16:
6487 case AArch64::ADDv8i16:
6488 case AArch64::ADDv2i32:
6489 case AArch64::ADDv4i32:
6490 case AArch64::ADDv1i64:
6491 case AArch64::ADDv2i64:
6492 case AArch64::MULv8i8:
6493 case AArch64::MULv16i8:
6494 case AArch64::MULv4i16:
6495 case AArch64::MULv8i16:
6496 case AArch64::MULv2i32:
6497 case AArch64::MULv4i32:
6498 case AArch64::ANDv8i8:
6499 case AArch64::ANDv16i8:
6500 case AArch64::ORRv8i8:
6501 case AArch64::ORRv16i8:
6502 case AArch64::EORv8i8:
6503 case AArch64::EORv16i8:
6504 // -- SVE instructions --
6505 case AArch64::ADD_ZZZ_B:
6506 case AArch64::ADD_ZZZ_H:
6507 case AArch64::ADD_ZZZ_S:
6508 case AArch64::ADD_ZZZ_D:
6509 case AArch64::MUL_ZZZ_B:
6510 case AArch64::MUL_ZZZ_H:
6511 case AArch64::MUL_ZZZ_S:
6512 case AArch64::MUL_ZZZ_D:
6513 case AArch64::AND_ZZZ:
6514 case AArch64::ORR_ZZZ:
6515 case AArch64::EOR_ZZZ:
6516 return true;
6518 default:
6519 return false;
6523 /// Find instructions that can be turned into madd.
6524 static bool getMaddPatterns(MachineInstr &Root,
6525 SmallVectorImpl<unsigned> &Patterns) {
6526 unsigned Opc = Root.getOpcode();
6527 MachineBasicBlock &MBB = *Root.getParent();
6528 bool Found = false;
6530 if (!isCombineInstrCandidate(Opc))
6531 return false;
6532 if (isCombineInstrSettingFlag(Opc)) {
6533 int Cmp_NZCV =
6534 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
6535 // When NZCV is live, bail out.
6536 if (Cmp_NZCV == -1)
6537 return false;
6538 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
6539 // When the opcode can't change, bail out.
6540 // CHECKME: do we miss any cases for opcode conversion?
6541 if (NewOpc == Opc)
6542 return false;
6543 Opc = NewOpc;
6546 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
6547 unsigned Pattern) {
6548 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
6549 Patterns.push_back(Pattern);
6550 Found = true;
6554 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
6555 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
6556 Patterns.push_back(Pattern);
6557 Found = true;
6561 typedef AArch64MachineCombinerPattern MCP;
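// For illustration: given
//   %3:gpr32 = MADDWrrr %1, %2, $wzr    ; i.e. a plain 32-bit MUL
//   %4:gpr32 = ADDWrr %3, %0
// matching operand 1 of the ADD records MCP::MULADDW_OP1, which the
// combiner later rewrites into a single MADDWrrr %1, %2, %0.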
6563 switch (Opc) {
6564 default:
6565 break;
6566 case AArch64::ADDWrr:
6567 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6568 "ADDWrr does not have register operands");
6569 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
6570 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
6571 break;
6572 case AArch64::ADDXrr:
6573 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
6574 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
6575 break;
6576 case AArch64::SUBWrr:
6577 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
6578 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
6579 break;
6580 case AArch64::SUBXrr:
6581 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
6582 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
6583 break;
6584 case AArch64::ADDWri:
6585 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
6586 break;
6587 case AArch64::ADDXri:
6588 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
6589 break;
6590 case AArch64::SUBWri:
6591 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
6592 break;
6593 case AArch64::SUBXri:
6594 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
6595 break;
6596 case AArch64::ADDv8i8:
6597 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
6598 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
6599 break;
6600 case AArch64::ADDv16i8:
6601 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
6602 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
6603 break;
6604 case AArch64::ADDv4i16:
6605 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
6606 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
6607 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
6608 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
6609 break;
6610 case AArch64::ADDv8i16:
6611 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
6612 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
6613 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
6614 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
6615 break;
6616 case AArch64::ADDv2i32:
6617 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
6618 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
6619 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
6620 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
6621 break;
6622 case AArch64::ADDv4i32:
6623 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
6624 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
6625 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
6626 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
6627 break;
6628 case AArch64::SUBv8i8:
6629 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
6630 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
6631 break;
6632 case AArch64::SUBv16i8:
6633 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
6634 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
6635 break;
6636 case AArch64::SUBv4i16:
6637 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
6638 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
6639 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
6640 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
6641 break;
6642 case AArch64::SUBv8i16:
6643 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
6644 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
6645 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
6646 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
6647 break;
6648 case AArch64::SUBv2i32:
6649 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
6650 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
6651 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
6652 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
6653 break;
6654 case AArch64::SUBv4i32:
6655 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
6656 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
6657 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
6658 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
6659 break;
6661 return Found;
6663 /// Floating-Point Support
6665 /// Find instructions that can be turned into an fmadd.
6666 static bool getFMAPatterns(MachineInstr &Root,
6667 SmallVectorImpl<unsigned> &Patterns) {
6669 if (!isCombineInstrCandidateFP(Root))
6670 return false;
6672 MachineBasicBlock &MBB = *Root.getParent();
6673 bool Found = false;
6675 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
6676 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
6677 Patterns.push_back(Pattern);
6678 return true;
6680 return false;
6683 typedef AArch64MachineCombinerPattern MCP;
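// For illustration: given
//   %2:fpr32 = FMULSrr %0, %1
//   %3:fpr32 = contract FADDSrr %2, %acc
// matching operand 1 of the FADD records MCP::FMULADDS_OP1, which is later
// rewritten into a single FMADDSrrr %0, %1, %acc.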
6685 switch (Root.getOpcode()) {
6686 default:
6687 assert(false && "Unsupported FP instruction in combiner\n");
6688 break;
6689 case AArch64::FADDHrr:
6690 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6691 "FADDHrr does not have register operands");
6693 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
6694 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
6695 break;
6696 case AArch64::FADDSrr:
6697 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6698 "FADDSrr does not have register operands");
6700 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
6701 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
6703 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
6704 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
6705 break;
6706 case AArch64::FADDDrr:
6707 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
6708 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
6710 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
6711 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
6712 break;
6713 case AArch64::FADDv4f16:
6714 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
6715 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
6717 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
6718 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
6719 break;
6720 case AArch64::FADDv8f16:
6721 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
6722 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
6724 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
6725 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
6726 break;
6727 case AArch64::FADDv2f32:
6728 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
6729 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
6731 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
6732 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
6733 break;
6734 case AArch64::FADDv2f64:
6735 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
6736 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
6738 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
6739 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
6740 break;
6741 case AArch64::FADDv4f32:
6742 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
6743 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
6745 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
6746 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
6747 break;
6748 case AArch64::FSUBHrr:
6749 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
6750 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
6751 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
6752 break;
6753 case AArch64::FSUBSrr:
6754 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
6756 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
6757 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
6759 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
6760 break;
6761 case AArch64::FSUBDrr:
6762 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
6764 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
6765 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
6767 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
6768 break;
6769 case AArch64::FSUBv4f16:
6770 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
6771 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
6773 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
6774 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
6775 break;
6776 case AArch64::FSUBv8f16:
6777 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
6778 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
6780 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
6781 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
6782 break;
6783 case AArch64::FSUBv2f32:
6784 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
6785 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
6787 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
6788 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
6789 break;
6790 case AArch64::FSUBv2f64:
6791 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
6792 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
6794 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
6795 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
6796 break;
6797 case AArch64::FSUBv4f32:
6798 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
6799 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
6801 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
6802 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
6803 break;
6804 }
6805 return Found;
6806 }
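/// Find FMUL instructions whose operand is fed (possibly through a no-op COPY)
/// by a DUP-by-lane, so the pair can later be rewritten as an indexed FMUL by
/// genIndexedMultiply().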
6808 static bool getFMULPatterns(MachineInstr &Root,
6809 SmallVectorImpl<unsigned> &Patterns) {
6810 MachineBasicBlock &MBB = *Root.getParent();
6811 bool Found = false;
6813 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
6814 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6815 MachineOperand &MO = Root.getOperand(Operand);
6816 MachineInstr *MI = nullptr;
6817 if (MO.isReg() && MO.getReg().isVirtual())
6818 MI = MRI.getUniqueVRegDef(MO.getReg());
6819 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
6820 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
6821 MI->getOperand(1).getReg().isVirtual())
6822 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
6823 if (MI && MI->getOpcode() == Opcode) {
6824 Patterns.push_back(Pattern);
6825 return true;
6827 return false;
6830 typedef AArch64MachineCombinerPattern MCP;
6832 switch (Root.getOpcode()) {
6833 default:
6834 return false;
6835 case AArch64::FMULv2f32:
6836 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
6837 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
6838 break;
6839 case AArch64::FMULv2f64:
6840 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
6841 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
6842 break;
6843 case AArch64::FMULv4f16:
6844 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
6845 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
6846 break;
6847 case AArch64::FMULv4f32:
6848 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
6849 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
6850 break;
6851 case AArch64::FMULv8f16:
6852 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
6853 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
6854 break;
6857 return Found;
6858 }
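/// Match an FNEG whose operand is defined by a scalar FMADD with no other
/// non-debug uses, where both instructions carry the contract and nsz
/// fast-math flags; such a pair can be rewritten as a single FNMADD
/// (see genFNegatedMAD).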
6860 static bool getFNEGPatterns(MachineInstr &Root,
6861 SmallVectorImpl<unsigned> &Patterns) {
6862 unsigned Opc = Root.getOpcode();
6863 MachineBasicBlock &MBB = *Root.getParent();
6864 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6866 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
6867 MachineOperand &MO = Root.getOperand(1);
6868 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
6869 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
6870 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
6871 Root.getFlag(MachineInstr::MIFlag::FmContract) &&
6872 Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
6873 MI->getFlag(MachineInstr::MIFlag::FmContract) &&
6874 MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
6875 Patterns.push_back(Pattern);
6876 return true;
6878 return false;
6881 switch (Opc) {
6882 default:
6883 break;
6884 case AArch64::FNEGDr:
6885 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
6886 case AArch64::FNEGSr:
6887 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
6890 return false;
6893 /// Return true when a code sequence can improve throughput. It
6894 /// should be called only for instructions in loops.
6895 /// \param Pattern - combiner pattern
6896 bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
6897 switch (Pattern) {
6898 default:
6899 break;
6900 case AArch64MachineCombinerPattern::FMULADDH_OP1:
6901 case AArch64MachineCombinerPattern::FMULADDH_OP2:
6902 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
6903 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
6904 case AArch64MachineCombinerPattern::FMULADDS_OP1:
6905 case AArch64MachineCombinerPattern::FMULADDS_OP2:
6906 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
6907 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
6908 case AArch64MachineCombinerPattern::FMULADDD_OP1:
6909 case AArch64MachineCombinerPattern::FMULADDD_OP2:
6910 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
6911 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
6912 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
6913 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
6914 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
6915 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
6916 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
6917 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
6918 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
6919 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
6920 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
6921 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
6922 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
6923 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
6924 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
6925 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
6926 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
6927 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
6928 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
6929 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
6930 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
6931 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
6932 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
6933 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
6934 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
6935 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
6936 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
6937 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
6938 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
6939 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
6940 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
6941 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
6942 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
6943 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
6944 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
6945 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
6946 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
6947 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
6948 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
6949 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
6950 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
6951 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
6952 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
6953 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
6954 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
6955 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
6956 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
6957 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
6958 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
6959 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
6960 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
6961 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
6962 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
6963 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
6964 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
6965 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
6966 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
6967 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
6968 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
6969 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
6970 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
6971 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
6972 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
6973 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
6974 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
6975 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
6976 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
6977 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
6978 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
6979 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
6980 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
6981 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
6982 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
6983 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
6984 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
6985 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
6986 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
6987 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
6988 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
6989 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
6990 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
6991 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
6992 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
6993 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
6994 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
6995 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
6996 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
6997 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
6998 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
6999 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7000 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7001 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7002 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7003 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7004 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7005 return true;
7006 } // end switch (Pattern)
7007 return false;
7010 /// Find other MI combine patterns.
7011 static bool getMiscPatterns(MachineInstr &Root,
7012 SmallVectorImpl<unsigned> &Patterns) {
7013 // A - (B + C) ==> (A - B) - C or (A - C) - B
7014 unsigned Opc = Root.getOpcode();
7015 MachineBasicBlock &MBB = *Root.getParent();
7017 switch (Opc) {
7018 case AArch64::SUBWrr:
7019 case AArch64::SUBSWrr:
7020 case AArch64::SUBXrr:
7021 case AArch64::SUBSXrr:
7022 // Found candidate root.
7023 break;
7024 default:
7025 return false;
7028 if (isCombineInstrSettingFlag(Opc) &&
7029 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
7031 return false;
7033 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
7034 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
7035 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
7036 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
7037 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);
7038 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);
7039 return true;
7042 return false;
7043 }
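/// The SUBADD rewrites do not change the instruction count, they only
/// re-associate the subtraction, so they are treated as profitable only when
/// they are expected to reduce the critical-path depth.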
7045 CombinerObjective
7046 AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
7047 switch (Pattern) {
7048 case AArch64MachineCombinerPattern::SUBADD_OP1:
7049 case AArch64MachineCombinerPattern::SUBADD_OP2:
7050 return CombinerObjective::MustReduceDepth;
7051 default:
7052 return TargetInstrInfo::getCombinerObjective(Pattern);
7056 /// Return true when there is potentially a faster code sequence for an
7057 /// instruction chain ending in \p Root. All potential patterns are listed in
7058 /// the \p Patterns vector. Patterns should be sorted in priority order since the
7059 /// pattern evaluator stops checking as soon as it finds a faster sequence.
7061 bool AArch64InstrInfo::getMachineCombinerPatterns(
7062 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
7063 bool DoRegPressureReduce) const {
7064 // Integer patterns
7065 if (getMaddPatterns(Root, Patterns))
7066 return true;
7067 // Floating point patterns
7068 if (getFMULPatterns(Root, Patterns))
7069 return true;
7070 if (getFMAPatterns(Root, Patterns))
7071 return true;
7072 if (getFNEGPatterns(Root, Patterns))
7073 return true;
7075 // Other patterns
7076 if (getMiscPatterns(Root, Patterns))
7077 return true;
7079 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
7080 DoRegPressureReduce);
7083 enum class FMAInstKind { Default, Indexed, Accumulator };
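// FMAInstKind selects the operand order used by genFusedMultiply below:
//   Default:     scalar (F)MADD form   - mul operands first, then the addend.
//   Indexed:     by-element FMLA/FMLS  - addend, mul operands, lane immediate.
//   Accumulator: vector MLA/MLS/FMLA   - addend first, then the mul operands.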
7084 /// genFusedMultiply - Generate fused multiply instructions.
7085 /// This function supports both integer and floating point instructions.
7086 /// A typical example:
7087 /// F|MUL I=A,B,0
7088 /// F|ADD R,I,C
7089 /// ==> F|MADD R,A,B,C
7090 /// \param MF Containing MachineFunction
7091 /// \param MRI Register information
7092 /// \param TII Target information
7093 /// \param Root is the F|ADD instruction
7094 /// \param [out] InsInstrs is a vector of machine instructions and will
7095 /// contain the generated madd instruction
7096 /// \param IdxMulOpd is the index of the operand in Root that is the result of
7097 /// the F|MUL. In the example above IdxMulOpd is 1.
7098 /// \param MaddOpc the opcode of the f|madd instruction
7099 /// \param RC Register class of operands
7100 /// \param kind the kind of F|MADD instruction (addressing mode) to be generated
7101 /// \param ReplacedAddend is the result register from the instruction
7102 /// replacing the non-combined operand, if any.
7103 static MachineInstr *
7104 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
7105 const TargetInstrInfo *TII, MachineInstr &Root,
7106 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
7107 unsigned MaddOpc, const TargetRegisterClass *RC,
7108 FMAInstKind kind = FMAInstKind::Default,
7109 const Register *ReplacedAddend = nullptr) {
7110 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
7112 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
7113 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
7114 Register ResultReg = Root.getOperand(0).getReg();
7115 Register SrcReg0 = MUL->getOperand(1).getReg();
7116 bool Src0IsKill = MUL->getOperand(1).isKill();
7117 Register SrcReg1 = MUL->getOperand(2).getReg();
7118 bool Src1IsKill = MUL->getOperand(2).isKill();
7120 Register SrcReg2;
7121 bool Src2IsKill;
7122 if (ReplacedAddend) {
7123 // If we just generated a new addend, this instruction must be its only use.
7124 SrcReg2 = *ReplacedAddend;
7125 Src2IsKill = true;
7126 } else {
7127 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
7128 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
7131 if (ResultReg.isVirtual())
7132 MRI.constrainRegClass(ResultReg, RC);
7133 if (SrcReg0.isVirtual())
7134 MRI.constrainRegClass(SrcReg0, RC);
7135 if (SrcReg1.isVirtual())
7136 MRI.constrainRegClass(SrcReg1, RC);
7137 if (SrcReg2.isVirtual())
7138 MRI.constrainRegClass(SrcReg2, RC);
7140 MachineInstrBuilder MIB;
7141 if (kind == FMAInstKind::Default)
7142 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
7143 .addReg(SrcReg0, getKillRegState(Src0IsKill))
7144 .addReg(SrcReg1, getKillRegState(Src1IsKill))
7145 .addReg(SrcReg2, getKillRegState(Src2IsKill));
7146 else if (kind == FMAInstKind::Indexed)
7147 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
7148 .addReg(SrcReg2, getKillRegState(Src2IsKill))
7149 .addReg(SrcReg0, getKillRegState(Src0IsKill))
7150 .addReg(SrcReg1, getKillRegState(Src1IsKill))
7151 .addImm(MUL->getOperand(3).getImm());
7152 else if (kind == FMAInstKind::Accumulator)
7153 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
7154 .addReg(SrcReg2, getKillRegState(Src2IsKill))
7155 .addReg(SrcReg0, getKillRegState(Src0IsKill))
7156 .addReg(SrcReg1, getKillRegState(Src1IsKill));
7157 else
7158 assert(false && "Invalid FMA instruction kind \n");
7159 // Insert the fused multiply (MADD, FMADD, FMSUB, FMLA, FMLS)
7160 InsInstrs.push_back(MIB);
7161 return MUL;
7162 }
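/// genFNegatedMAD - Fold an FNEG of a scalar FMADD into a single FNMADD
/// (FNMADDSrrr / FNMADDDrrr). This is only reached via the FNMADD pattern from
/// getFNEGPatterns(), which requires the contract and nsz fast-math flags on
/// both instructions.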
7164 static MachineInstr *
7165 genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
7166 const TargetInstrInfo *TII, MachineInstr &Root,
7167 SmallVectorImpl<MachineInstr *> &InsInstrs) {
7168 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
7170 unsigned Opc = 0;
7171 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
7172 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
7173 Opc = AArch64::FNMADDSrrr;
7174 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
7175 Opc = AArch64::FNMADDDrrr;
7176 else
7177 return nullptr;
7179 Register ResultReg = Root.getOperand(0).getReg();
7180 Register SrcReg0 = MAD->getOperand(1).getReg();
7181 Register SrcReg1 = MAD->getOperand(2).getReg();
7182 Register SrcReg2 = MAD->getOperand(3).getReg();
7183 bool Src0IsKill = MAD->getOperand(1).isKill();
7184 bool Src1IsKill = MAD->getOperand(2).isKill();
7185 bool Src2IsKill = MAD->getOperand(3).isKill();
7186 if (ResultReg.isVirtual())
7187 MRI.constrainRegClass(ResultReg, RC);
7188 if (SrcReg0.isVirtual())
7189 MRI.constrainRegClass(SrcReg0, RC);
7190 if (SrcReg1.isVirtual())
7191 MRI.constrainRegClass(SrcReg1, RC);
7192 if (SrcReg2.isVirtual())
7193 MRI.constrainRegClass(SrcReg2, RC);
7195 MachineInstrBuilder MIB =
7196 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
7197 .addReg(SrcReg0, getKillRegState(Src0IsKill))
7198 .addReg(SrcReg1, getKillRegState(Src1IsKill))
7199 .addReg(SrcReg2, getKillRegState(Src2IsKill));
7200 InsInstrs.push_back(MIB);
7202 return MAD;
7205 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
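/// For example (a sketch with illustrative virtual registers):
///   %2:fpr128 = DUPv4i32lane %1, 0
///   %3:fpr128 = FMULv4f32 %0, %2
/// becomes
///   %3:fpr128 = FMULv4i32_indexed %0, %1, 0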
7206 static MachineInstr *
7207 genIndexedMultiply(MachineInstr &Root,
7208 SmallVectorImpl<MachineInstr *> &InsInstrs,
7209 unsigned IdxDupOp, unsigned MulOpc,
7210 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
7211 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
7212 "Invalid index of FMUL operand");
7214 MachineFunction &MF = *Root.getMF();
7215 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
7217 MachineInstr *Dup =
7218 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
7220 if (Dup->getOpcode() == TargetOpcode::COPY)
7221 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
7223 Register DupSrcReg = Dup->getOperand(1).getReg();
7224 MRI.clearKillFlags(DupSrcReg);
7225 MRI.constrainRegClass(DupSrcReg, RC);
7227 unsigned DupSrcLane = Dup->getOperand(2).getImm();
7229 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
7230 MachineOperand &MulOp = Root.getOperand(IdxMulOp);
7232 Register ResultReg = Root.getOperand(0).getReg();
7234 MachineInstrBuilder MIB;
7235 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
7236 .add(MulOp)
7237 .addReg(DupSrcReg)
7238 .addImm(DupSrcLane);
7240 InsInstrs.push_back(MIB);
7241 return &Root;
7244 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
7245 /// instructions.
7247 /// \see genFusedMultiply
7248 static MachineInstr *genFusedMultiplyAcc(
7249 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
7250 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
7251 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
7252 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
7253 FMAInstKind::Accumulator);
7256 /// genNeg - Helper to generate an intermediate negation of the second operand
7257 /// of Root
7258 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
7259 const TargetInstrInfo *TII, MachineInstr &Root,
7260 SmallVectorImpl<MachineInstr *> &InsInstrs,
7261 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
7262 unsigned MnegOpc, const TargetRegisterClass *RC) {
7263 Register NewVR = MRI.createVirtualRegister(RC);
7264 MachineInstrBuilder MIB =
7265 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
7266 .add(Root.getOperand(2));
7267 InsInstrs.push_back(MIB);
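// Record that NewVR is defined by the first instruction added to InsInstrs
// (index 0) so the combiner can locate its defining instruction.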
7269 assert(InstrIdxForVirtReg.empty());
7270 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7272 return NewVR;
7275 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
7276 /// instructions with an additional negation of the accumulator
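/// For example, for MULSUBv8i8_OP1 (a sketch with illustrative registers):
///   %3:fpr64 = MULv8i8 %1, %2
///   %4:fpr64 = SUBv8i8 %3, %0
/// becomes
///   %5:fpr64 = NEGv8i8 %0
///   %4:fpr64 = MLAv8i8 %5, %1, %2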
7277 static MachineInstr *genFusedMultiplyAccNeg(
7278 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
7279 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
7280 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
7281 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
7282 assert(IdxMulOpd == 1);
7284 Register NewVR =
7285 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
7286 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
7287 FMAInstKind::Accumulator, &NewVR);
7290 /// genFusedMultiplyIdx - Helper to generate indexed (by-element) fused
7291 /// multiply-accumulate instructions.
7293 /// \see genFusedMultiply
7294 static MachineInstr *genFusedMultiplyIdx(
7295 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
7296 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
7297 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
7298 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
7299 FMAInstKind::Indexed);
7302 /// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply-accumulate
7303 /// instructions with an additional negation of the accumulator.
7304 static MachineInstr *genFusedMultiplyIdxNeg(
7305 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
7306 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
7307 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
7308 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
7309 assert(IdxMulOpd == 1);
7311 Register NewVR =
7312 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
7314 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
7315 FMAInstKind::Indexed, &NewVR);
7318 /// genMaddR - Generate madd instruction and combine mul and add using
7319 /// an extra virtual register
7320 /// Example - an ADD intermediate needs to be stored in a register:
7321 /// MUL I=A,B,0
7322 /// ADD R,I,Imm
7323 /// ==> ORR V, ZR, Imm
7324 /// ==> MADD R,A,B,V
7325 /// \param MF Containing MachineFunction
7326 /// \param MRI Register information
7327 /// \param TII Target information
7328 /// \param Root is the ADD instruction
7329 /// \param [out] InsInstrs is a vector of machine instructions and will
7330 /// contain the generated madd instruction
7331 /// \param IdxMulOpd is the index of the operand in Root that is the result of
7332 /// the MUL. In the example above IdxMulOpd is 1.
7333 /// \param MaddOpc the opcode of the madd instruction
7334 /// \param VR is a virtual register that holds the value of an ADD operand
7335 /// (V in the example above).
7336 /// \param RC Register class of operands
7337 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
7338 const TargetInstrInfo *TII, MachineInstr &Root,
7339 SmallVectorImpl<MachineInstr *> &InsInstrs,
7340 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
7341 const TargetRegisterClass *RC) {
7342 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
7344 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
7345 Register ResultReg = Root.getOperand(0).getReg();
7346 Register SrcReg0 = MUL->getOperand(1).getReg();
7347 bool Src0IsKill = MUL->getOperand(1).isKill();
7348 Register SrcReg1 = MUL->getOperand(2).getReg();
7349 bool Src1IsKill = MUL->getOperand(2).isKill();
7351 if (ResultReg.isVirtual())
7352 MRI.constrainRegClass(ResultReg, RC);
7353 if (SrcReg0.isVirtual())
7354 MRI.constrainRegClass(SrcReg0, RC);
7355 if (SrcReg1.isVirtual())
7356 MRI.constrainRegClass(SrcReg1, RC);
7357 if (Register::isVirtualRegister(VR))
7358 MRI.constrainRegClass(VR, RC);
7360 MachineInstrBuilder MIB =
7361 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
7362 .addReg(SrcReg0, getKillRegState(Src0IsKill))
7363 .addReg(SrcReg1, getKillRegState(Src1IsKill))
7364 .addReg(VR);
7365 // Insert the MADD
7366 InsInstrs.push_back(MIB);
7367 return MUL;
7370 /// Do the following transformation
7371 /// A - (B + C) ==> (A - B) - C
7372 /// A - (B + C) ==> (A - C) - B
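/// For example, with IdxOpd1 == 1 (illustrative virtual registers):
///   %3:gpr64 = ADDXrr %1, %2
///   %4:gpr64 = SUBXrr %0, %3
/// becomes
///   %5:gpr64 = SUBXrr %0, %1
///   %4:gpr64 = SUBXrr %5, %2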
7373 static void
7374 genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
7375 const TargetInstrInfo *TII, MachineInstr &Root,
7376 SmallVectorImpl<MachineInstr *> &InsInstrs,
7377 SmallVectorImpl<MachineInstr *> &DelInstrs,
7378 unsigned IdxOpd1,
7379 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
7380 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
7381 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
7382 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
7384 Register ResultReg = Root.getOperand(0).getReg();
7385 Register RegA = Root.getOperand(1).getReg();
7386 bool RegAIsKill = Root.getOperand(1).isKill();
7387 Register RegB = AddMI->getOperand(IdxOpd1).getReg();
7388 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
7389 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
7390 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
7391 Register NewVR =
7392 MRI.createVirtualRegister(MRI.getRegClass(Root.getOperand(2).getReg()));
7394 unsigned Opcode = Root.getOpcode();
7395 if (Opcode == AArch64::SUBSWrr)
7396 Opcode = AArch64::SUBWrr;
7397 else if (Opcode == AArch64::SUBSXrr)
7398 Opcode = AArch64::SUBXrr;
7399 else
7400 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
7401 "Unexpected instruction opcode.");
7403 uint32_t Flags = Root.mergeFlagsWith(*AddMI);
7404 Flags &= ~MachineInstr::NoSWrap;
7405 Flags &= ~MachineInstr::NoUWrap;
7407 MachineInstrBuilder MIB1 =
7408 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
7409 .addReg(RegA, getKillRegState(RegAIsKill))
7410 .addReg(RegB, getKillRegState(RegBIsKill))
7411 .setMIFlags(Flags);
7412 MachineInstrBuilder MIB2 =
7413 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
7414 .addReg(NewVR, getKillRegState(true))
7415 .addReg(RegC, getKillRegState(RegCIsKill))
7416 .setMIFlags(Flags);
7418 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7419 InsInstrs.push_back(MIB1);
7420 InsInstrs.push_back(MIB2);
7421 DelInstrs.push_back(AddMI);
7422 DelInstrs.push_back(&Root);
7425 /// When getMachineCombinerPatterns() finds potential patterns,
7426 /// this function generates the instructions that could replace the
7427 /// original code sequence
7428 void AArch64InstrInfo::genAlternativeCodeSequence(
7429 MachineInstr &Root, unsigned Pattern,
7430 SmallVectorImpl<MachineInstr *> &InsInstrs,
7431 SmallVectorImpl<MachineInstr *> &DelInstrs,
7432 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
7433 MachineBasicBlock &MBB = *Root.getParent();
7434 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7435 MachineFunction &MF = *MBB.getParent();
7436 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
7438 MachineInstr *MUL = nullptr;
7439 const TargetRegisterClass *RC;
7440 unsigned Opc;
7441 switch (Pattern) {
7442 default:
7443 // Reassociate instructions.
7444 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
7445 DelInstrs, InstrIdxForVirtReg);
7446 return;
7447 case AArch64MachineCombinerPattern::SUBADD_OP1:
7448 // A - (B + C)
7449 // ==> (A - B) - C
7450 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
7451 InstrIdxForVirtReg);
7452 return;
7453 case AArch64MachineCombinerPattern::SUBADD_OP2:
7454 // A - (B + C)
7455 // ==> (A - C) - B
7456 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
7457 InstrIdxForVirtReg);
7458 return;
7459 case AArch64MachineCombinerPattern::MULADDW_OP1:
7460 case AArch64MachineCombinerPattern::MULADDX_OP1:
7461 // MUL I=A,B,0
7462 // ADD R,I,C
7463 // ==> MADD R,A,B,C
7464 // --- Create(MADD);
7465 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
7466 Opc = AArch64::MADDWrrr;
7467 RC = &AArch64::GPR32RegClass;
7468 } else {
7469 Opc = AArch64::MADDXrrr;
7470 RC = &AArch64::GPR64RegClass;
7472 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7473 break;
7474 case AArch64MachineCombinerPattern::MULADDW_OP2:
7475 case AArch64MachineCombinerPattern::MULADDX_OP2:
7476 // MUL I=A,B,0
7477 // ADD R,C,I
7478 // ==> MADD R,A,B,C
7479 // --- Create(MADD);
7480 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
7481 Opc = AArch64::MADDWrrr;
7482 RC = &AArch64::GPR32RegClass;
7483 } else {
7484 Opc = AArch64::MADDXrrr;
7485 RC = &AArch64::GPR64RegClass;
7487 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7488 break;
7489 case AArch64MachineCombinerPattern::MULADDWI_OP1:
7490 case AArch64MachineCombinerPattern::MULADDXI_OP1: {
7491 // MUL I=A,B,0
7492 // ADD R,I,Imm
7493 // ==> MOV V, Imm
7494 // ==> MADD R,A,B,V
7495 // --- Create(MADD);
7496 const TargetRegisterClass *OrrRC;
7497 unsigned BitSize, OrrOpc, ZeroReg;
7498 if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1) {
7499 OrrOpc = AArch64::ORRWri;
7500 OrrRC = &AArch64::GPR32spRegClass;
7501 BitSize = 32;
7502 ZeroReg = AArch64::WZR;
7503 Opc = AArch64::MADDWrrr;
7504 RC = &AArch64::GPR32RegClass;
7505 } else {
7506 OrrOpc = AArch64::ORRXri;
7507 OrrRC = &AArch64::GPR64spRegClass;
7508 BitSize = 64;
7509 ZeroReg = AArch64::XZR;
7510 Opc = AArch64::MADDXrrr;
7511 RC = &AArch64::GPR64RegClass;
7513 Register NewVR = MRI.createVirtualRegister(OrrRC);
7514 uint64_t Imm = Root.getOperand(2).getImm();
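// An ADD-immediate may carry an optional left shift on the immediate (e.g.
// LSL #12); fold the shift in before trying to materialize the value.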
7516 if (Root.getOperand(3).isImm()) {
7517 unsigned Val = Root.getOperand(3).getImm();
7518 Imm = Imm << Val;
7520 uint64_t UImm = SignExtend64(Imm, BitSize);
7521 // Only handle immediates that can be composed via a single instruction.
7522 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7523 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7524 if (Insn.size() != 1)
7525 return;
7526 auto MovI = Insn.begin();
7527 MachineInstrBuilder MIB1;
7528 // MOV is an alias for one of three instructions: movz, movn, and orr.
7529 if (MovI->Opcode == OrrOpc)
7530 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7531 .addReg(ZeroReg)
7532 .addImm(MovI->Op2);
7533 else {
7534 if (BitSize == 32)
7535 assert((MovI->Opcode == AArch64::MOVNWi ||
7536 MovI->Opcode == AArch64::MOVZWi) &&
7537 "Expected opcode");
7538 else
7539 assert((MovI->Opcode == AArch64::MOVNXi ||
7540 MovI->Opcode == AArch64::MOVZXi) &&
7541 "Expected opcode");
7542 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7543 .addImm(MovI->Op1)
7544 .addImm(MovI->Op2);
7546 InsInstrs.push_back(MIB1);
7547 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7548 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7549 break;
7551 case AArch64MachineCombinerPattern::MULSUBW_OP1:
7552 case AArch64MachineCombinerPattern::MULSUBX_OP1: {
7553 // MUL I=A,B,0
7554 // SUB R,I, C
7555 // ==> SUB V, 0, C
7556 // ==> MADD R,A,B,V // = -C + A*B
7557 // --- Create(MADD);
7558 const TargetRegisterClass *SubRC;
7559 unsigned SubOpc, ZeroReg;
7560 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
7561 SubOpc = AArch64::SUBWrr;
7562 SubRC = &AArch64::GPR32spRegClass;
7563 ZeroReg = AArch64::WZR;
7564 Opc = AArch64::MADDWrrr;
7565 RC = &AArch64::GPR32RegClass;
7566 } else {
7567 SubOpc = AArch64::SUBXrr;
7568 SubRC = &AArch64::GPR64spRegClass;
7569 ZeroReg = AArch64::XZR;
7570 Opc = AArch64::MADDXrrr;
7571 RC = &AArch64::GPR64RegClass;
7573 Register NewVR = MRI.createVirtualRegister(SubRC);
7574 // SUB NewVR, 0, C
7575 MachineInstrBuilder MIB1 =
7576 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
7577 .addReg(ZeroReg)
7578 .add(Root.getOperand(2));
7579 InsInstrs.push_back(MIB1);
7580 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7581 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7582 break;
7584 case AArch64MachineCombinerPattern::MULSUBW_OP2:
7585 case AArch64MachineCombinerPattern::MULSUBX_OP2:
7586 // MUL I=A,B,0
7587 // SUB R,C,I
7588 // ==> MSUB R,A,B,C (computes C - A*B)
7589 // --- Create(MSUB);
7590 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
7591 Opc = AArch64::MSUBWrrr;
7592 RC = &AArch64::GPR32RegClass;
7593 } else {
7594 Opc = AArch64::MSUBXrrr;
7595 RC = &AArch64::GPR64RegClass;
7597 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7598 break;
7599 case AArch64MachineCombinerPattern::MULSUBWI_OP1:
7600 case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
7601 // MUL I=A,B,0
7602 // SUB R,I, Imm
7603 // ==> MOV V, -Imm
7604 // ==> MADD R,A,B,V // = -Imm + A*B
7605 // --- Create(MADD);
7606 const TargetRegisterClass *OrrRC;
7607 unsigned BitSize, OrrOpc, ZeroReg;
7608 if (Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
7609 OrrOpc = AArch64::ORRWri;
7610 OrrRC = &AArch64::GPR32spRegClass;
7611 BitSize = 32;
7612 ZeroReg = AArch64::WZR;
7613 Opc = AArch64::MADDWrrr;
7614 RC = &AArch64::GPR32RegClass;
7615 } else {
7616 OrrOpc = AArch64::ORRXri;
7617 OrrRC = &AArch64::GPR64spRegClass;
7618 BitSize = 64;
7619 ZeroReg = AArch64::XZR;
7620 Opc = AArch64::MADDXrrr;
7621 RC = &AArch64::GPR64RegClass;
7623 Register NewVR = MRI.createVirtualRegister(OrrRC);
7624 uint64_t Imm = Root.getOperand(2).getImm();
7625 if (Root.getOperand(3).isImm()) {
7626 unsigned Val = Root.getOperand(3).getImm();
7627 Imm = Imm << Val;
7629 uint64_t UImm = SignExtend64(-Imm, BitSize);
7630 // Only handle immediates that can be composed via a single instruction.
7631 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7632 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7633 if (Insn.size() != 1)
7634 return;
7635 auto MovI = Insn.begin();
7636 MachineInstrBuilder MIB1;
7637 // MOV is an alias for one of three instructions: movz, movn, and orr.
7638 if (MovI->Opcode == OrrOpc)
7639 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7640 .addReg(ZeroReg)
7641 .addImm(MovI->Op2);
7642 else {
7643 if (BitSize == 32)
7644 assert((MovI->Opcode == AArch64::MOVNWi ||
7645 MovI->Opcode == AArch64::MOVZWi) &&
7646 "Expected opcode");
7647 else
7648 assert((MovI->Opcode == AArch64::MOVNXi ||
7649 MovI->Opcode == AArch64::MOVZXi) &&
7650 "Expected opcode");
7651 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7652 .addImm(MovI->Op1)
7653 .addImm(MovI->Op2);
7655 InsInstrs.push_back(MIB1);
7656 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7657 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7658 break;
7661 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
7662 Opc = AArch64::MLAv8i8;
7663 RC = &AArch64::FPR64RegClass;
7664 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7665 break;
7666 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
7667 Opc = AArch64::MLAv8i8;
7668 RC = &AArch64::FPR64RegClass;
7669 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7670 break;
7671 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
7672 Opc = AArch64::MLAv16i8;
7673 RC = &AArch64::FPR128RegClass;
7674 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7675 break;
7676 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
7677 Opc = AArch64::MLAv16i8;
7678 RC = &AArch64::FPR128RegClass;
7679 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7680 break;
7681 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
7682 Opc = AArch64::MLAv4i16;
7683 RC = &AArch64::FPR64RegClass;
7684 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7685 break;
7686 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
7687 Opc = AArch64::MLAv4i16;
7688 RC = &AArch64::FPR64RegClass;
7689 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7690 break;
7691 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
7692 Opc = AArch64::MLAv8i16;
7693 RC = &AArch64::FPR128RegClass;
7694 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7695 break;
7696 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
7697 Opc = AArch64::MLAv8i16;
7698 RC = &AArch64::FPR128RegClass;
7699 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7700 break;
7701 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
7702 Opc = AArch64::MLAv2i32;
7703 RC = &AArch64::FPR64RegClass;
7704 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7705 break;
7706 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
7707 Opc = AArch64::MLAv2i32;
7708 RC = &AArch64::FPR64RegClass;
7709 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7710 break;
7711 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
7712 Opc = AArch64::MLAv4i32;
7713 RC = &AArch64::FPR128RegClass;
7714 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7715 break;
7716 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
7717 Opc = AArch64::MLAv4i32;
7718 RC = &AArch64::FPR128RegClass;
7719 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7720 break;
7722 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
7723 Opc = AArch64::MLAv8i8;
7724 RC = &AArch64::FPR64RegClass;
7725 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7726 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
7727 RC);
7728 break;
7729 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
7730 Opc = AArch64::MLSv8i8;
7731 RC = &AArch64::FPR64RegClass;
7732 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7733 break;
7734 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
7735 Opc = AArch64::MLAv16i8;
7736 RC = &AArch64::FPR128RegClass;
7737 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7738 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
7739 RC);
7740 break;
7741 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
7742 Opc = AArch64::MLSv16i8;
7743 RC = &AArch64::FPR128RegClass;
7744 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7745 break;
7746 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
7747 Opc = AArch64::MLAv4i16;
7748 RC = &AArch64::FPR64RegClass;
7749 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7750 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7751 RC);
7752 break;
7753 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
7754 Opc = AArch64::MLSv4i16;
7755 RC = &AArch64::FPR64RegClass;
7756 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7757 break;
7758 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
7759 Opc = AArch64::MLAv8i16;
7760 RC = &AArch64::FPR128RegClass;
7761 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7762 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7763 RC);
7764 break;
7765 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
7766 Opc = AArch64::MLSv8i16;
7767 RC = &AArch64::FPR128RegClass;
7768 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7769 break;
7770 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
7771 Opc = AArch64::MLAv2i32;
7772 RC = &AArch64::FPR64RegClass;
7773 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7774 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7775 RC);
7776 break;
7777 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
7778 Opc = AArch64::MLSv2i32;
7779 RC = &AArch64::FPR64RegClass;
7780 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7781 break;
7782 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
7783 Opc = AArch64::MLAv4i32;
7784 RC = &AArch64::FPR128RegClass;
7785 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7786 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7787 RC);
7788 break;
7789 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
7790 Opc = AArch64::MLSv4i32;
7791 RC = &AArch64::FPR128RegClass;
7792 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7793 break;
7795 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
7796 Opc = AArch64::MLAv4i16_indexed;
7797 RC = &AArch64::FPR64RegClass;
7798 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7799 break;
7800 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
7801 Opc = AArch64::MLAv4i16_indexed;
7802 RC = &AArch64::FPR64RegClass;
7803 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7804 break;
7805 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
7806 Opc = AArch64::MLAv8i16_indexed;
7807 RC = &AArch64::FPR128RegClass;
7808 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7809 break;
7810 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
7811 Opc = AArch64::MLAv8i16_indexed;
7812 RC = &AArch64::FPR128RegClass;
7813 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7814 break;
7815 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
7816 Opc = AArch64::MLAv2i32_indexed;
7817 RC = &AArch64::FPR64RegClass;
7818 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7819 break;
7820 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
7821 Opc = AArch64::MLAv2i32_indexed;
7822 RC = &AArch64::FPR64RegClass;
7823 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7824 break;
7825 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
7826 Opc = AArch64::MLAv4i32_indexed;
7827 RC = &AArch64::FPR128RegClass;
7828 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7829 break;
7830 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
7831 Opc = AArch64::MLAv4i32_indexed;
7832 RC = &AArch64::FPR128RegClass;
7833 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7834 break;
7836 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
7837 Opc = AArch64::MLAv4i16_indexed;
7838 RC = &AArch64::FPR64RegClass;
7839 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7840 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7841 RC);
7842 break;
7843 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
7844 Opc = AArch64::MLSv4i16_indexed;
7845 RC = &AArch64::FPR64RegClass;
7846 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7847 break;
7848 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7849 Opc = AArch64::MLAv8i16_indexed;
7850 RC = &AArch64::FPR128RegClass;
7851 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7852 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7853 RC);
7854 break;
7855 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7856 Opc = AArch64::MLSv8i16_indexed;
7857 RC = &AArch64::FPR128RegClass;
7858 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7859 break;
7860 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7861 Opc = AArch64::MLAv2i32_indexed;
7862 RC = &AArch64::FPR64RegClass;
7863 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7864 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7865 RC);
7866 break;
7867 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7868 Opc = AArch64::MLSv2i32_indexed;
7869 RC = &AArch64::FPR64RegClass;
7870 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7871 break;
7872 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7873 Opc = AArch64::MLAv4i32_indexed;
7874 RC = &AArch64::FPR128RegClass;
7875 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7876 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7877 RC);
7878 break;
7879 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7880 Opc = AArch64::MLSv4i32_indexed;
7881 RC = &AArch64::FPR128RegClass;
7882 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7883 break;
7885 // Floating Point Support
7886 case AArch64MachineCombinerPattern::FMULADDH_OP1:
7887 Opc = AArch64::FMADDHrrr;
7888 RC = &AArch64::FPR16RegClass;
7889 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7890 break;
7891 case AArch64MachineCombinerPattern::FMULADDS_OP1:
7892 Opc = AArch64::FMADDSrrr;
7893 RC = &AArch64::FPR32RegClass;
7894 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7895 break;
7896 case AArch64MachineCombinerPattern::FMULADDD_OP1:
7897 Opc = AArch64::FMADDDrrr;
7898 RC = &AArch64::FPR64RegClass;
7899 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7900 break;
7902 case AArch64MachineCombinerPattern::FMULADDH_OP2:
7903 Opc = AArch64::FMADDHrrr;
7904 RC = &AArch64::FPR16RegClass;
7905 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7906 break;
7907 case AArch64MachineCombinerPattern::FMULADDS_OP2:
7908 Opc = AArch64::FMADDSrrr;
7909 RC = &AArch64::FPR32RegClass;
7910 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7911 break;
7912 case AArch64MachineCombinerPattern::FMULADDD_OP2:
7913 Opc = AArch64::FMADDDrrr;
7914 RC = &AArch64::FPR64RegClass;
7915 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7916 break;
7918 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
7919 Opc = AArch64::FMLAv1i32_indexed;
7920 RC = &AArch64::FPR32RegClass;
7921 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7922 FMAInstKind::Indexed);
7923 break;
7924 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
7925 Opc = AArch64::FMLAv1i32_indexed;
7926 RC = &AArch64::FPR32RegClass;
7927 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7928 FMAInstKind::Indexed);
7929 break;
7931 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
7932 Opc = AArch64::FMLAv1i64_indexed;
7933 RC = &AArch64::FPR64RegClass;
7934 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7935 FMAInstKind::Indexed);
7936 break;
7937 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
7938 Opc = AArch64::FMLAv1i64_indexed;
7939 RC = &AArch64::FPR64RegClass;
7940 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7941 FMAInstKind::Indexed);
7942 break;
7944 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
7945 RC = &AArch64::FPR64RegClass;
7946 Opc = AArch64::FMLAv4i16_indexed;
7947 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7948 FMAInstKind::Indexed);
7949 break;
7950 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
7951 RC = &AArch64::FPR64RegClass;
7952 Opc = AArch64::FMLAv4f16;
7953 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7954 FMAInstKind::Accumulator);
7955 break;
7956 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
7957 RC = &AArch64::FPR64RegClass;
7958 Opc = AArch64::FMLAv4i16_indexed;
7959 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7960 FMAInstKind::Indexed);
7961 break;
7962 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
7963 RC = &AArch64::FPR64RegClass;
7964 Opc = AArch64::FMLAv4f16;
7965 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7966 FMAInstKind::Accumulator);
7967 break;
7969 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
7970 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
7971 RC = &AArch64::FPR64RegClass;
7972 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
7973 Opc = AArch64::FMLAv2i32_indexed;
7974 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7975 FMAInstKind::Indexed);
7976 } else {
7977 Opc = AArch64::FMLAv2f32;
7978 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7979 FMAInstKind::Accumulator);
7981 break;
7982 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
7983 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
7984 RC = &AArch64::FPR64RegClass;
7985 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
7986 Opc = AArch64::FMLAv2i32_indexed;
7987 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7988 FMAInstKind::Indexed);
7989 } else {
7990 Opc = AArch64::FMLAv2f32;
7991 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7992 FMAInstKind::Accumulator);
7994 break;
7996 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
7997 RC = &AArch64::FPR128RegClass;
7998 Opc = AArch64::FMLAv8i16_indexed;
7999 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8000 FMAInstKind::Indexed);
8001 break;
8002 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
8003 RC = &AArch64::FPR128RegClass;
8004 Opc = AArch64::FMLAv8f16;
8005 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8006 FMAInstKind::Accumulator);
8007 break;
8008 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
8009 RC = &AArch64::FPR128RegClass;
8010 Opc = AArch64::FMLAv8i16_indexed;
8011 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8012 FMAInstKind::Indexed);
8013 break;
8014 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
8015 RC = &AArch64::FPR128RegClass;
8016 Opc = AArch64::FMLAv8f16;
8017 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8018 FMAInstKind::Accumulator);
8019 break;
8021 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
8022 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
8023 RC = &AArch64::FPR128RegClass;
8024 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
8025 Opc = AArch64::FMLAv2i64_indexed;
8026 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8027 FMAInstKind::Indexed);
8028 } else {
8029 Opc = AArch64::FMLAv2f64;
8030 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8031 FMAInstKind::Accumulator);
8033 break;
8034 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
8035 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
8036 RC = &AArch64::FPR128RegClass;
8037 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
8038 Opc = AArch64::FMLAv2i64_indexed;
8039 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8040 FMAInstKind::Indexed);
8041 } else {
8042 Opc = AArch64::FMLAv2f64;
8043 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8044 FMAInstKind::Accumulator);
8046 break;
8048 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
8049 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
8050 RC = &AArch64::FPR128RegClass;
8051 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
8052 Opc = AArch64::FMLAv4i32_indexed;
8053 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8054 FMAInstKind::Indexed);
8055 } else {
8056 Opc = AArch64::FMLAv4f32;
8057 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8058 FMAInstKind::Accumulator);
8060 break;
8062 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
8063 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
8064 RC = &AArch64::FPR128RegClass;
8065 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
8066 Opc = AArch64::FMLAv4i32_indexed;
8067 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8068 FMAInstKind::Indexed);
8069 } else {
8070 Opc = AArch64::FMLAv4f32;
8071 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8072 FMAInstKind::Accumulator);
8074 break;
8076 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
8077 Opc = AArch64::FNMSUBHrrr;
8078 RC = &AArch64::FPR16RegClass;
8079 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8080 break;
8081 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
8082 Opc = AArch64::FNMSUBSrrr;
8083 RC = &AArch64::FPR32RegClass;
8084 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8085 break;
8086 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
8087 Opc = AArch64::FNMSUBDrrr;
8088 RC = &AArch64::FPR64RegClass;
8089 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8090 break;
8092 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
8093 Opc = AArch64::FNMADDHrrr;
8094 RC = &AArch64::FPR16RegClass;
8095 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8096 break;
8097 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
8098 Opc = AArch64::FNMADDSrrr;
8099 RC = &AArch64::FPR32RegClass;
8100 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8101 break;
8102 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
8103 Opc = AArch64::FNMADDDrrr;
8104 RC = &AArch64::FPR64RegClass;
8105 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8106 break;
8108 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
8109 Opc = AArch64::FMSUBHrrr;
8110 RC = &AArch64::FPR16RegClass;
8111 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8112 break;
8113 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
8114 Opc = AArch64::FMSUBSrrr;
8115 RC = &AArch64::FPR32RegClass;
8116 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8117 break;
8118 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
8119 Opc = AArch64::FMSUBDrrr;
8120 RC = &AArch64::FPR64RegClass;
8121 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8122 break;
8124 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
8125 Opc = AArch64::FMLSv1i32_indexed;
8126 RC = &AArch64::FPR32RegClass;
8127 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8128 FMAInstKind::Indexed);
8129 break;
8131 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
8132 Opc = AArch64::FMLSv1i64_indexed;
8133 RC = &AArch64::FPR64RegClass;
8134 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8135 FMAInstKind::Indexed);
8136 break;
8138 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
8139 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
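// OP1: the FMUL feeds the first FSUB operand, i.e. a*b - c. Negate the
// addend c and emit an FMLA so the result becomes (-c) + a*b.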
8140 RC = &AArch64::FPR64RegClass;
8141 Register NewVR = MRI.createVirtualRegister(RC);
8142 MachineInstrBuilder MIB1 =
8143 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
8144 .add(Root.getOperand(2));
8145 InsInstrs.push_back(MIB1);
8146 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8147 if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
8148 Opc = AArch64::FMLAv4f16;
8149 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8150 FMAInstKind::Accumulator, &NewVR);
8151 } else {
8152 Opc = AArch64::FMLAv4i16_indexed;
8153 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8154 FMAInstKind::Indexed, &NewVR);
8156 break;
8158 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
8159 RC = &AArch64::FPR64RegClass;
8160 Opc = AArch64::FMLSv4f16;
8161 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8162 FMAInstKind::Accumulator);
8163 break;
8164 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
8165 RC = &AArch64::FPR64RegClass;
8166 Opc = AArch64::FMLSv4i16_indexed;
8167 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8168 FMAInstKind::Indexed);
8169 break;
8171 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
8172 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
8173 RC = &AArch64::FPR64RegClass;
8174 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
8175 Opc = AArch64::FMLSv2i32_indexed;
8176 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8177 FMAInstKind::Indexed);
8178 } else {
8179 Opc = AArch64::FMLSv2f32;
8180 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8181 FMAInstKind::Accumulator);
8183 break;
8185 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
8186 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
8187 RC = &AArch64::FPR128RegClass;
8188 Register NewVR = MRI.createVirtualRegister(RC);
8189 MachineInstrBuilder MIB1 =
8190 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
8191 .add(Root.getOperand(2));
8192 InsInstrs.push_back(MIB1);
8193 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8194 if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
8195 Opc = AArch64::FMLAv8f16;
8196 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8197 FMAInstKind::Accumulator, &NewVR);
8198 } else {
8199 Opc = AArch64::FMLAv8i16_indexed;
8200 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8201 FMAInstKind::Indexed, &NewVR);
8203 break;
8205 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
8206 RC = &AArch64::FPR128RegClass;
8207 Opc = AArch64::FMLSv8f16;
8208 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8209 FMAInstKind::Accumulator);
8210 break;
8211 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
8212 RC = &AArch64::FPR128RegClass;
8213 Opc = AArch64::FMLSv8i16_indexed;
8214 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8215 FMAInstKind::Indexed);
8216 break;
8218 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
8219 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
8220 RC = &AArch64::FPR128RegClass;
8221 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
8222 Opc = AArch64::FMLSv2i64_indexed;
8223 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8224 FMAInstKind::Indexed);
8225 } else {
8226 Opc = AArch64::FMLSv2f64;
8227 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8228 FMAInstKind::Accumulator);
8230 break;
8232 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
8233 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
8234 RC = &AArch64::FPR128RegClass;
8235 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
8236 Opc = AArch64::FMLSv4i32_indexed;
8237 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8238 FMAInstKind::Indexed);
8239 } else {
8240 Opc = AArch64::FMLSv4f32;
8241 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8242 FMAInstKind::Accumulator);
8244 break;
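// Editorial note on the *_OP1 cases below (a hedged sketch, not from the
// original source): these patterns match a subtraction whose first operand is
// the multiply, i.e. roughly
//   %mul = fmul %a, %b
//   %res = fsub %mul, %c
// Since a*b - c == a*b + (-c), the combiner first emits an FNEG of %c into
// NewVR and then an FMLA that accumulates onto NewVR.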
8245 case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
8246 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
8247 RC = &AArch64::FPR64RegClass;
8248 Register NewVR = MRI.createVirtualRegister(RC);
8249 MachineInstrBuilder MIB1 =
8250 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
8251 .add(Root.getOperand(2));
8252 InsInstrs.push_back(MIB1);
8253 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8254 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
8255 Opc = AArch64::FMLAv2i32_indexed;
8256 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8257 FMAInstKind::Indexed, &NewVR);
8258 } else {
8259 Opc = AArch64::FMLAv2f32;
8260 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8261 FMAInstKind::Accumulator, &NewVR);
8263 break;
8265 case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
8266 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
8267 RC = &AArch64::FPR128RegClass;
8268 Register NewVR = MRI.createVirtualRegister(RC);
8269 MachineInstrBuilder MIB1 =
8270 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
8271 .add(Root.getOperand(2));
8272 InsInstrs.push_back(MIB1);
8273 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8274 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
8275 Opc = AArch64::FMLAv4i32_indexed;
8276 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8277 FMAInstKind::Indexed, &NewVR);
8278 } else {
8279 Opc = AArch64::FMLAv4f32;
8280 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8281 FMAInstKind::Accumulator, &NewVR);
8283 break;
8285 case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
8286 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
8287 RC = &AArch64::FPR128RegClass;
8288 Register NewVR = MRI.createVirtualRegister(RC);
8289 MachineInstrBuilder MIB1 =
8290 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
8291 .add(Root.getOperand(2));
8292 InsInstrs.push_back(MIB1);
8293 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8294 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
8295 Opc = AArch64::FMLAv2i64_indexed;
8296 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8297 FMAInstKind::Indexed, &NewVR);
8298 } else {
8299 Opc = AArch64::FMLAv2f64;
8300 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8301 FMAInstKind::Accumulator, &NewVR);
8303 break;
8305 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
8306 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
8307 unsigned IdxDupOp =
8308 (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
8309 : 2;
8310 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
8311 &AArch64::FPR128RegClass, MRI);
8312 break;
8314 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
8315 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
8316 unsigned IdxDupOp =
8317 (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
8318 : 2;
8319 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
8320 &AArch64::FPR128RegClass, MRI);
8321 break;
8323 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
8324 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
8325 unsigned IdxDupOp =
8326 (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
8327 : 2;
8328 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
8329 &AArch64::FPR128_loRegClass, MRI);
8330 break;
8332 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
8333 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
8334 unsigned IdxDupOp =
8335 (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
8336 : 2;
8337 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
8338 &AArch64::FPR128RegClass, MRI);
8339 break;
8341 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
8342 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
8343 unsigned IdxDupOp =
8344 (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
8345 : 2;
8346 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
8347 &AArch64::FPR128_loRegClass, MRI);
8348 break;
8350 case AArch64MachineCombinerPattern::FNMADD: {
8351 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
8352 break;
8355 } // end switch (Pattern)
8356 // Record MUL and ADD/SUB for deletion
8357 if (MUL)
8358 DelInstrs.push_back(MUL);
8359 DelInstrs.push_back(&Root);
8361 // Set the flags on the inserted instructions to be the merged flags of the
8362 // instructions that we have combined.
8363 uint32_t Flags = Root.getFlags();
8364 if (MUL)
8365 Flags = Root.mergeFlagsWith(*MUL);
8366 for (auto *MI : InsInstrs)
8367 MI->setFlags(Flags);
8370 /// Replace a csinc-branch sequence with a simple conditional branch
8372 /// Examples:
8373 /// 1. \code
8374 /// csinc w9, wzr, wzr, <condition code>
8375 /// tbnz w9, #0, 0x44
8376 /// \endcode
8377 /// to
8378 /// \code
8379 /// b.<inverted condition code>
8380 /// \endcode
8382 /// 2. \code
8383 /// csinc w9, wzr, wzr, <condition code>
8384 /// tbz w9, #0, 0x44
8385 /// \endcode
8386 /// to
8387 /// \code
8388 /// b.<condition code>
8389 /// \endcode
8391 /// Replace a compare-and-branch sequence with a TBZ/TBNZ instruction when the
8392 /// compare's constant operand is a power of 2.
8394 /// Examples:
8395 /// \code
8396 /// and w8, w8, #0x400
8397 /// cbnz w8, L1
8398 /// \endcode
8399 /// to
8400 /// \code
8401 /// tbnz w8, #10, L1
8402 /// \endcode
8404 /// \param MI Conditional Branch
8405 /// \return True when the simple conditional branch is generated
8407 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
8408 bool IsNegativeBranch = false;
8409 bool IsTestAndBranch = false;
8410 unsigned TargetBBInMI = 0;
8411 switch (MI.getOpcode()) {
8412 default:
8413 llvm_unreachable("Unknown branch instruction?");
8414 case AArch64::Bcc:
8415 return false;
8416 case AArch64::CBZW:
8417 case AArch64::CBZX:
8418 TargetBBInMI = 1;
8419 break;
8420 case AArch64::CBNZW:
8421 case AArch64::CBNZX:
8422 TargetBBInMI = 1;
8423 IsNegativeBranch = true;
8424 break;
8425 case AArch64::TBZW:
8426 case AArch64::TBZX:
8427 TargetBBInMI = 2;
8428 IsTestAndBranch = true;
8429 break;
8430 case AArch64::TBNZW:
8431 case AArch64::TBNZX:
8432 TargetBBInMI = 2;
8433 IsNegativeBranch = true;
8434 IsTestAndBranch = true;
8435 break;
8437 // So we increment a zero register and test for bits other
8438 // than bit 0? Conservatively bail out in case the verifier
8439 // missed this case.
8440 if (IsTestAndBranch && MI.getOperand(1).getImm())
8441 return false;
8443 // Find Definition.
8444 assert(MI.getParent() && "Incomplete machine instruction\n");
8445 MachineBasicBlock *MBB = MI.getParent();
8446 MachineFunction *MF = MBB->getParent();
8447 MachineRegisterInfo *MRI = &MF->getRegInfo();
8448 Register VReg = MI.getOperand(0).getReg();
8449 if (!VReg.isVirtual())
8450 return false;
8452 MachineInstr *DefMI = MRI->getVRegDef(VReg);
8454 // Look through COPY instructions to find definition.
8455 while (DefMI->isCopy()) {
8456 Register CopyVReg = DefMI->getOperand(1).getReg();
8457 if (!MRI->hasOneNonDBGUse(CopyVReg))
8458 return false;
8459 if (!MRI->hasOneDef(CopyVReg))
8460 return false;
8461 DefMI = MRI->getVRegDef(CopyVReg);
8464 switch (DefMI->getOpcode()) {
8465 default:
8466 return false;
8467 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
8468 case AArch64::ANDWri:
8469 case AArch64::ANDXri: {
8470 if (IsTestAndBranch)
8471 return false;
8472 if (DefMI->getParent() != MBB)
8473 return false;
8474 if (!MRI->hasOneNonDBGUse(VReg))
8475 return false;
8477 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
8478 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
8479 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
8480 if (!isPowerOf2_64(Mask))
8481 return false;
8483 MachineOperand &MO = DefMI->getOperand(1);
8484 Register NewReg = MO.getReg();
8485 if (!NewReg.isVirtual())
8486 return false;
8488 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
8490 MachineBasicBlock &RefToMBB = *MBB;
8491 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
8492 DebugLoc DL = MI.getDebugLoc();
8493 unsigned Imm = Log2_64(Mask);
8494 unsigned Opc = (Imm < 32)
8495 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
8496 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
8497 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
8498 .addReg(NewReg)
8499 .addImm(Imm)
8500 .addMBB(TBB);
8501 // Register lives on to the TBZ/TBNZ now.
8502 MO.setIsKill(false);
8504 // For bit indices smaller than 32, we need to use the 32-bit
8505 // variant (W) in all cases, since the 64-bit variant cannot
8506 // encode them.
8507 // Therefore, if the input register is 64-bit, we need to take
8508 // its 32-bit sub-register.
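// Illustration with assumed register names (an editorial sketch, not from
// the original source):
//   and  x8, x9, #0x400      ; Mask = 0x400, Imm = Log2_64(Mask) = 10
//   cbnz x8, L1
// is folded to
//   tbnz w9, #10, L1         ; Imm < 32, so the W variant tests the sub_32
//                            ; sub-register of x9
// whereas a mask with only bit 40 set would instead produce tbnz x9, #40, L1.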
8509 if (!Is32Bit && Imm < 32)
8510 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
8511 MI.eraseFromParent();
8512 return true;
8514 // Look for CSINC
8515 case AArch64::CSINCWr:
8516 case AArch64::CSINCXr: {
8517 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
8518 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
8519 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
8520 DefMI->getOperand(2).getReg() == AArch64::XZR))
8521 return false;
8523 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
8524 true) != -1)
8525 return false;
8527 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
8528 // Convert only when the condition code is not modified between
8529 // the CSINC and the branch. The CC may be used by other
8530 // instructions in between.
8531 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
8532 return false;
8533 MachineBasicBlock &RefToMBB = *MBB;
8534 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
8535 DebugLoc DL = MI.getDebugLoc();
8536 if (IsNegativeBranch)
8537 CC = AArch64CC::getInvertedCondCode(CC);
8538 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
8539 MI.eraseFromParent();
8540 return true;
8545 std::pair<unsigned, unsigned>
8546 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8547 const unsigned Mask = AArch64II::MO_FRAGMENT;
8548 return std::make_pair(TF & Mask, TF & ~Mask);
8551 ArrayRef<std::pair<unsigned, const char *>>
8552 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8553 using namespace AArch64II;
8555 static const std::pair<unsigned, const char *> TargetFlags[] = {
8556 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
8557 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
8558 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
8559 {MO_HI12, "aarch64-hi12"}};
8560 return ArrayRef(TargetFlags);
8563 ArrayRef<std::pair<unsigned, const char *>>
8564 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
8565 using namespace AArch64II;
8567 static const std::pair<unsigned, const char *> TargetFlags[] = {
8568 {MO_COFFSTUB, "aarch64-coffstub"},
8569 {MO_GOT, "aarch64-got"},
8570 {MO_NC, "aarch64-nc"},
8571 {MO_S, "aarch64-s"},
8572 {MO_TLS, "aarch64-tls"},
8573 {MO_DLLIMPORT, "aarch64-dllimport"},
8574 {MO_PREL, "aarch64-prel"},
8575 {MO_TAGGED, "aarch64-tagged"},
8576 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
8578 return ArrayRef(TargetFlags);
8581 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8582 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8583 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8584 {{MOSuppressPair, "aarch64-suppress-pair"},
8585 {MOStridedAccess, "aarch64-strided-access"}};
8586 return ArrayRef(TargetFlags);
8589 /// Constants defining how certain sequences should be outlined.
8590 /// This encompasses how an outlined function should be called, and what kind of
8591 /// frame should be emitted for that outlined function.
8593 /// \p MachineOutlinerDefault implies that the function should be called with
8594 /// a save and restore of LR to the stack.
8596 /// That is,
8598 /// I1 Save LR OUTLINED_FUNCTION:
8599 /// I2 --> BL OUTLINED_FUNCTION I1
8600 /// I3 Restore LR I2
8601 /// I3
8602 /// RET
8604 /// * Call construction overhead: 3 (save + BL + restore)
8605 /// * Frame construction overhead: 1 (ret)
8606 /// * Requires stack fixups? Yes
8608 /// \p MachineOutlinerTailCall implies that the function is being created from
8609 /// a sequence of instructions ending in a return.
8611 /// That is,
8613 /// I1 OUTLINED_FUNCTION:
8614 /// I2 --> B OUTLINED_FUNCTION I1
8615 /// RET I2
8616 /// RET
8618 /// * Call construction overhead: 1 (B)
8619 /// * Frame construction overhead: 0 (Return included in sequence)
8620 /// * Requires stack fixups? No
8622 /// \p MachineOutlinerNoLRSave implies that the function should be called using
8623 /// a BL instruction, but doesn't require LR to be saved and restored. This
8624 /// happens when LR is known to be dead.
8626 /// That is,
8628 /// I1 OUTLINED_FUNCTION:
8629 /// I2 --> BL OUTLINED_FUNCTION I1
8630 /// I3 I2
8631 /// I3
8632 /// RET
8634 /// * Call construction overhead: 1 (BL)
8635 /// * Frame construction overhead: 1 (RET)
8636 /// * Requires stack fixups? No
8638 /// \p MachineOutlinerThunk implies that the function is being created from
8639 /// a sequence of instructions ending in a call. The outlined function is
8640 /// called with a BL instruction, and the outlined function tail-calls the
8641 /// original call destination.
8643 /// That is,
8645 /// I1 OUTLINED_FUNCTION:
8646 /// I2 --> BL OUTLINED_FUNCTION I1
8647 /// BL f I2
8648 /// B f
8649 /// * Call construction overhead: 1 (BL)
8650 /// * Frame construction overhead: 0
8651 /// * Requires stack fixups? No
8653 /// \p MachineOutlinerRegSave implies that the function should be called with a
8654 /// save and restore of LR to an available register. This allows us to avoid
8655 /// stack fixups. Note that this outlining variant is compatible with the
8656 /// NoLRSave case.
8658 /// That is,
8660 /// I1 Save LR OUTLINED_FUNCTION:
8661 /// I2 --> BL OUTLINED_FUNCTION I1
8662 /// I3 Restore LR I2
8663 /// I3
8664 /// RET
8666 /// * Call construction overhead: 3 (save + BL + restore)
8667 /// * Frame construction overhead: 1 (ret)
8668 /// * Requires stack fixups? No
8669 enum MachineOutlinerClass {
8670 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
8671 MachineOutlinerTailCall, /// Only emit a branch.
8672 MachineOutlinerNoLRSave, /// Emit a call and return.
8673 MachineOutlinerThunk, /// Emit a call and tail-call.
8674 MachineOutlinerRegSave /// Same as default, but save to a register.
8677 enum MachineOutlinerMBBFlags {
8678 LRUnavailableSomewhere = 0x2,
8679 HasCalls = 0x4,
8680 UnsafeRegsDead = 0x8
8683 Register
8684 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
8685 MachineFunction *MF = C.getMF();
8686 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
8687 const AArch64RegisterInfo *ARI =
8688 static_cast<const AArch64RegisterInfo *>(&TRI);
8689 // Check if there is an available register across the sequence that we can
8690 // use.
8691 for (unsigned Reg : AArch64::GPR64RegClass) {
8692 if (!ARI->isReservedReg(*MF, Reg) &&
8693 Reg != AArch64::LR && // LR is not reserved, but don't use it.
8694 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
8695 Reg != AArch64::X17 && // Ditto for X17.
8696 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
8697 C.isAvailableInsideSeq(Reg, TRI))
8698 return Reg;
8700 return Register();
8703 static bool
8704 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
8705 const outliner::Candidate &b) {
8706 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8707 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8709 return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
8710 MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
8713 static bool
8714 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
8715 const outliner::Candidate &b) {
8716 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8717 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8719 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
8722 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
8723 const outliner::Candidate &b) {
8724 const AArch64Subtarget &SubtargetA =
8725 a.getMF()->getSubtarget<AArch64Subtarget>();
8726 const AArch64Subtarget &SubtargetB =
8727 b.getMF()->getSubtarget<AArch64Subtarget>();
8728 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
8731 std::optional<std::unique_ptr<outliner::OutlinedFunction>>
8732 AArch64InstrInfo::getOutliningCandidateInfo(
8733 const MachineModuleInfo &MMI,
8734 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
8735 unsigned MinRepeats) const {
8736 unsigned SequenceSize = 0;
8737 for (auto &MI : RepeatedSequenceLocs[0])
8738 SequenceSize += getInstSizeInBytes(MI);
8740 unsigned NumBytesToCreateFrame = 0;
8742 // We only allow outlining for functions having exactly matching return
8743 // address signing attributes, i.e., all share the same value for the
8744 // attribute "sign-return-address" and all share the same type of key they
8745 // are signed with.
8746 // Additionally we require all functions to simultaneously either support
8747 // v8.3a features or not. Otherwise an outlined function could get signed
8748 // using dedicated v8.3 instructions and a call from a function that doesn't
8749 // support v8.3 instructions would therefore be invalid.
8750 if (std::adjacent_find(
8751 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
8752 [](const outliner::Candidate &a, const outliner::Candidate &b) {
8753 // Return true if a and b are non-equal w.r.t. return address
8754 // signing or support of v8.3a features
8755 if (outliningCandidatesSigningScopeConsensus(a, b) &&
8756 outliningCandidatesSigningKeyConsensus(a, b) &&
8757 outliningCandidatesV8_3OpsConsensus(a, b)) {
8758 return false;
8760 return true;
8761 }) != RepeatedSequenceLocs.end()) {
8762 return std::nullopt;
8765 // Since at this point all candidates agree on their return address signing
8766 // picking just one is fine. If the candidate functions potentially sign their
8767 // return addresses, the outlined function should do the same. Note that in
8768 // the case of "sign-return-address"="non-leaf" this is an assumption: it is
8769 // not certain that the outlined function will have to sign its return
8770 // address, but that decision is made later, when the decision to outline
8771 // has already been made.
8772 // The same holds for the number of additional instructions we need: On
8773 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
8774 // necessary. However, at this point we don't know if the outlined function
8775 // will have a RET instruction so we assume the worst.
8776 const TargetRegisterInfo &TRI = getRegisterInfo();
8777 // Performing a tail call may require extra checks when PAuth is enabled.
8778 // If PAuth is disabled, set it to zero for uniformity.
8779 unsigned NumBytesToCheckLRInTCEpilogue = 0;
8780 if (RepeatedSequenceLocs[0]
8781 .getMF()
8782 ->getInfo<AArch64FunctionInfo>()
8783 ->shouldSignReturnAddress(true)) {
8784 // One PAC and one AUT instruction.
8785 NumBytesToCreateFrame += 8;
8787 // PAuth is enabled - set extra tail call cost, if any.
8788 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
8789 *RepeatedSequenceLocs[0].getMF());
8790 NumBytesToCheckLRInTCEpilogue =
8791 AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
8792 // Checking the authenticated LR value may significantly impact
8793 // SequenceSize, so account for it for more precise results.
8794 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
8795 SequenceSize += NumBytesToCheckLRInTCEpilogue;
8797 // We have to check whether sp-modifying instructions would get outlined.
8798 // If so, we only allow outlining if sp is unchanged overall: matching
8799 // sub and add instructions are okay to outline, but all other sp
8800 // modifications are not.
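// Illustration (an editorial sketch, not from the original source): a
// candidate containing the matched pair
//   sub sp, sp, #16
//   ...
//   add sp, sp, #16
// nets out to SPValue == 0 and is acceptable, whereas a lone add sp, sp, #32
// or any other kind of write to sp disqualifies the candidate.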
8801 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
8802 int SPValue = 0;
8803 for (auto &MI : C) {
8804 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
8805 switch (MI.getOpcode()) {
8806 case AArch64::ADDXri:
8807 case AArch64::ADDWri:
8808 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8809 assert(MI.getOperand(2).isImm() &&
8810 "Expected operand to be immediate");
8811 assert(MI.getOperand(1).isReg() &&
8812 "Expected operand to be a register");
8813 // Check if the add just increments sp. If so, we search for
8814 // matching sub instructions that decrement sp. If not, the
8815 // modification is illegal
8816 if (MI.getOperand(1).getReg() == AArch64::SP)
8817 SPValue += MI.getOperand(2).getImm();
8818 else
8819 return true;
8820 break;
8821 case AArch64::SUBXri:
8822 case AArch64::SUBWri:
8823 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8824 assert(MI.getOperand(2).isImm() &&
8825 "Expected operand to be immediate");
8826 assert(MI.getOperand(1).isReg() &&
8827 "Expected operand to be a register");
8828 // Check if the sub just decrements sp. If so, we search for
8829 // matching add instructions that increment sp. If not, the
8830 // modification is illegal
8831 if (MI.getOperand(1).getReg() == AArch64::SP)
8832 SPValue -= MI.getOperand(2).getImm();
8833 else
8834 return true;
8835 break;
8836 default:
8837 return true;
8841 if (SPValue)
8842 return true;
8843 return false;
8845 // Remove candidates with illegal stack modifying instructions
8846 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
8848 // If the sequence doesn't have enough candidates left, then we're done.
8849 if (RepeatedSequenceLocs.size() < MinRepeats)
8850 return std::nullopt;
8853 // Properties about candidate MBBs that hold for all of them.
8854 unsigned FlagsSetInAll = 0xF;
8856 // Compute liveness information for each candidate, and set FlagsSetInAll.
8857 for (outliner::Candidate &C : RepeatedSequenceLocs)
8858 FlagsSetInAll &= C.Flags;
8860 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
8862 // Helper lambda which sets call information for every candidate.
8863 auto SetCandidateCallInfo =
8864 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
8865 for (outliner::Candidate &C : RepeatedSequenceLocs)
8866 C.setCallInfo(CallID, NumBytesForCall);
8869 unsigned FrameID = MachineOutlinerDefault;
8870 NumBytesToCreateFrame += 4;
8872 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
8873 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
8876 // We check to see if CFI Instructions are present, and if they are
8877 // we find the number of CFI Instructions in the candidates.
8878 unsigned CFICount = 0;
8879 for (auto &I : RepeatedSequenceLocs[0]) {
8880 if (I.isCFIInstruction())
8881 CFICount++;
8884 // We compare the number of found CFI Instructions to the number of CFI
8885 // instructions in the parent function for each candidate. We must check this
8886 // since if we outline one of the CFI instructions in a function, we have to
8887 // outline them all for correctness. If we do not, the address offsets will be
8888 // incorrect between the two sections of the program.
8889 for (outliner::Candidate &C : RepeatedSequenceLocs) {
8890 std::vector<MCCFIInstruction> CFIInstructions =
8891 C.getMF()->getFrameInstructions();
8893 if (CFICount > 0 && CFICount != CFIInstructions.size())
8894 return std::nullopt;
8897 // Returns true if an instruction is safe to fix up, false otherwise.
8898 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
8899 if (MI.isCall())
8900 return true;
8902 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
8903 !MI.readsRegister(AArch64::SP, &TRI))
8904 return true;
8906 // Any modification of SP will break our code to save/restore LR.
8907 // FIXME: We could handle some instructions which add a constant
8908 // offset to SP, with a bit more work.
8909 if (MI.modifiesRegister(AArch64::SP, &TRI))
8910 return false;
8912 // At this point, we have a stack instruction that we might need to
8913 // fix up. We'll handle it if it's a load or store.
8914 if (MI.mayLoadOrStore()) {
8915 const MachineOperand *Base; // Filled with the base operand of MI.
8916 int64_t Offset; // Filled with the offset of MI.
8917 bool OffsetIsScalable;
8919 // Does it allow us to offset the base operand and is the base the
8920 // register SP?
8921 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
8922 !Base->isReg() || Base->getReg() != AArch64::SP)
8923 return false;
8925 // Fix-up code below assumes byte offsets.
8926 if (OffsetIsScalable)
8927 return false;
8929 // Find the minimum/maximum offset for this instruction and check
8930 // if fixing it up would be in range.
8931 int64_t MinOffset,
8932 MaxOffset; // Unscaled offsets for the instruction.
8933 // The scale to multiply the offsets by.
8934 TypeSize Scale(0U, false), DummyWidth(0U, false);
8935 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
8937 Offset += 16; // Update the offset to what it would be if we outlined.
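// Worked example with assumed values (editorial, not from the original
// source): for ldrb w0, [sp, #4088], Scale is 1 and MaxOffset is 4095, so
// the post-outlining offset 4088 + 16 = 4104 no longer fits and the
// instruction is rejected.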
8938 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
8939 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
8940 return false;
8942 // It's in range, so we can outline it.
8943 return true;
8946 // FIXME: Add handling for instructions like "add x0, sp, #8".
8948 // We can't fix it up, so don't outline it.
8949 return false;
8952 // True if it's possible to fix up each stack instruction in this sequence.
8953 // Important for frames/call variants that modify the stack.
8954 bool AllStackInstrsSafe =
8955 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
8957 // If the last instruction in any candidate is a terminator, then we should
8958 // tail call all of the candidates.
8959 if (RepeatedSequenceLocs[0].back().isTerminator()) {
8960 FrameID = MachineOutlinerTailCall;
8961 NumBytesToCreateFrame = 0;
8962 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
8963 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
8966 else if (LastInstrOpcode == AArch64::BL ||
8967 ((LastInstrOpcode == AArch64::BLR ||
8968 LastInstrOpcode == AArch64::BLRNoIP) &&
8969 !HasBTI)) {
8970 // FIXME: Do we need to check if the code after this uses the value of LR?
8971 FrameID = MachineOutlinerThunk;
8972 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
8973 SetCandidateCallInfo(MachineOutlinerThunk, 4);
8976 else {
8977 // We need to decide how to emit calls + frames. We can always emit the same
8978 // frame if we don't need to save to the stack. If we have to save to the
8979 // stack, then we need a different frame.
8980 unsigned NumBytesNoStackCalls = 0;
8981 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
8983 // Check if we have to save LR.
8984 for (outliner::Candidate &C : RepeatedSequenceLocs) {
8985 bool LRAvailable =
8986 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
8987 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
8988 : true;
8989 // If we have a noreturn caller, then we're going to be conservative and
8990 // say that we have to save LR. If we don't have a ret at the end of the
8991 // block, then we can't reason about liveness accurately.
8993 // FIXME: We can probably do better than always disabling this in
8994 // noreturn functions by fixing up the liveness info.
8995 bool IsNoReturn =
8996 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
8998 // Is LR available? If so, we don't need a save.
8999 if (LRAvailable && !IsNoReturn) {
9000 NumBytesNoStackCalls += 4;
9001 C.setCallInfo(MachineOutlinerNoLRSave, 4);
9002 CandidatesWithoutStackFixups.push_back(C);
9005 // Is an unused register available? If so, we won't modify the stack, so
9006 // we can outline with the same frame type as those that don't save LR.
9007 else if (findRegisterToSaveLRTo(C)) {
9008 NumBytesNoStackCalls += 12;
9009 C.setCallInfo(MachineOutlinerRegSave, 12);
9010 CandidatesWithoutStackFixups.push_back(C);
9013 // Is SP used in the sequence at all? If not, we don't have to modify
9014 // the stack, so we are guaranteed to get the same frame.
9015 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
9016 NumBytesNoStackCalls += 12;
9017 C.setCallInfo(MachineOutlinerDefault, 12);
9018 CandidatesWithoutStackFixups.push_back(C);
9021 // If we outline this, we need to modify the stack. Pretend we don't
9022 // outline this by saving all of its bytes.
9023 else {
9024 NumBytesNoStackCalls += SequenceSize;
9028 // If there are no places where we have to save LR, then note that we
9029 // don't have to update the stack. Otherwise, give every candidate the
9030 // default call type, as long as it's safe to do so.
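// Worked example with hypothetical counts (editorial): for three candidates
// where two can use MachineOutlinerNoLRSave (4 bytes each) and one needs
// MachineOutlinerRegSave (12 bytes), NumBytesNoStackCalls is 4 + 4 + 12 = 20,
// which is <= 3 * 12 = 36, so the stack-fixup-free call variants are kept.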
9031 if (!AllStackInstrsSafe ||
9032 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
9033 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
9034 FrameID = MachineOutlinerNoLRSave;
9035 if (RepeatedSequenceLocs.size() < MinRepeats)
9036 return std::nullopt;
9037 } else {
9038 SetCandidateCallInfo(MachineOutlinerDefault, 12);
9040 // Bugzilla ID: 46767
9041 // TODO: Check if fixing up the stack more than once is safe so we can
9042 // outline these.
9044 // An outline resulting in a caller that requires stack fixups at the
9045 // callsite to a callee that also requires stack fixups can happen when
9046 // there are no available registers at the candidate callsite for a
9047 // candidate that itself also has calls.
9049 // In other words if function_containing_sequence in the following pseudo
9050 // assembly requires that we save LR at the point of the call, but there
9051 // are no available registers: in this case we save using SP and as a
9052 // result the SP offsets require stack fixups by multiples of 16.
9054 // function_containing_sequence:
9055 // ...
9056 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
9057 // call OUTLINED_FUNCTION_N
9058 // restore LR from SP
9059 // ...
9061 // OUTLINED_FUNCTION_N:
9062 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
9063 // ...
9064 // bl foo
9065 // restore LR from SP
9066 // ret
9068 // Because the code to handle more than one stack fixup does not
9069 // currently have the proper checks for legality, these cases will assert
9070 // in the AArch64 MachineOutliner. This is because the code to do this
9071 // needs more hardening, testing, better checks that generated code is
9072 // legal, etc., and because it is only verified to handle a single pass of
9073 // stack fixup.
9075 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
9076 // these cases until they are known to be handled. Bugzilla 46767 is
9077 // referenced in comments at the assert site.
9079 // To avoid asserting (or generating non-legal code on noassert builds)
9080 // we remove all candidates which would need more than one stack fixup by
9081 // pruning the cases where the candidate has calls while also having no
9082 // available LR and having no available general purpose registers to copy
9083 // LR to (i.e. one extra stack save/restore).
9085 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
9086 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
9087 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
9088 return (llvm::any_of(C, IsCall)) &&
9089 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
9090 !findRegisterToSaveLRTo(C));
9095 // If we dropped all of the candidates, bail out here.
9096 if (RepeatedSequenceLocs.size() < MinRepeats)
9097 return std::nullopt;
9100 // Does every candidate's MBB contain a call? If so, then we might have a call
9101 // in the range.
9102 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
9103 // Check if the range contains a call. These require a save + restore of the
9104 // link register.
9105 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
9106 bool ModStackToSaveLR = false;
9107 if (any_of(drop_end(FirstCand),
9108 [](const MachineInstr &MI) { return MI.isCall(); }))
9109 ModStackToSaveLR = true;
9111 // Handle the last instruction separately. If this is a tail call, then the
9112 // last instruction is a call. We don't want to save + restore in this case.
9113 // However, it could be possible that the last instruction is a call without
9114 // it being valid to tail call this sequence. We should consider this as
9115 // well.
9116 else if (FrameID != MachineOutlinerThunk &&
9117 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
9118 ModStackToSaveLR = true;
9120 if (ModStackToSaveLR) {
9121 // We can't fix up the stack. Bail out.
9122 if (!AllStackInstrsSafe)
9123 return std::nullopt;
9125 // Save + restore LR.
9126 NumBytesToCreateFrame += 8;
9130 // If we have CFI instructions, we can only outline if the outlined section
9131 // can be a tail call
9132 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
9133 return std::nullopt;
9135 return std::make_unique<outliner::OutlinedFunction>(
9136 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
9139 void AArch64InstrInfo::mergeOutliningCandidateAttributes(
9140 Function &F, std::vector<outliner::Candidate> &Candidates) const {
9141 // If a bunch of candidates reach this point they must agree on their return
9142 // address signing. It is therefore enough to just consider the signing
9143 // behaviour of one of them
9144 const auto &CFn = Candidates.front().getMF()->getFunction();
9146 if (CFn.hasFnAttribute("ptrauth-returns"))
9147 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
9148 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
9149 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
9150 // Since all candidates belong to the same module, just copy the
9151 // function-level attributes of an arbitrary function.
9152 if (CFn.hasFnAttribute("sign-return-address"))
9153 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
9154 if (CFn.hasFnAttribute("sign-return-address-key"))
9155 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
9157 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
9160 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
9161 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
9162 const Function &F = MF.getFunction();
9164 // Can F be deduplicated by the linker? If it can, don't outline from it.
9165 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
9166 return false;
9168 // Don't outline from functions with section markings; the program could
9169 // expect that all the code is in the named section.
9170 // FIXME: Allow outlining from multiple functions with the same section
9171 // marking.
9172 if (F.hasSection())
9173 return false;
9175 // Outlining from functions with redzones is unsafe since the outliner may
9176 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
9177 // outline from it.
9178 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
9179 if (!AFI || AFI->hasRedZone().value_or(true))
9180 return false;
9182 // FIXME: Determine whether it is safe to outline from functions which contain
9183 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
9184 // outlined together and ensure it is safe to outline with async unwind info,
9185 // required for saving & restoring VG around calls.
9186 if (AFI->hasStreamingModeChanges())
9187 return false;
9189 // FIXME: Teach the outliner to generate/handle Windows unwind info.
9190 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
9191 return false;
9193 // It's safe to outline from MF.
9194 return true;
9197 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
9198 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
9199 unsigned &Flags) const {
9200 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
9201 "Must track liveness!");
9202 SmallVector<
9203 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
9204 Ranges;
9205 // According to the AArch64 Procedure Call Standard, the following are
9206 // undefined on entry/exit from a function call:
9208 // * Registers x16, x17, (and thus w16, w17)
9209 // * Condition codes (and thus the NZCV register)
9211 // If any of these registers are used inside or live across an outlined
9212 // function, then they may be modified later, either by the compiler or
9213 // some other tool (like the linker).
9215 // To avoid outlining in these situations, partition each block into ranges
9216 // where these registers are dead. We will only outline from those ranges.
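// For example (an editorial sketch of one such tool's behaviour): a
// range-extension veneer the linker may insert between the call site and the
// outlined function typically looks like
//   adrp x16, OUTLINED_FUNCTION
//   add  x16, x16, :lo12:OUTLINED_FUNCTION
//   br   x16
// and therefore clobbers x16.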
9217 LiveRegUnits LRU(getRegisterInfo());
9218 auto AreAllUnsafeRegsDead = [&LRU]() {
9219 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
9220 LRU.available(AArch64::NZCV);
9223 // We need to know if LR is live across an outlining boundary later on in
9224 // order to decide how we'll create the outlined call, frame, etc.
9226 // It's pretty expensive to check this for *every candidate* within a block.
9227 // That's some potentially n^2 behaviour, since in the worst case, we'd need
9228 // to compute liveness from the end of the block for O(n) candidates within
9229 // the block.
9231 // So, to improve the average case, let's keep track of liveness from the end
9232 // of the block to the beginning of *every outlinable range*. If we know that
9233 // LR is available in every range we could outline from, then we know that
9234 // we don't need to check liveness for any candidate within that range.
9235 bool LRAvailableEverywhere = true;
9236 // Compute liveness bottom-up.
9237 LRU.addLiveOuts(MBB);
9238 // Update flags that require info about the entire MBB.
9239 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
9240 if (MI.isCall() && !MI.isTerminator())
9241 Flags |= MachineOutlinerMBBFlags::HasCalls;
9243 // Range: [RangeBegin, RangeEnd)
9244 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
9245 unsigned RangeLen;
9246 auto CreateNewRangeStartingAt =
9247 [&RangeBegin, &RangeEnd,
9248 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
9249 RangeBegin = NewBegin;
9250 RangeEnd = std::next(RangeBegin);
9251 RangeLen = 0;
9253 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
9254 // At least one unsafe register is not dead. We do not want to outline at
9255 // this point. If it is long enough to outline from, save the range
9256 // [RangeBegin, RangeEnd).
9257 if (RangeLen > 1)
9258 Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));
9260 // Find the first point where all unsafe registers are dead.
9261 // FIND: <safe instr> <-- end of first potential range
9262 // SKIP: <unsafe def>
9263 // SKIP: ... everything between ...
9264 // SKIP: <unsafe use>
9265 auto FirstPossibleEndPt = MBB.instr_rbegin();
9266 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
9267 LRU.stepBackward(*FirstPossibleEndPt);
9268 // Update flags that impact how we outline across the entire block,
9269 // regardless of safety.
9270 UpdateWholeMBBFlags(*FirstPossibleEndPt);
9271 if (AreAllUnsafeRegsDead())
9272 break;
9274 // If we exhausted the entire block, we have no safe ranges to outline.
9275 if (FirstPossibleEndPt == MBB.instr_rend())
9276 return Ranges;
9277 // Current range.
9278 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
9279 // FirstPossibleEndPt points to the first place where all unsafe registers
9280 // are dead (if there is any such point). Begin partitioning the MBB into
9281 // ranges.
9282 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
9283 LRU.stepBackward(MI);
9284 UpdateWholeMBBFlags(MI);
9285 if (!AreAllUnsafeRegsDead()) {
9286 SaveRangeIfNonEmpty();
9287 CreateNewRangeStartingAt(MI.getIterator());
9288 continue;
9290 LRAvailableEverywhere &= LRU.available(AArch64::LR);
9291 RangeBegin = MI.getIterator();
9292 ++RangeLen;
9294 // Above loop misses the last (or only) range. If we are still safe, then
9295 // let's save the range.
9296 if (AreAllUnsafeRegsDead())
9297 SaveRangeIfNonEmpty();
9298 if (Ranges.empty())
9299 return Ranges;
9300 // We found the ranges bottom-up. The mapping expects them top-down, so
9301 // reverse the order.
9302 std::reverse(Ranges.begin(), Ranges.end());
9303 // If there is at least one outlinable range where LR is unavailable
9304 // somewhere, remember that.
9305 if (!LRAvailableEverywhere)
9306 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
9307 return Ranges;
9310 outliner::InstrType
9311 AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
9312 MachineBasicBlock::iterator &MIT,
9313 unsigned Flags) const {
9314 MachineInstr &MI = *MIT;
9315 MachineBasicBlock *MBB = MI.getParent();
9316 MachineFunction *MF = MBB->getParent();
9317 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
9319 // Don't outline anything used for return address signing. The outlined
9320 // function will get signed later if needed
9321 switch (MI.getOpcode()) {
9322 case AArch64::PACM:
9323 case AArch64::PACIASP:
9324 case AArch64::PACIBSP:
9325 case AArch64::PACIASPPC:
9326 case AArch64::PACIBSPPC:
9327 case AArch64::AUTIASP:
9328 case AArch64::AUTIBSP:
9329 case AArch64::AUTIASPPCi:
9330 case AArch64::AUTIASPPCr:
9331 case AArch64::AUTIBSPPCi:
9332 case AArch64::AUTIBSPPCr:
9333 case AArch64::RETAA:
9334 case AArch64::RETAB:
9335 case AArch64::RETAASPPCi:
9336 case AArch64::RETAASPPCr:
9337 case AArch64::RETABSPPCi:
9338 case AArch64::RETABSPPCr:
9339 case AArch64::EMITBKEY:
9340 case AArch64::PAUTH_PROLOGUE:
9341 case AArch64::PAUTH_EPILOGUE:
9342 return outliner::InstrType::Illegal;
9345 // Don't outline LOHs.
9346 if (FuncInfo->getLOHRelated().count(&MI))
9347 return outliner::InstrType::Illegal;
9349 // We can only outline these if we will tail call the outlined function, or
9350 // fix up the CFI offsets. Currently, CFI instructions are outlined only when
9351 // the outlined sequence is a tail call.
9353 // FIXME: If the proper fixups for the offset are implemented, this should be
9354 // possible.
9355 if (MI.isCFIInstruction())
9356 return outliner::InstrType::Legal;
9358 // Is this a terminator for a basic block?
9359 if (MI.isTerminator())
9360 // TargetInstrInfo::getOutliningType has already filtered out anything
9361 // that would break this, so we can allow it here.
9362 return outliner::InstrType::Legal;
9364 // Make sure none of the operands are un-outlinable.
9365 for (const MachineOperand &MOP : MI.operands()) {
9366 // A check preventing CFI indices was here before, but only CFI
9367 // instructions should have those.
9368 assert(!MOP.isCFIIndex());
9370 // If it uses LR or W30 explicitly, then don't touch it.
9371 if (MOP.isReg() && !MOP.isImplicit() &&
9372 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
9373 return outliner::InstrType::Illegal;
9376 // Special cases for instructions that can always be outlined, but will fail
9377 // the later tests, e.g. ADRPs, which are PC-relative use LR, but can always
9378 // be outlined because they don't require a *specific* value to be in LR.
9379 if (MI.getOpcode() == AArch64::ADRP)
9380 return outliner::InstrType::Legal;
9382 // If MI is a call we might be able to outline it. We don't want to outline
9383 // any calls that rely on the position of items on the stack. When we outline
9384 // something containing a call, we have to emit a save and restore of LR in
9385 // the outlined function. Currently, this always happens by saving LR to the
9386 // stack. Thus, if we outline, say, half the parameters for a function call
9387 // plus the call, then we'll break the callee's expectations for the layout
9388 // of the stack.
9390 // FIXME: Allow calls to functions which construct a stack frame, as long
9391 // as they don't access arguments on the stack.
9392 // FIXME: Figure out some way to analyze functions defined in other modules.
9393 // We should be able to compute the memory usage based on the IR calling
9394 // convention, even if we can't see the definition.
9395 if (MI.isCall()) {
9396 // Get the function associated with the call. Look at each operand and find
9397 // the one that represents the callee and get its name.
9398 const Function *Callee = nullptr;
9399 for (const MachineOperand &MOP : MI.operands()) {
9400 if (MOP.isGlobal()) {
9401 Callee = dyn_cast<Function>(MOP.getGlobal());
9402 break;
9406 // Never outline calls to mcount. There isn't any rule that would require
9407 // this, but the Linux kernel's "ftrace" feature depends on it.
9408 if (Callee && Callee->getName() == "\01_mcount")
9409 return outliner::InstrType::Illegal;
9411 // If we don't know anything about the callee, assume it depends on the
9412 // stack layout of the caller. In that case, it's only legal to outline
9413 // as a tail-call. Explicitly list the call instructions we know about so we
9414 // don't get unexpected results with call pseudo-instructions.
9415 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
9416 if (MI.getOpcode() == AArch64::BLR ||
9417 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
9418 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
9420 if (!Callee)
9421 return UnknownCallOutlineType;
9423 // We have a function we have information about. Check whether it's something
9424 // we can safely outline.
9425 MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
9427 // We don't know what's going on with the callee at all. Don't touch it.
9428 if (!CalleeMF)
9429 return UnknownCallOutlineType;
9431 // Check if we know anything about the callee saves on the function. If we
9432 // don't, then don't touch it, since that implies that we haven't
9433 // computed anything about its stack frame yet.
9434 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
9435 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
9436 MFI.getNumObjects() > 0)
9437 return UnknownCallOutlineType;
9439 // At this point, we can say that CalleeMF ought to not pass anything on the
9440 // stack. Therefore, we can outline it.
9441 return outliner::InstrType::Legal;
9444 // Don't touch the link register or W30.
9445 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
9446 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
9447 return outliner::InstrType::Illegal;
9449 // Don't outline BTI instructions, because that will prevent the outlining
9450 // site from being indirectly callable.
9451 if (hasBTISemantics(MI))
9452 return outliner::InstrType::Illegal;
9454 return outliner::InstrType::Legal;
9457 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
9458 for (MachineInstr &MI : MBB) {
9459 const MachineOperand *Base;
9460 TypeSize Width(0, false);
9461 int64_t Offset;
9462 bool OffsetIsScalable;
9464 // Is this a load or store with an immediate offset with SP as the base?
9465 if (!MI.mayLoadOrStore() ||
9466 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
9467 &RI) ||
9468 (Base->isReg() && Base->getReg() != AArch64::SP))
9469 continue;
9471 // It is, so we have to fix it up.
9472 TypeSize Scale(0U, false);
9473 int64_t Dummy1, Dummy2;
9475 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
9476 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
9477 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
9478 assert(Scale != 0 && "Unexpected opcode!");
9479 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
9481 // We've pushed the return address to the stack, so add 16 to the offset.
9482 // This is safe, since we already checked if it would overflow when we
9483 // checked if this instruction was legal to outline.
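// Worked example with assumed values (editorial, not from the original
// source): for ldr x0, [sp, #8], Scale is 8 and Offset is 8 bytes; with LR
// spilled 16 bytes below the old sp the data now sits at sp + 24, so
// NewImm = (8 + 16) / 8 = 3 and the instruction becomes ldr x0, [sp, #24].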
9484 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
9485 StackOffsetOperand.setImm(NewImm);
9489 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
9490 const AArch64InstrInfo *TII,
9491 bool ShouldSignReturnAddr) {
9492 if (!ShouldSignReturnAddr)
9493 return;
9495 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
9496 .setMIFlag(MachineInstr::FrameSetup);
9497 BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
9498 TII->get(AArch64::PAUTH_EPILOGUE))
9499 .setMIFlag(MachineInstr::FrameDestroy);
9502 void AArch64InstrInfo::buildOutlinedFrame(
9503 MachineBasicBlock &MBB, MachineFunction &MF,
9504 const outliner::OutlinedFunction &OF) const {
9506 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
9508 if (OF.FrameConstructionID == MachineOutlinerTailCall)
9509 FI->setOutliningStyle("Tail Call");
9510 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
9511 // For thunk outlining, rewrite the last instruction from a call to a
9512 // tail-call.
9513 MachineInstr *Call = &*--MBB.instr_end();
9514 unsigned TailOpcode;
9515 if (Call->getOpcode() == AArch64::BL) {
9516 TailOpcode = AArch64::TCRETURNdi;
9517 } else {
9518 assert(Call->getOpcode() == AArch64::BLR ||
9519 Call->getOpcode() == AArch64::BLRNoIP);
9520 TailOpcode = AArch64::TCRETURNriALL;
9522 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
9523 .add(Call->getOperand(0))
9524 .addImm(0);
9525 MBB.insert(MBB.end(), TC);
9526 Call->eraseFromParent();
9528 FI->setOutliningStyle("Thunk");
9531 bool IsLeafFunction = true;
9533 // Is there a call in the outlined range?
9534 auto IsNonTailCall = [](const MachineInstr &MI) {
9535 return MI.isCall() && !MI.isReturn();
9538 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
9539 // Fix up the instructions in the range, since we're going to modify the
9540 // stack.
9542 // Bugzilla ID: 46767
9543 // TODO: Check if fixing up twice is safe so we can outline these.
9544 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
9545 "Can only fix up stack references once");
9546 fixupPostOutline(MBB);
9548 IsLeafFunction = false;
9550 // LR has to be a live in so that we can save it.
9551 if (!MBB.isLiveIn(AArch64::LR))
9552 MBB.addLiveIn(AArch64::LR);
9554 MachineBasicBlock::iterator It = MBB.begin();
9555 MachineBasicBlock::iterator Et = MBB.end();
9557 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9558 OF.FrameConstructionID == MachineOutlinerThunk)
9559 Et = std::prev(MBB.end());
9561 // Insert a save before the outlined region
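// The save/restore pair emitted here encodes roughly as (editorial sketch):
//   str x30, [sp, #-16]!   ; STRXpre: spill LR and drop sp by 16 bytes
//   ...outlined body...
//   ldr x30, [sp], #16     ; LDRXpost: reload LR and restore sp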
9562 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9563 .addReg(AArch64::SP, RegState::Define)
9564 .addReg(AArch64::LR)
9565 .addReg(AArch64::SP)
9566 .addImm(-16);
9567 It = MBB.insert(It, STRXpre);
9569 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
9570 const TargetSubtargetInfo &STI = MF.getSubtarget();
9571 const MCRegisterInfo *MRI = STI.getRegisterInfo();
9572 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
9574 // Add a CFI saying the stack was moved 16 B down.
9575 int64_t StackPosEntry =
9576 MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
9577 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9578 .addCFIIndex(StackPosEntry)
9579 .setMIFlags(MachineInstr::FrameSetup);
9581 // Add a CFI saying that the LR that we want to find is now 16 B higher
9582 // than before.
9583 int64_t LRPosEntry = MF.addFrameInst(
9584 MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
9585 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9586 .addCFIIndex(LRPosEntry)
9587 .setMIFlags(MachineInstr::FrameSetup);
9590 // Insert a restore before the terminator for the function.
9591 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9592 .addReg(AArch64::SP, RegState::Define)
9593 .addReg(AArch64::LR, RegState::Define)
9594 .addReg(AArch64::SP)
9595 .addImm(16);
9596 Et = MBB.insert(Et, LDRXpost);
9599 bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction);
9601 // If this is a tail call outlined function, then there's already a return.
9602 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9603 OF.FrameConstructionID == MachineOutlinerThunk) {
9604 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9605 return;
9608 // It's not a tail call, so we have to insert the return ourselves.
9610 // LR has to be a live in so that we can return to it.
9611 if (!MBB.isLiveIn(AArch64::LR))
9612 MBB.addLiveIn(AArch64::LR);
9614 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
9615 .addReg(AArch64::LR);
9616 MBB.insert(MBB.end(), ret);
9618 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9620 FI->setOutliningStyle("Function");
9622 // Did we have to modify the stack by saving the link register?
9623 if (OF.FrameConstructionID != MachineOutlinerDefault)
9624 return;
9626 // We modified the stack.
9627 // Walk over the basic block and fix up all the stack accesses.
9628 fixupPostOutline(MBB);
9631 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
9632 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
9633 MachineFunction &MF, outliner::Candidate &C) const {
9635 // Are we tail calling?
9636 if (C.CallConstructionID == MachineOutlinerTailCall) {
9637 // If yes, then we can just branch to the label.
9638 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
9639 .addGlobalAddress(M.getNamedValue(MF.getName()))
9640 .addImm(0));
9641 return It;
9644 // Are we saving the link register?
9645 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
9646 C.CallConstructionID == MachineOutlinerThunk) {
9647 // No, so just insert the call.
9648 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9649 .addGlobalAddress(M.getNamedValue(MF.getName())));
9650 return It;
9653 // We want to return the spot where we inserted the call.
9654 MachineBasicBlock::iterator CallPt;
9656 // Instructions for saving and restoring LR around the call instruction we're
9657 // going to insert.
9658 MachineInstr *Save;
9659 MachineInstr *Restore;
9660 // Can we save to a register?
9661 if (C.CallConstructionID == MachineOutlinerRegSave) {
9662 // FIXME: This logic should be sunk into a target-specific interface so that
9663 // we don't have to recompute the register.
9664 Register Reg = findRegisterToSaveLRTo(C);
9665 assert(Reg && "No callee-saved register available?");
9667 // LR has to be a live in so that we can save it.
9668 if (!MBB.isLiveIn(AArch64::LR))
9669 MBB.addLiveIn(AArch64::LR);
9671 // Save and restore LR from Reg.
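// The resulting call site is roughly (editorial sketch, xN standing for
// whichever register findRegisterToSaveLRTo picked):
//   mov xN, x30            ; ORRXrs Reg, xzr, lr, #0
//   bl  OUTLINED_FUNCTION
//   mov x30, xN            ; ORRXrs lr, xzr, Reg, #0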
9672 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
9673 .addReg(AArch64::XZR)
9674 .addReg(AArch64::LR)
9675 .addImm(0);
9676 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
9677 .addReg(AArch64::XZR)
9678 .addReg(Reg)
9679 .addImm(0);
9680 } else {
9681 // We have the default case. Save and restore from SP.
9682 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9683 .addReg(AArch64::SP, RegState::Define)
9684 .addReg(AArch64::LR)
9685 .addReg(AArch64::SP)
9686 .addImm(-16);
9687 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9688 .addReg(AArch64::SP, RegState::Define)
9689 .addReg(AArch64::LR, RegState::Define)
9690 .addReg(AArch64::SP)
9691 .addImm(16);
9694 It = MBB.insert(It, Save);
9695 It++;
9697 // Insert the call.
9698 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9699 .addGlobalAddress(M.getNamedValue(MF.getName())));
9700 CallPt = It;
9701 It++;
9703 It = MBB.insert(It, Restore);
9704 return CallPt;
9707 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
9708 MachineFunction &MF) const {
9709 return MF.getFunction().hasMinSize();
9712 void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
9713 MachineBasicBlock::iterator Iter,
9714 DebugLoc &DL,
9715 bool AllowSideEffects) const {
9716 const MachineFunction &MF = *MBB.getParent();
9717 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
9718 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
9720 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
9721 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
9722 } else if (STI.isSVEorStreamingSVEAvailable()) {
9723 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
9724 .addImm(0)
9725 .addImm(0);
9726 } else if (STI.isNeonAvailable()) {
9727 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
9728 .addImm(0);
9729 } else {
9730 // This is a streaming-compatible function without SVE. We don't have full
9731 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
9732 // Since `movi v..` would be illegal here, use `fmov d..` instead.
9733 assert(STI.hasNEON() && "Expected to have NEON.");
9734 Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
9735 BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
9736 }
9737 }
9739 std::optional<DestSourcePair>
9740 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
9742 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
9743 // and a zero immediate operand are used as an alias for the mov instruction.
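// Illustrative note (added, not in the upstream source): e.g. "mov x0, x1"
// is the preferred disassembly of "orr x0, xzr, x1, lsl #0".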
9744 if (((MI.getOpcode() == AArch64::ORRWrs &&
9745 MI.getOperand(1).getReg() == AArch64::WZR &&
9746 MI.getOperand(3).getImm() == 0x0) ||
9747 (MI.getOpcode() == AArch64::ORRWrr &&
9748 MI.getOperand(1).getReg() == AArch64::WZR)) &&
9749 // Check that the w->w move is not a zero-extending w->x mov.
9750 (!MI.getOperand(0).getReg().isVirtual() ||
9751 MI.getOperand(0).getSubReg() == 0) &&
9752 (!MI.getOperand(0).getReg().isPhysical() ||
9753 MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 +
9754 AArch64::X0,
9755 /*TRI=*/nullptr) == -1))
9756 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9758 if (MI.getOpcode() == AArch64::ORRXrs &&
9759 MI.getOperand(1).getReg() == AArch64::XZR &&
9760 MI.getOperand(3).getImm() == 0x0)
9761 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9763 return std::nullopt;
9764 }
9766 std::optional<DestSourcePair>
9767 AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
9768 if ((MI.getOpcode() == AArch64::ORRWrs &&
9769 MI.getOperand(1).getReg() == AArch64::WZR &&
9770 MI.getOperand(3).getImm() == 0x0) ||
9771 (MI.getOpcode() == AArch64::ORRWrr &&
9772 MI.getOperand(1).getReg() == AArch64::WZR))
9773 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9774 return std::nullopt;
9775 }
9777 std::optional<RegImmPair>
9778 AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
9779 int Sign = 1;
9780 int64_t Offset = 0;
9782 // TODO: Handle cases where Reg is a super- or sub-register of the
9783 // destination register.
9784 const MachineOperand &Op0 = MI.getOperand(0);
9785 if (!Op0.isReg() || Reg != Op0.getReg())
9786 return std::nullopt;
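// Illustrative examples (added, not in the upstream source):
// "add x0, x1, #16" yields {x1, +16}, and "sub x0, x1, #1, lsl #12"
// yields {x1, -4096}.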
9788 switch (MI.getOpcode()) {
9789 default:
9790 return std::nullopt;
9791 case AArch64::SUBWri:
9792 case AArch64::SUBXri:
9793 case AArch64::SUBSWri:
9794 case AArch64::SUBSXri:
9795 Sign *= -1;
9796 [[fallthrough]];
9797 case AArch64::ADDSWri:
9798 case AArch64::ADDSXri:
9799 case AArch64::ADDWri:
9800 case AArch64::ADDXri: {
9801 // TODO: The third operand can be a global address (usually some string).
9802 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
9803 !MI.getOperand(2).isImm())
9804 return std::nullopt;
9805 int Shift = MI.getOperand(3).getImm();
9806 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
9807 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
9808 }
9809 }
9810 return RegImmPair{MI.getOperand(1).getReg(), Offset};
9811 }
9813 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
9814 /// the destination register then, if possible, describe the value in terms of
9815 /// the source register.
9816 static std::optional<ParamLoadedValue>
9817 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
9818 const TargetInstrInfo *TII,
9819 const TargetRegisterInfo *TRI) {
9820 auto DestSrc = TII->isCopyLikeInstr(MI);
9821 if (!DestSrc)
9822 return std::nullopt;
9824 Register DestReg = DestSrc->Destination->getReg();
9825 Register SrcReg = DestSrc->Source->getReg();
9827 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
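// Illustrative examples (added, not in the upstream source): for
// "orr w0, wzr, w1" describing X0 also yields W1 (the W->W move implicitly
// zeroes the top half of X0); for "orr x0, xzr, x1" describing W0 yields W1
// via the sub_32 sub-register.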
9829 // If the described register is the destination, just return the source.
9830 if (DestReg == DescribedReg)
9831 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9833 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
9834 if (MI.getOpcode() == AArch64::ORRWrs &&
9835 TRI->isSuperRegister(DestReg, DescribedReg))
9836 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9838 // We may need to describe the lower part of a ORRXrs move.
9839 if (MI.getOpcode() == AArch64::ORRXrs &&
9840 TRI->isSubRegister(DestReg, DescribedReg)) {
9841 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
9842 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
9843 }
9845 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
9846 "Unhandled ORR[XW]rs copy case");
9848 return std::nullopt;
9849 }
9851 bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
9852 // Functions cannot be split to different sections on AArch64 if they have
9853 // a red zone. This is because relaxing a cross-section branch may require
9854 // incrementing the stack pointer to spill a register, which would overwrite
9855 // the red zone.
9856 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
9857 return false;
9859 return TargetInstrInfo::isFunctionSafeToSplit(MF);
9860 }
9862 bool AArch64InstrInfo::isMBBSafeToSplitToCold(
9863 const MachineBasicBlock &MBB) const {
9864 // Asm Goto blocks can contain conditional branches to goto labels, which can
9865 // get moved out of range of the branch instruction.
9866 auto isAsmGoto = [](const MachineInstr &MI) {
9867 return MI.getOpcode() == AArch64::INLINEASM_BR;
9868 };
9869 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
9870 return false;
9872 // Because jump tables are label-relative instead of table-relative, they all
9873 // must be in the same section or relocation fixup handling will fail.
9875 // Check if MBB is a jump table target
9876 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
9877 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
9878 return llvm::is_contained(JTE.MBBs, &MBB);
9879 };
9880 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
9881 return false;
9883 // Check if MBB contains a jump table lookup
9884 for (const MachineInstr &MI : MBB) {
9885 switch (MI.getOpcode()) {
9886 case TargetOpcode::G_BRJT:
9887 case AArch64::JumpTableDest32:
9888 case AArch64::JumpTableDest16:
9889 case AArch64::JumpTableDest8:
9890 return false;
9891 default:
9892 continue;
9893 }
9894 }
9896 // MBB isn't a special case, so it's safe to be split to the cold section.
9897 return true;
9898 }
9900 std::optional<ParamLoadedValue>
9901 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
9902 Register Reg) const {
9903 const MachineFunction *MF = MI.getMF();
9904 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
9905 switch (MI.getOpcode()) {
9906 case AArch64::MOVZWi:
9907 case AArch64::MOVZXi: {
9908 // MOVZWi may be used for producing zero-extended 32-bit immediates in
9909 // 64-bit parameters, so we need to consider super-registers.
9910 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
9911 return std::nullopt;
9913 if (!MI.getOperand(1).isImm())
9914 return std::nullopt;
9915 int64_t Immediate = MI.getOperand(1).getImm();
9916 int Shift = MI.getOperand(2).getImm();
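// Illustrative example (added, not in the upstream source):
// "movz w0, #42, lsl #16" is described as the immediate 42 << 16 == 2752512.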
9917 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
9918 nullptr);
9919 }
9920 case AArch64::ORRWrs:
9921 case AArch64::ORRXrs:
9922 return describeORRLoadedValue(MI, Reg, this, TRI);
9923 }
9925 return TargetInstrInfo::describeLoadedValue(MI, Reg);
9926 }
9928 bool AArch64InstrInfo::isExtendLikelyToBeFolded(
9929 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
9930 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
9931 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
9932 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
9934 // Anyexts are nops.
9935 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
9936 return true;
9938 Register DefReg = ExtMI.getOperand(0).getReg();
9939 if (!MRI.hasOneNonDBGUse(DefReg))
9940 return false;
9942 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
9943 // addressing mode.
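// Illustrative example (added, not in the upstream source): a G_ZEXT whose
// only use is the offset of a G_PTR_ADD can typically become an
// extended-register access such as "ldr x2, [x0, w1, uxtw]".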
9944 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
9945 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
9946 }
9948 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
9949 return get(Opc).TSFlags & AArch64::ElementSizeMask;
9950 }
9952 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
9953 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
9954 }
9956 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
9957 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
9958 }
9960 unsigned int
9961 AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
9962 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
9963 }
9965 bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
9966 unsigned Scale) const {
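// Illustrative examples (added, not in the upstream source), for NumBytes == 8:
// Offset -256 with Scale 0 fits the 9-bit signed form; Offset 32760 (8 * 4095)
// with Scale 0 fits the 12-bit scaled unsigned form; Scale 8 with Offset 0
// matches the reg + 8 * reg form; a non-zero Offset combined with a non-zero
// Scale is rejected.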
9967 if (Offset && Scale)
9968 return false;
9970 // Check Reg + Imm
9971 if (!Scale) {
9972 // 9-bit signed offset
9973 if (isInt<9>(Offset))
9974 return true;
9976 // 12-bit unsigned offset
9977 unsigned Shift = Log2_64(NumBytes);
9978 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
9979 // Must be a multiple of NumBytes (NumBytes is a power of 2)
9980 (Offset >> Shift) << Shift == Offset)
9981 return true;
9982 return false;
9983 }
9985 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
9986 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
9987 }
9989 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
9990 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
9991 return AArch64::BLRNoIP;
9992 else
9993 return AArch64::BLR;
9994 }
9996 MachineBasicBlock::iterator
9997 AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
9998 Register TargetReg, bool FrameSetup) const {
9999 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
10001 MachineBasicBlock &MBB = *MBBI->getParent();
10002 MachineFunction &MF = *MBB.getParent();
10003 const AArch64InstrInfo *TII =
10004 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
10005 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
10006 DebugLoc DL = MBB.findDebugLoc(MBBI);
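// Overview (added, not in the upstream source): the blocks built below form a
// probe loop, roughly:
//   LoopTest: sub sp, sp, #ProbeSize; cmp sp, TargetReg; b.le LoopExit
//   LoopBody: str xzr, [sp]; b LoopTest
//   LoopExit: mov sp, TargetReg; ldr xzr, [sp]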
10008 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
10009 MachineBasicBlock *LoopTestMBB =
10010 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10011 MF.insert(MBBInsertPoint, LoopTestMBB);
10012 MachineBasicBlock *LoopBodyMBB =
10013 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10014 MF.insert(MBBInsertPoint, LoopBodyMBB);
10015 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10016 MF.insert(MBBInsertPoint, ExitMBB);
10017 MachineInstr::MIFlag Flags =
10018 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
10020 // LoopTest:
10021 // SUB SP, SP, #ProbeSize
10022 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
10023 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
10025 // CMP SP, TargetReg
10026 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
10027 AArch64::XZR)
10028 .addReg(AArch64::SP)
10029 .addReg(TargetReg)
10030 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
10031 .setMIFlags(Flags);
10033 // B.<Cond> LoopExit
10034 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
10035 .addImm(AArch64CC::LE)
10036 .addMBB(ExitMBB)
10037 .setMIFlags(Flags);
10039 // STR XZR, [SP]
10040 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
10041 .addReg(AArch64::XZR)
10042 .addReg(AArch64::SP)
10043 .addImm(0)
10044 .setMIFlags(Flags);
10046 // B loop
10047 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
10048 .addMBB(LoopTestMBB)
10049 .setMIFlags(Flags);
10051 // LoopExit:
10052 // MOV SP, TargetReg
10053 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
10054 .addReg(TargetReg)
10055 .addImm(0)
10056 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
10057 .setMIFlags(Flags);
10059 // LDR XZR, [SP]
10060 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
10061 .addReg(AArch64::XZR, RegState::Define)
10062 .addReg(AArch64::SP)
10063 .addImm(0)
10064 .setMIFlags(Flags);
10066 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
10067 ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
10069 LoopTestMBB->addSuccessor(ExitMBB);
10070 LoopTestMBB->addSuccessor(LoopBodyMBB);
10071 LoopBodyMBB->addSuccessor(LoopTestMBB);
10072 MBB.addSuccessor(LoopTestMBB);
10074 // Update liveins.
10075 if (MF.getRegInfo().reservedRegsFrozen())
10076 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
10078 return ExitMBB->begin();
10079 }
10081 namespace {
10082 class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
10083 MachineFunction *MF;
10084 const TargetInstrInfo *TII;
10085 const TargetRegisterInfo *TRI;
10086 MachineRegisterInfo &MRI;
10088 /// The block of the loop
10089 MachineBasicBlock *LoopBB;
10090 /// The conditional branch of the loop
10091 MachineInstr *CondBranch;
10092 /// The compare instruction for loop control
10093 MachineInstr *Comp;
10094 /// The number of the operand of the loop counter value in Comp
10095 unsigned CompCounterOprNum;
10096 /// The instruction that updates the loop counter value
10097 MachineInstr *Update;
10098 /// The number of the operand of the loop counter value in Update
10099 unsigned UpdateCounterOprNum;
10100 /// The initial value of the loop counter
10101 Register Init;
10102 /// True iff Update is a predecessor of Comp
10103 bool IsUpdatePriorComp;
10105 /// The normalized condition used by createTripCountGreaterCondition()
10106 SmallVector<MachineOperand, 4> Cond;
10108 public:
10109 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
10110 MachineInstr *Comp, unsigned CompCounterOprNum,
10111 MachineInstr *Update, unsigned UpdateCounterOprNum,
10112 Register Init, bool IsUpdatePriorComp,
10113 const SmallVectorImpl<MachineOperand> &Cond)
10114 : MF(Comp->getParent()->getParent()),
10115 TII(MF->getSubtarget().getInstrInfo()),
10116 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
10117 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
10118 CompCounterOprNum(CompCounterOprNum), Update(Update),
10119 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
10120 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
10122 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
10123 // Ensure the instructions for loop control are placed in stage 0.
10124 // The predecessors of Comp are considered by the caller.
10125 return MI == Comp;
10126 }
10128 std::optional<bool> createTripCountGreaterCondition(
10129 int TC, MachineBasicBlock &MBB,
10130 SmallVectorImpl<MachineOperand> &CondParam) override {
10131 // A branch instruction will be inserted as "if (Cond) goto epilogue".
10132 // Cond is normalized for such use.
10133 // The predecessors of the branch are assumed to have already been inserted.
10134 CondParam = Cond;
10135 return {};
10136 }
10138 void createRemainingIterationsGreaterCondition(
10139 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
10140 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
10142 void setPreheader(MachineBasicBlock *NewPreheader) override {}
10144 void adjustTripCount(int TripCountAdjust) override {}
10146 bool isMVEExpanderSupported() override { return true; }
10147 };
10148 } // namespace
10150 /// Clone an instruction from MI. The register of ReplaceOprNum-th operand
10151 /// is replaced by ReplaceReg. The output register is newly created.
10152 /// The other operands are unchanged from MI.
10153 static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
10154 Register ReplaceReg, MachineBasicBlock &MBB,
10155 MachineBasicBlock::iterator InsertTo) {
10156 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10157 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
10158 const TargetRegisterInfo *TRI =
10159 MBB.getParent()->getSubtarget().getRegisterInfo();
10160 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
10161 Register Result = 0;
10162 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
10163 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
10164 Result = MRI.createVirtualRegister(
10165 MRI.getRegClass(NewMI->getOperand(0).getReg()));
10166 NewMI->getOperand(I).setReg(Result);
10167 } else if (I == ReplaceOprNum) {
10168 MRI.constrainRegClass(
10169 ReplaceReg,
10170 TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent()));
10171 NewMI->getOperand(I).setReg(ReplaceReg);
10172 }
10173 }
10174 MBB.insert(InsertTo, NewMI);
10175 return Result;
10176 }
10178 void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
10179 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
10180 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
10181 // Create and accumulate conditions for next TC iterations.
10182 // Example:
10183 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
10184 // # iteration of the kernel
10186 // # insert the following instructions
10187 // cond = CSINCXr 0, 0, C, implicit $nzcv
10188 // counter = ADDXri counter, 1 # clone from this->Update
10189 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
10190 // cond = CSINCXr cond, cond, C, implicit $nzcv
10191 // ... (repeat TC times)
10192 // SUBSXri cond, 0, implicit-def $nzcv
10194 assert(CondBranch->getOpcode() == AArch64::Bcc);
10195 // CondCode to exit the loop
10196 AArch64CC::CondCode CC =
10197 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
10198 if (CondBranch->getOperand(1).getMBB() == LoopBB)
10199 CC = AArch64CC::getInvertedCondCode(CC);
10201 // Accumulate conditions to exit the loop
10202 Register AccCond = AArch64::XZR;
10204 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
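// Note (added, not in the upstream source): "csinc xd, xn, xm, cond" returns
// xn when cond holds and xm + 1 otherwise, hence the inverted condition code
// passed to CSINCXr below.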
10205 auto AccumulateCond = [&](Register CurCond,
10206 AArch64CC::CondCode CC) -> Register {
10207 Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
10208 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
10209 .addReg(NewCond, RegState::Define)
10210 .addReg(CurCond)
10211 .addReg(CurCond)
10212 .addImm(AArch64CC::getInvertedCondCode(CC));
10213 return NewCond;
10214 };
10216 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
10217 // Update and Comp for I==0 already exist in MBB
10218 // (MBB is an unrolled kernel)
10219 Register Counter;
10220 for (int I = 0; I <= TC; ++I) {
10221 Register NextCounter;
10222 if (I != 0)
10223 NextCounter =
10224 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
10226 AccCond = AccumulateCond(AccCond, CC);
10228 if (I != TC) {
10229 if (I == 0) {
10230 if (Update != Comp && IsUpdatePriorComp) {
10231 Counter =
10232 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
10233 NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
10234 MBB.end());
10235 } else {
10236 // Can use the already-computed value.
10237 NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
10238 }
10239 } else if (Update != Comp) {
10240 NextCounter =
10241 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
10242 }
10243 }
10244 Counter = NextCounter;
10245 }
10246 } else {
10247 Register Counter;
10248 if (LastStage0Insts.empty()) {
10249 // Use the initial counter value (to test whether the trip count is large
10250 // enough for the pipelined code to be executed at all).
10251 Counter = Init;
10252 if (IsUpdatePriorComp)
10253 Counter =
10254 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
10255 } else {
10256 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
10257 Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
10258 }
10260 for (int I = 0; I <= TC; ++I) {
10261 Register NextCounter;
10262 NextCounter =
10263 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
10264 AccCond = AccumulateCond(AccCond, CC);
10265 if (I != TC && Update != Comp)
10266 NextCounter =
10267 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
10268 Counter = NextCounter;
10269 }
10270 }
10272 // If AccCond == 0, the remainder is greater than TC.
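// Note (added, not in the upstream source): the SUBSXri below compares AccCond
// with 0, so the EQ condition pushed into Cond holds exactly when none of the
// probes above detected the exit condition, i.e. more than TC iterations
// remain.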
10273 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
10274 .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
10275 .addReg(AccCond)
10276 .addImm(0)
10277 .addImm(0);
10278 Cond.clear();
10279 Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
10280 }
10282 static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
10283 Register &RegMBB, Register &RegOther) {
10284 assert(Phi.getNumOperands() == 5);
10285 if (Phi.getOperand(2).getMBB() == MBB) {
10286 RegMBB = Phi.getOperand(1).getReg();
10287 RegOther = Phi.getOperand(3).getReg();
10288 } else {
10289 assert(Phi.getOperand(4).getMBB() == MBB);
10290 RegMBB = Phi.getOperand(3).getReg();
10291 RegOther = Phi.getOperand(1).getReg();
10292 }
10293 }
10295 static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
10296 if (!Reg.isVirtual())
10297 return false;
10298 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10299 return MRI.getVRegDef(Reg)->getParent() != BB;
10300 }
10302 /// If Reg is an induction variable, return true and set the output parameters.
10303 static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
10304 MachineInstr *&UpdateInst,
10305 unsigned &UpdateCounterOprNum, Register &InitReg,
10306 bool &IsUpdatePriorComp) {
10307 // Example:
10309 // Preheader:
10310 // InitReg = ...
10311 // LoopBB:
10312 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
10313 // Reg = COPY Reg0 ; COPY is ignored.
10314 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
10315 // ; Reg is the value calculated in the previous
10316 // ; iteration, so IsUpdatePriorComp == false.
10318 if (LoopBB->pred_size() != 2)
10319 return false;
10320 if (!Reg.isVirtual())
10321 return false;
10322 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
10323 UpdateInst = nullptr;
10324 UpdateCounterOprNum = 0;
10325 InitReg = 0;
10326 IsUpdatePriorComp = true;
10327 Register CurReg = Reg;
10328 while (true) {
10329 MachineInstr *Def = MRI.getVRegDef(CurReg);
10330 if (Def->getParent() != LoopBB)
10331 return false;
10332 if (Def->isCopy()) {
10333 // Ignore copy instructions unless they contain subregisters
10334 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
10335 return false;
10336 CurReg = Def->getOperand(1).getReg();
10337 } else if (Def->isPHI()) {
10338 if (InitReg != 0)
10339 return false;
10340 if (!UpdateInst)
10341 IsUpdatePriorComp = false;
10342 extractPhiReg(*Def, LoopBB, CurReg, InitReg);
10343 } else {
10344 if (UpdateInst)
10345 return false;
10346 switch (Def->getOpcode()) {
10347 case AArch64::ADDSXri:
10348 case AArch64::ADDSWri:
10349 case AArch64::SUBSXri:
10350 case AArch64::SUBSWri:
10351 case AArch64::ADDXri:
10352 case AArch64::ADDWri:
10353 case AArch64::SUBXri:
10354 case AArch64::SUBWri:
10355 UpdateInst = Def;
10356 UpdateCounterOprNum = 1;
10357 break;
10358 case AArch64::ADDSXrr:
10359 case AArch64::ADDSWrr:
10360 case AArch64::SUBSXrr:
10361 case AArch64::SUBSWrr:
10362 case AArch64::ADDXrr:
10363 case AArch64::ADDWrr:
10364 case AArch64::SUBXrr:
10365 case AArch64::SUBWrr:
10366 UpdateInst = Def;
10367 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
10368 UpdateCounterOprNum = 1;
10369 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
10370 UpdateCounterOprNum = 2;
10371 else
10372 return false;
10373 break;
10374 default:
10375 return false;
10376 }
10377 CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
10378 }
10380 if (!CurReg.isVirtual())
10381 return false;
10382 if (Reg == CurReg)
10383 break;
10384 }
10386 if (!UpdateInst)
10387 return false;
10389 return true;
10390 }
10392 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
10393 AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
10394 // Accept loops that meet the following conditions
10395 // * The conditional branch is BCC
10396 // * The compare instruction is ADDS/SUBS/WHILEXX
10397 // * One operand of the compare is an induction variable and the other is a
10398 // loop invariant value
10399 // * The induction variable is incremented/decremented by a single instruction
10400 // * Does not contain CALL or instructions which have unmodeled side effects
10402 for (MachineInstr &MI : *LoopBB)
10403 if (MI.isCall() || MI.hasUnmodeledSideEffects())
10404 // This instruction may use NZCV, which interferes with the instruction to
10405 // be inserted for loop control.
10406 return nullptr;
10408 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
10409 SmallVector<MachineOperand, 4> Cond;
10410 if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
10411 return nullptr;
10413 // Infinite loops are not supported
10414 if (TBB == LoopBB && FBB == LoopBB)
10415 return nullptr;
10417 // Must be a conditional branch.
10418 if (TBB != LoopBB && FBB == nullptr)
10419 return nullptr;
10421 assert((TBB == LoopBB || FBB == LoopBB) &&
10422 "The Loop must be a single-basic-block loop");
10424 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
10425 const TargetRegisterInfo &TRI = getRegisterInfo();
10427 if (CondBranch->getOpcode() != AArch64::Bcc)
10428 return nullptr;
10430 // Normalization for createTripCountGreaterCondition()
10431 if (TBB == LoopBB)
10432 reverseBranchCondition(Cond);
10434 MachineInstr *Comp = nullptr;
10435 unsigned CompCounterOprNum = 0;
10436 for (MachineInstr &MI : reverse(*LoopBB)) {
10437 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
10438 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
10439 // operands is a loop invariant value
10441 switch (MI.getOpcode()) {
10442 case AArch64::SUBSXri:
10443 case AArch64::SUBSWri:
10444 case AArch64::ADDSXri:
10445 case AArch64::ADDSWri:
10446 Comp = &MI;
10447 CompCounterOprNum = 1;
10448 break;
10449 case AArch64::ADDSWrr:
10450 case AArch64::ADDSXrr:
10451 case AArch64::SUBSWrr:
10452 case AArch64::SUBSXrr:
10453 Comp = &MI;
10454 break;
10455 default:
10456 if (isWhileOpcode(MI.getOpcode())) {
10457 Comp = &MI;
10458 break;
10459 }
10460 return nullptr;
10461 }
10463 if (CompCounterOprNum == 0) {
10464 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
10465 CompCounterOprNum = 2;
10466 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
10467 CompCounterOprNum = 1;
10468 else
10469 return nullptr;
10470 }
10471 break;
10472 }
10473 }
10474 if (!Comp)
10475 return nullptr;
10477 MachineInstr *Update = nullptr;
10478 Register Init;
10479 bool IsUpdatePriorComp;
10480 unsigned UpdateCounterOprNum;
10481 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
10482 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
10483 return nullptr;
10485 return std::make_unique<AArch64PipelinerLoopInfo>(
10486 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
10487 Init, IsUpdatePriorComp, Cond);
10488 }
10490 /// verifyInstruction - Perform target specific instruction verification.
10491 bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
10492 StringRef &ErrInfo) const {
10494 // Verify that immediate offsets on load/store instructions are within range.
10495 // Stack objects with an FI operand are excluded as they can be fixed up
10496 // during PEI.
10497 TypeSize Scale(0U, false), Width(0U, false);
10498 int64_t MinOffset, MaxOffset;
10499 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
10500 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
10501 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
10502 int64_t Imm = MI.getOperand(ImmIdx).getImm();
10503 if (Imm < MinOffset || Imm > MaxOffset) {
10504 ErrInfo = "Unexpected immediate on load/store instruction";
10505 return false;
10506 }
10507 }
10508 }
10509 return true;
10510 }
10512 #define GET_INSTRINFO_HELPERS
10513 #define GET_INSTRMAP_INFO
10514 #include "AArch64GenInstrInfo.inc"