llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp

//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs the following peephole optimizations on the MIR level.
//
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
//
//    The mov pseudo instruction could be expanded to multiple mov instructions
//    later. In this case, we could try to split the constant operand of the
//    mov instruction into two immediates which can be directly encoded into
//    *Wri/*Xri instructions. This yields two AND/ADD/SUB instructions instead
//    of multiple `mov` + `and/add/sub` instructions.
//
// 4. Remove a redundant ORRWrs which is generated by zero-extend.
//
//    %3:gpr32 = ORRWrs $wzr, %2, 0
//    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//
//    If the source operand of the ORRWrs is defined by a 32-bit form of an
//    AArch64 instruction, we can remove the ORRWrs because that instruction
//    already sets the upper 32 bits of the source operand to zero.
//
// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
//    ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
//
// 6. %intermediate:gpr32 = COPY %src:fpr128
//    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
//    ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
//
//    In cases where a source FPR is copied to a GPR in order to be copied
//    to a destination FPR, we can copy the values between the FPRs directly,
//    eliminating the use of the integer unit. When we match a pattern of
//    INSvi[X]gpr that is preceded by a chain of COPY instructions from an FPR
//    source, we use INSvi[X]lane to replace the COPY & INSvi[X]gpr
//    instructions.
//
// 7. If an instruction implicitly sets the high 64 bits to zero, remove the
//    `mov 0` of the high 64 bits. For example,
//
//    %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//    %2:fpr64 = MOVID 0
//    %4:fpr128 = IMPLICIT_DEF
//    %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
//    %6:fpr128 = IMPLICIT_DEF
//    %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//    %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
//    ==>
//    %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//    %6:fpr128 = IMPLICIT_DEF
//    %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-mi-peephole-opt"

namespace {

struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  using OpcodePair = std::pair<unsigned, unsigned>;
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use splitTwoPartImm to handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///     %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)

template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
  T UImm = static_cast<T>(Imm);
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
    return false;

  // If this immediate can be handled by one instruction, do not split it.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // A bitmask immediate consists of consecutive ones. Let's say there is a
  // constant 0b00000000001000000000010000000000 which does not consist of
  // consecutive ones. We can split it into two bitmask immediates like
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
  // ANDing these two bitmask immediates gives back the original constant.
  unsigned LowestBitSet = llvm::countr_zero(UImm);
  unsigned HighestBitSet = Log2_64(UImm);

  // Create a mask which is filled with one from the position of lowest bit set
  // to the position of highest bit set.
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
              (static_cast<T>(1) << LowestBitSet);
  // Create a mask which is filled with one outside the position of lowest bit
  // set and the position of highest bit set.
  T NewImm2 = UImm | ~NewImm1;

  // If the split value is not a valid bitmask immediate, do not split this
  // constant.
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
    return false;

  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
  return true;
}
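
// Worked instance of the example from the comment above (values shown in hex
// purely for illustration):
//   UImm         = 0x00200400  (bits 10 and 21 set; not a logical immediate)
//   LowestBitSet = 10, HighestBitSet = 21
//   NewImm1      = (2 << 21) - (1 << 10) = 0x003FFC00  (ones in bits 10..21)
//   NewImm2      = UImm | ~NewImm1       = 0xFFE007FF  (for T = uint32_t)
//   NewImm1 & NewImm2 = 0x00200400 = UImm, and both masks are encodable
//   logical immediates, so one hard constant becomes two ANDWri immediates.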

template <typename T>
bool AArch64MIPeepholeOpt::visitAND(
    unsigned Opc, MachineInstr &MI) {
  // Try below transformation.
  //
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two bitmask immediates. It makes only two AND instructions instead of
  // multiple mov + and instructions.

  return splitTwoPartImm<T>(
      MI,
      [Opc](T Imm, unsigned RegSize, T &Imm0,
            T &Imm1) -> std::optional<OpcodePair> {
        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(Opc, Opc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1);
      });
}
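
// For instance, using the masks from the splitBitmaskImm example above, a
// "MOVi32imm 0x00200400 + ANDWrr" pair would be rewritten as (illustrative MIR):
//   %tmp:gpr32 = ANDWri %src, <encoding of 0x003FFC00>
//   %dst:gpr32 = ANDWri %tmp, <encoding of 0xFFE007FF>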

bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Check whether this ORR comes from the zero-extend pattern below.
  //
  // def : Pat<(i64 (zext GPR32:$src)),
  //           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If a 32-bit form of an AArch64 instruction defines the source operand of
  // the zero-extend, we do not need the zero-extend. Check that the defining
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become an FMOVSWr, so do so now so that we know
    // that the upper bits are zero.
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
    Register CpySrc = SrcMI->getOperand(1).getReg();
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    }
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  } else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;

  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
  // Check whether this INSERT_SUBREG comes from the zero-extend pattern below.
  //
  // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
  // To   %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
  //
  // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
  // COPY would destroy the upper part of the register anyway.
  if (!MI.isRegTiedToDefOperand(1))
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If a 32-bit form of an AArch64 instruction defines the source operand of
  // the zero-extend, we do not need the zero-extend. Check that the defining
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
  if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
      !AArch64::GPR64allRegClass.hasSubClassEq(RC))
    return false;

  // Build a SUBREG_TO_REG instruction.
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(MI.getOperand(2))
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *SubregMI << "\n");
  (void)SubregMI;
  MI.eraseFromParent();

  return true;
}

template <typename T>
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned integers.
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;

  // The immediate cannot be composed via a single instruction.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // Split Imm into (Imm0 << 12) + Imm1.
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}
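
// Worked example (a hypothetical constant, for illustration only): Imm =
// 0xABC123 passes the checks above (both 12-bit halves are non-zero and the
// value fits in 24 bits) and needs MOVZ + MOVK to materialize, so it splits as
//   Imm0 = 0xABC  (emitted with LSL #12)
//   Imm1 = 0x123  (emitted with LSL #0)
// which the callers below turn into a pair of ADD/SUB immediate instructions.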

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSUB(
    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
  // Try below transformation.
  //
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
  //
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two legal add/sub immediates. It makes only two ADD/SUB instructions
  // instead of multiple `mov` + `add/sub` instructions.

  // We can sometimes have ADDWrr WZR, MOVi32imm that has not been constant
  // folded. Make sure that we don't generate invalid instructions that use XZR
  // in those cases.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
                       T &Imm1) -> std::optional<OpcodePair> {
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(PosOpc, PosOpc);
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          return std::make_pair(NegOpc, NegOpc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSSUBS(
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
  // Try the same transformation as ADDSUB, but with the additional requirement
  // that the condition code is used only for Equal and Not Equal.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
                   T &Imm1) -> std::optional<OpcodePair> {
        OpcodePair OP;
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          OP = PosOpcs;
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          OP = NegOpcs;
        else
          return std::nullopt;
        // Check the condition code uses last, since scanning the following
        // instructions for them is expensive.
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
          return std::nullopt;
        return OP;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

// Checks whether the corresponding MOV immediate instruction is applicable for
// this peephole optimization.
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
                                            MachineInstr *&MovMI,
                                            MachineInstr *&SubregToRegMI) {
  // Check whether the current MBB is in a loop and MI is loop invariant.
  MachineBasicBlock *MBB = MI.getParent();
  MachineLoop *L = MLI->getLoopFor(MBB);
  if (L && !L->isLoopInvariant(MI))
    return false;

  // Check whether the current MI's operand is a MOV with an immediate.
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!MovMI)
    return false;

  // If it is SUBREG_TO_REG, check its operand.
  SubregToRegMI = nullptr;
  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
    SubregToRegMI = MovMI;
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
    if (!MovMI)
      return false;
  }

  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
      MovMI->getOpcode() != AArch64::MOVi64imm)
    return false;

  // If the MOV has multiple uses, do not split the immediate because it causes
  // more instructions.
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
    return false;
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
    return false;

  // It is OK to perform this peephole optimization.
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    MachineInstr &MI,
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against the current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate into Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32-bit form of an instruction, the upper 32 bits of the destination
  // register are set to zero. If there is a SUBREG_TO_REG, set the upper 32
  // bits of Imm to zero. This is essential if the immediate value was a
  // negative number, since it was sign-extended when assigned to the 64-bit
  // Imm.
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. The opcodes might differ
  // for flag-setting operations that should only set flags on the second
  // instruction.
  // NewTmpReg = Opcode.first SrcReg Imm0
  // NewDstReg = Opcode.second NewTmpReg Imm1
  //
  // Determine register classes for destinations and register operands.
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get the old destination register and create the new destination registers.
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // In the situation that DstReg is not virtual (likely WZR or XZR), we want to
  // reuse that same destination register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain the registers based on their new uses.
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the instructions.
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. Keep it for SSA form until
  // deleting MI; do this only if we made a new destination register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Remove the MIs that are no longer needed.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
  // Check if this INSvi[X]gpr comes from COPY of a source FPR128
  //
  // From
  //  %intermediate1:gpr64 = COPY %src:fpr128
  //  %intermediate2:gpr32 = COPY %intermediate1:gpr64
  //  %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
  // To
  //  %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
  //  src_index
  // where src_index = 0, X = [8|16|32|64]

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());

  // For a chain of COPY instructions, find the initial source register
  // and check if it's an FPR128.
  while (true) {
    if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
      return false;

    if (!SrcMI->getOperand(1).getReg().isVirtual())
      return false;

    if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
        &AArch64::FPR128RegClass) {
      break;
    }
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
  }

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = SrcMI->getOperand(1).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n");
  (void)INSvilaneMI;
  MI.eraseFromParent();
  return true;
}

// All instructions that set an FPR64 will implicitly zero the top bits of the
// register.
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
                                        MachineRegisterInfo *MRI) {
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
  if (RC != &AArch64::FPR64RegClass)
    return false;
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}
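
// For example, the FCVTNv4i16 from the pass-level comment at the top of this
// file satisfies this check: it is a real AArch64 instruction (opcode above
// GENERIC_OP_END) that defines an fpr64, and the architecture zeroes bits
// [127:64] of the underlying vector register when it writes the 64-bit result.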

bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
  // Check that the MI defining the low 64 bits implicitly zeroes the high
  // 64 bits. We are expecting the case below.
  //
  // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  // %6:fpr128 = IMPLICIT_DEF
  // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Check that there is a `mov 0` MI for the high 64 bits.
  // We are expecting the cases below.
  //
  // %2:fpr64 = MOVID 0
  // %4:fpr128 = IMPLICIT_DEF
  // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  // or
  // %5:fpr128 = MOVIv2d_ns 0
  // %6:fpr64 = COPY %5.dsub:fpr128
  // %8:fpr128 = IMPLICIT_DEF
  // %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
  // %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  if (High64MI->getOperand(1).getImm() != 0)
    return false;

  // Let's remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
  // An FMOVDr sets the high 64 bits to zero implicitly, similar to ORR for GPR.
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Let's remove the redundant FMOVDr and forward its source register.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
  MRI->clearKillFlags(OldDef);
  MRI->clearKillFlags(NewDef);
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MLI = &getAnalysis<MachineLoopInfo>();
  MRI = &MF.getRegInfo();

  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      default:
        break;
      case AArch64::INSERT_SUBREG:
        Changed |= visitINSERT(MI);
        break;
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
      case AArch64::INSvi64lane:
        Changed |= visitINSvi64lane(MI);
        break;
      case AArch64::FMOVDr:
        Changed |= visitFMOVDr(MI);
        break;
      }
    }
  }

  return Changed;
}

FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}