llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp

//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs the following peephole optimizations on the MIR level.
//
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
//
//    The mov pseudo instruction could be expanded to multiple mov instructions
//    later. In this case, we could try to split the constant operand of the
//    mov instruction into two immediates which can be directly encoded into
//    *Wri/*Xri instructions. This yields two AND/ADD/SUB instructions instead
//    of multiple `mov` + `and/add/sub` instructions.
//
// 4. Remove a redundant ORRWrs which is generated by zero-extend.
//
//    %3:gpr32 = ORRWrs $wzr, %2, 0
//    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//
//    If the source operand of the ORRWrs is defined by a 32-bit form of an
//    AArch64 instruction, we can remove the ORRWrs because that instruction
//    already sets the upper 32 bits of the source operand to zero.
//
// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
//    ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
//
// 6. %intermediate:gpr32 = COPY %src:fpr128
//    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
//    ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
//
//    In cases where a source FPR is copied to a GPR in order to be copied
//    to a destination FPR, we can copy the values between the FPRs directly,
//    eliminating the use of the integer unit. When we match a pattern of
//    INSvi[X]gpr that is preceded by a chain of COPY instructions from an FPR
//    source, we use INSvi[X]lane to replace the COPY & INSvi[X]gpr
//    instructions.
//
// 7. If an instruction implicitly sets the high 64 bits to zero, remove the
//    `mov 0` of the high 64 bits. For example,
//
//    %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//    %2:fpr64 = MOVID 0
//    %4:fpr128 = IMPLICIT_DEF
//    %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
//    %6:fpr128 = IMPLICIT_DEF
//    %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//    %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
//    ==>
//    %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//    %6:fpr128 = IMPLICIT_DEF
//    %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-mi-peephole-opt"

namespace {

struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  using OpcodePair = std::pair<unsigned, unsigned>;
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use splitTwoPartImm to handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///     %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)

template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
  T UImm = static_cast<T>(Imm);
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
    return false;

  // If this immediate can be handled by one instruction, do not split it.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // A bitmask immediate consists of consecutive ones. Let's say there is a
  // constant 0b00000000001000000000010000000000 which does not consist of
  // consecutive ones. We can split it into two bitmask immediates like
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
  // ANDing these two bitmask immediates gives back the original constant.
  unsigned LowestBitSet = llvm::countr_zero(UImm);
  unsigned HighestBitSet = Log2_64(UImm);

  // Create a mask which is filled with one from the position of lowest bit set
  // to the position of highest bit set.
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
              (static_cast<T>(1) << LowestBitSet);
  // Create a mask which is filled with one outside the position of lowest bit
  // set and the position of highest bit set.
  T NewImm2 = UImm | ~NewImm1;

  // If the split value is not a valid bitmask immediate, do not split this
  // constant.
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
    return false;

  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
  return true;
}
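
// Worked instance of the example from the comment above (values shown in hex
// purely for illustration):
//   UImm         = 0x00200400  (bits 10 and 21 set; not a logical immediate)
//   LowestBitSet = 10, HighestBitSet = 21
//   NewImm1      = (2 << 21) - (1 << 10) = 0x003FFC00  (ones in bits 10..21)
//   NewImm2      = UImm | ~NewImm1       = 0xFFE007FF  (for T = uint32_t)
//   NewImm1 & NewImm2 = 0x00200400 = UImm, and both masks are encodable
//   logical immediates, so one hard constant becomes two ANDWri immediates.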

template <typename T>
bool AArch64MIPeepholeOpt::visitAND(
    unsigned Opc, MachineInstr &MI) {
  // Try below transformation.
  //
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two bitmask immediates. It makes only two AND instructions instead of
  // multiple mov + and instructions.

  return splitTwoPartImm<T>(
      MI,
      [Opc](T Imm, unsigned RegSize, T &Imm0,
            T &Imm1) -> std::optional<OpcodePair> {
        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(Opc, Opc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1);
      });
}
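
// For instance, using the masks from the splitBitmaskImm example above, a
// "MOVi32imm 0x00200400 + ANDWrr" pair would be rewritten as (illustrative MIR):
//   %tmp:gpr32 = ANDWri %src, <encoding of 0x003FFC00>
//   %dst:gpr32 = ANDWri %tmp, <encoding of 0xFFE007FF>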

bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Check whether this ORR comes from the zero-extend pattern below.
  //
  // def : Pat<(i64 (zext GPR32:$src)),
  //           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If a 32-bit form of an AArch64 instruction defines the source operand of
  // the zero-extend, we do not need the zero-extend. Check that the defining
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become an FMOVSWr, so do so now so that we know
    // that the upper bits are zero.
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
    Register CpySrc = SrcMI->getOperand(1).getReg();
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    }
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  } else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;

  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
  // Check whether this INSERT_SUBREG comes from the zero-extend pattern below.
  //
  // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
  // To   %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
  //
  // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
  // COPY would destroy the upper part of the register anyway.
  if (!MI.isRegTiedToDefOperand(1))
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If a 32-bit form of an AArch64 instruction defines the source operand of
  // the zero-extend, we do not need the zero-extend. Check that the defining
  // opcode is a real AArch64 instruction; if it is not, conservatively do not
  // process it.
  if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
      !AArch64::GPR64allRegClass.hasSubClassEq(RC))
    return false;

  // Build a SUBREG_TO_REG instruction.
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(MI.getOperand(2))
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *SubregMI << "\n");
  (void)SubregMI;
  MI.eraseFromParent();

  return true;
}

template <typename T>
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned integers.
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;

  // The immediate cannot be composed via a single instruction.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // Split Imm into (Imm0 << 12) + Imm1.
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}
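
// Worked example (a hypothetical constant, for illustration only): Imm =
// 0xABC123 passes the checks above (both 12-bit halves are non-zero and the
// value fits in 24 bits) and needs MOVZ + MOVK to materialize, so it splits as
//   Imm0 = 0xABC  (emitted with LSL #12)
//   Imm1 = 0x123  (emitted with LSL #0)
// which the callers below turn into a pair of ADD/SUB immediate instructions.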

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSUB(
    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
  // Try below transformation.
  //
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
  //
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two legal add/sub immediates. It makes only two ADD/SUB instructions
  // instead of multiple `mov` + `add/sub` instructions.

  // We can sometimes have ADDWrr WZR, MOVi32imm that has not been constant
  // folded. Make sure that we don't generate invalid instructions that use XZR
  // in those cases.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
                       T &Imm1) -> std::optional<OpcodePair> {
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(PosOpc, PosOpc);
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          return std::make_pair(NegOpc, NegOpc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSSUBS(
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
  // Try the same transformation as ADDSUB, but with the additional requirement
  // that the condition code is used only for Equal and Not Equal.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
                   T &Imm1) -> std::optional<OpcodePair> {
        OpcodePair OP;
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          OP = PosOpcs;
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          OP = NegOpcs;
        else
          return std::nullopt;
        // Check the condition code uses last, since scanning the following
        // instructions for them is expensive.
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
          return std::nullopt;
        return OP;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

// Checks whether the corresponding MOV immediate instruction is applicable for
// this peephole optimization.
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
                                            MachineInstr *&MovMI,
                                            MachineInstr *&SubregToRegMI) {
  // Check whether the current MBB is in a loop and MI is loop invariant.
  MachineBasicBlock *MBB = MI.getParent();
  MachineLoop *L = MLI->getLoopFor(MBB);
  if (L && !L->isLoopInvariant(MI))
    return false;

  // Check whether the current MI's operand is a MOV with an immediate.
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!MovMI)
    return false;

  // If it is SUBREG_TO_REG, check its operand.
  SubregToRegMI = nullptr;
  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
    SubregToRegMI = MovMI;
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
    if (!MovMI)
      return false;
  }

  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
      MovMI->getOpcode() != AArch64::MOVi64imm)
    return false;

  // If the MOV has multiple uses, do not split the immediate because it causes
  // more instructions.
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
    return false;
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
    return false;

  // It is OK to perform this peephole optimization.
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    MachineInstr &MI,
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against the current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate into Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32-bit form of an instruction, the upper 32 bits of the destination
  // register are set to zero. If there is a SUBREG_TO_REG, set the upper 32
  // bits of Imm to zero. This is essential if the immediate value was a
  // negative number, since it was sign-extended when assigned to the 64-bit
  // Imm.
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. The opcodes might differ
  // for flag-setting operations that should only set flags on the second
  // instruction.
  // NewTmpReg = Opcode.first SrcReg Imm0
  // NewDstReg = Opcode.second NewTmpReg Imm1
  //
  // Determine register classes for destinations and register operands.
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get the old destination register and create the new destination registers.
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // In the situation that DstReg is not virtual (likely WZR or XZR), we want to
  // reuse that same destination register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain the registers based on their new uses.
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the instructions.
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. Keep it for SSA form until
  // deleting MI; do this only if we made a new destination register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Remove the MIs that are no longer needed.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
  // Check if this INSvi[X]gpr comes from COPY of a source FPR128
  //
  // From
  //  %intermediate1:gpr64 = COPY %src:fpr128
  //  %intermediate2:gpr32 = COPY %intermediate1:gpr64
  //  %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
  // To
  //  %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
  //  src_index
  // where src_index = 0, X = [8|16|32|64]

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());

  // For a chain of COPY instructions, find the initial source register
  // and check if it's an FPR128.
  while (true) {
    if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
      return false;

    if (!SrcMI->getOperand(1).getReg().isVirtual())
      return false;

    if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
        &AArch64::FPR128RegClass) {
      break;
    }
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
  }

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = SrcMI->getOperand(1).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n");
  (void)INSvilaneMI;
  MI.eraseFromParent();
  return true;
}

// All instructions that set an FPR64 will implicitly zero the top bits of the
// register.
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
                                        MachineRegisterInfo *MRI) {
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
  if (RC != &AArch64::FPR64RegClass)
    return false;
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}
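
// For example, the FCVTNv4i16 from the pass-level comment at the top of this
// file satisfies this check: it is a real AArch64 instruction (opcode above
// GENERIC_OP_END) that defines an fpr64, and the architecture zeroes bits
// [127:64] of the underlying vector register when it writes the 64-bit result.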

bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
  // Check that the MI defining the low 64 bits implicitly zeroes the high
  // 64 bits. We are expecting the case below.
  //
  // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  // %6:fpr128 = IMPLICIT_DEF
  // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Check that there is a `mov 0` MI for the high 64 bits.
  // We are expecting the cases below.
  //
  // %2:fpr64 = MOVID 0
  // %4:fpr128 = IMPLICIT_DEF
  // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  // or
  // %5:fpr128 = MOVIv2d_ns 0
  // %6:fpr64 = COPY %5.dsub:fpr128
  // %8:fpr128 = IMPLICIT_DEF
  // %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
  // %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  if (High64MI->getOperand(1).getImm() != 0)
    return false;

  // Let's remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
  // An FMOVDr sets the high 64 bits to zero implicitly, similar to ORR for GPR.
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Let's remove the redundant FMOVDr and forward its source register.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
  MRI->clearKillFlags(OldDef);
  MRI->clearKillFlags(NewDef);
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MLI = &getAnalysis<MachineLoopInfo>();
  MRI = &MF.getRegInfo();

  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      default:
        break;
      case AArch64::INSERT_SUBREG:
        Changed |= visitINSERT(MI);
        break;
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
      case AArch64::INSvi64lane:
        Changed |= visitINSvi64lane(MI);
        break;
      case AArch64::FMOVDr:
        Changed |= visitFMOVDr(MI);
        break;
      }
    }
  }

  return Changed;
}

FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}