//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs the following peephole optimizations on the MIR level.
//
//   1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
//      MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
//   2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//      MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
//   3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//      MOVi64imm + SUBXrr ==> SUBXri + SUBXri
//
//      The mov pseudo instruction may later be expanded into multiple mov
//      instructions. In that case, we can try to split the constant operand of
//      the mov instruction into two immediates which can be directly encoded
//      into *Wri/*Xri instructions. This yields two AND/ADD/SUB instructions
//      instead of multiple `mov` + `and/add/sub` instructions.
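//
//      For example (an illustrative case, not taken from a specific test):
//      the constant 0x123456 needs two instructions to materialize
//      (mov + movk), so `add w0, w1, #0x123456` would normally become
//      mov + movk + add. Because 0x123456 == (0x123 << 12) + 0x456, the
//      addition can instead be emitted as two immediate adds:
//        add w0, w1, #0x123, lsl #12
//        add w0, w0, #0x456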
//
//   4. Remove redundant ORRWrs which is generated by zero-extend.
//
//      %3:gpr32 = ORRWrs $wzr, %2, 0
//      %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//
//      If AArch64's 32-bit form of instruction defines the source operand of
//      ORRWrs, we can remove the ORRWrs because the upper 32 bits of the
//      source operand are set to zero.
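//
//      Illustrative MIR (hypothetical register numbers): if %2 is defined by a
//      32-bit arithmetic instruction such as
//        %2:gpr32 = ADDWrr %0:gpr32, %1:gpr32
//      then its upper 32 bits are already zero, so the ORRWrs above is
//      redundant and the SUBREG_TO_REG can use %2 directly.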
//
//   5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
//        ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
//
//   6. %intermediate:gpr32 = COPY %src:fpr128
//      %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
//        ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
//
//      In cases where a source FPR is copied to a GPR in order to be copied
//      to a destination FPR, we can directly copy the values between the FPRs,
//      eliminating the use of the Integer unit. When we match a pattern of
//      INSvi[X]gpr that is preceded by a chain of COPY instructions from an
//      FPR source, we use INSvi[X]lane to replace the COPY & INSvi[X]gpr
//      instructions.
//
//   7. If MI sets zero for high 64-bits implicitly, remove `mov 0` for the
//      high 64-bits. For example,
//
//        %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//        %2:fpr64 = MOVID 0
//        %4:fpr128 = IMPLICIT_DEF
//        %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
//        %6:fpr128 = IMPLICIT_DEF
//        %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//        %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
//      ==>
//        %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//        %6:fpr128 = IMPLICIT_DEF
//        %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//
//===----------------------------------------------------------------------===//
#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-mi-peephole-opt"

namespace {
struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const AArch64RegisterInfo *TRI;
  MachineLoopInfo *MLI;
  MachineRegisterInfo *MRI;

  using OpcodePair = std::pair<unsigned, unsigned>;
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
                         Register, Register, Register)>;
  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use splitTwoPartImm to handle the case.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///     %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace
INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)
template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
  T UImm = static_cast<T>(Imm);
  // If the immediate is already a valid bitmask immediate, do not split it.
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
    return false;

  // If this immediate can be handled by one instruction, do not split it.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // A bitmask immediate consists of consecutive ones. Take, for example, the
  // constant 0b00000000001000000000010000000000, which does not consist of
  // consecutive ones. It can be split into the two bitmask immediates
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111;
  // ANDing with both of them in turn reproduces the original constant.
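  //
  // As a concrete (illustrative) instance of the scheme above: for
  // UImm = 0x00200400 (bits 10 and 21 set), LowestBitSet = 10 and
  // HighestBitSet = 21, so NewImm1 = 0x003FFC00 (bits 10..21 set) and
  // NewImm2 = UImm | ~NewImm1 = 0xFFE007FF. Both are valid logical
  // immediates, and 0x003FFC00 & 0xFFE007FF == 0x00200400.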
  unsigned LowestBitSet = llvm::countr_zero(UImm);
  unsigned HighestBitSet = Log2_64(UImm);

  // Create a mask which is filled with one from the position of lowest bit set
  // to the position of highest bit set.
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
              (static_cast<T>(1) << LowestBitSet);
  // Create a mask which is filled with one outside the position of lowest bit
  // set and the position of highest bit set.
  T NewImm2 = UImm | ~NewImm1;

  // If the split value is not a valid bitmask immediate, do not split this
  // constant.
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
    return false;

  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
  return true;
}
template <typename T>
bool AArch64MIPeepholeOpt::visitAND(
    unsigned Opc, MachineInstr &MI) {
  // Try below transformation.
  //
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two bitmask immediates. It makes only two AND instructions instead of
  // multiple mov + and instructions.

  return splitTwoPartImm<T>(
      MI,
      [Opc](T Imm, unsigned RegSize, T &Imm0,
            T &Imm1) -> std::optional<OpcodePair> {
        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(Opc, Opc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1);
      });
}
bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Check this ORR comes from below zero-extend pattern.
  //
  //   def : Pat<(i64 (zext GPR32:$src)),
  //             (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If AArch64's 32-bit form of instruction defines the source operand of
  // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
  // a real AArch64 instruction and, if it is not, do not process the opcode
  // conservatively.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become a FMOVSWr, so do so now so that we know
    // that the upper bits are zero.
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
    Register CpySrc = SrcMI->getOperand(1).getReg();
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    }
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  } else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;

  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}
bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
  // Check this INSERT_SUBREG comes from below zero-extend pattern.
  //
  //   From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
  //   To   %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
  //
  // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
  // COPY would destroy the upper part of the register anyway.
  if (!MI.isRegTiedToDefOperand(1))
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If AArch64's 32-bit form of instruction defines the source operand of
  // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
  // a real AArch64 instruction and, if it is not, do not process the opcode
  // conservatively.
  if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
      !AArch64::GPR64allRegClass.hasSubClassEq(RC))
    return false;

  // Build a SUBREG_TO_REG instruction
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(MI.getOperand(2))
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *SubregMI << "\n");
  (void)SubregMI;
  MI.eraseFromParent();

  return true;
}
template <typename T>
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned ints.
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;

  // The immediate cannot be composed via a single instruction.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // Split Imm into (Imm0 << 12) + Imm1.
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}
template <typename T>
bool AArch64MIPeepholeOpt::visitADDSUB(
    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
  // Try below transformation.
  //
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
  //
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
  //
  // The mov pseudo instruction could be expanded to multiple mov instructions
  // later. Let's try to split the constant operand of the mov instruction into
  // two legal add/sub immediates. It makes only two ADD/SUB instructions
  // instead of multiple `mov` + `add/sub` instructions.
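  //
  // A hypothetical example: adding the constant -0x123456 cannot be split into
  // two positive add immediates, but since -(-0x123456) = 0x123456 does split,
  // the same result can be produced with the negated opcode pair, e.g.
  //   sub w0, w1, #0x123, lsl #12
  //   sub w0, w0, #0x456
  // which is why both PosOpc and NegOpc are tried below.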

  // We can sometimes have ADDWrr WZR, MOVi32imm that has not been constant
  // folded. Make sure that we don't generate invalid instructions that use XZR
  // in those cases.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
                       T &Imm1) -> std::optional<OpcodePair> {
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(PosOpc, PosOpc);
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          return std::make_pair(NegOpc, NegOpc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}
template <typename T>
bool AArch64MIPeepholeOpt::visitADDSSUBS(
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
  // Try the same transformation as ADDSUB, but with the additional requirement
  // that the condition code usages are only for Equal and Not Equal.
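  //
  // (Rationale, as a sketch: after the split, the final flag-setting
  // instruction computes the correct overall result, so its N and Z flags
  // describe the full operation, but its C and V flags only reflect the
  // second, partial immediate. Conditions that depend on C or V therefore
  // cannot be preserved, which is what the examineCFlagsUse check below
  // enforces.)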
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
                   T &Imm1) -> std::optional<OpcodePair> {
        OpcodePair OP;
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          OP = PosOpcs;
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          OP = NegOpcs;
        else
          return std::nullopt;
        // Check conditional uses last since it is expensive to scan the
        // following instructions.
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
          return std::nullopt;
        return OP;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}
// Checks whether the corresponding MOV immediate instruction is applicable for
// this peephole optimization.
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
                                            MachineInstr *&MovMI,
                                            MachineInstr *&SubregToRegMI) {
  // Check whether the current MBB is in a loop and the AND is loop invariant.
  MachineBasicBlock *MBB = MI.getParent();
  MachineLoop *L = MLI->getLoopFor(MBB);
  if (L && !L->isLoopInvariant(MI))
    return false;

  // Check whether the current MI's operand is a MOV with immediate.
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!MovMI)
    return false;

  // If it is SUBREG_TO_REG, check its operand.
  SubregToRegMI = nullptr;
  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
    SubregToRegMI = MovMI;
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
    if (!MovMI)
      return false;
  }

  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
      MovMI->getOpcode() != AArch64::MOVi64imm)
    return false;

  // If the MOV has multiple uses, do not split the immediate because it causes
  // more instructions.
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
    return false;
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
    return false;

  // It is OK to perform this peephole optimization.
  return true;
}
template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    MachineInstr &MI,
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate into Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32-bit form of the instruction, the upper 32 bits of the
  // destination register are set to zero. If there is SUBREG_TO_REG, set the
  // upper 32 bits of Imm to zero. This is essential if the immediate value was
  // a negative number, since it was sign extended when assigned to the 64-bit
  // Imm.
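  // For instance (illustrative values): a 32-bit constant 0xFFFFFFFE read
  // through getImm() arrives as the sign-extended 64-bit value
  // 0xFFFFFFFFFFFFFFFE; masking with 0xFFFFFFFF restores the intended
  // 32-bit immediate before it is split.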
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. Opcodes might differ
  // for flag-setting operations that should only set flags on the second
  // instruction.
  // NewTmpReg = Opcode.first SrcReg Imm0
  // NewDstReg = Opcode.second NewTmpReg Imm1

  // Determine register classes for destinations and register operands
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get old register destinations and create new register destinations
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // In the situation that DstReg is not virtual (likely WZR or XZR), we want
  // to reuse that same destination register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain registers based on their new uses
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the instruction
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. Keep it for SSA form until
  // deleting MI. Only if we made a new destination register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Record the MIs that need to be removed.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}
bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
  // Check if this INSvi[X]gpr comes from a COPY of a source FPR128.
  //
  // From
  //  %intermediate1:gpr64 = COPY %src:fpr128
  //  %intermediate2:gpr32 = COPY %intermediate1:gpr64
  //  %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
  // To
  //  %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
  //  src_index
  // where src_index = 0, X = [8|16|32|64]

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());

  // For a chain of COPY instructions, find the initial source register
  // and check if it's an FPR128
  while (true) {
    if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
      return false;

    if (!SrcMI->getOperand(1).getReg().isVirtual())
      return false;

    if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
        &AArch64::FPR128RegClass) {
      break;
    }
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
  }

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = SrcMI->getOperand(1).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n");
  (void)INSvilaneMI;
  MI.eraseFromParent();
  return true;
}
// All instructions that set a FPR64 will implicitly zero the top bits of the
// register.
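//
// (Illustrative case: an instruction such as FCVTNv4i16 that writes a D
// register also clears bits [127:64] of the enclosing Q register, which is the
// architectural behaviour this helper relies on; pseudo/generic opcodes are
// excluded below because they give no such guarantee.)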
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
                                        MachineRegisterInfo *MRI) {
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
  if (RC != &AArch64::FPR64RegClass)
    return false;
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}
bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
  // Check the MI for the low 64-bits sets zero for the high 64-bits
  // implicitly. We are expecting the below case.
  //
  //  %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  //  %6:fpr128 = IMPLICIT_DEF
  //  %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Check there is a `mov 0` MI for the high 64-bits.
  // We are expecting the below cases.
  //
  //  %2:fpr64 = MOVID 0
  //  %4:fpr128 = IMPLICIT_DEF
  //  %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  // or
  //  %5:fpr128 = MOVIv2d_ns 0
  //  %6:fpr64 = COPY %5.dsub:fpr128
  //  %8:fpr128 = IMPLICIT_DEF
  //  %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
  //  %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  if (High64MI->getOperand(1).getImm() != 0)
    return false;

  // Let's remove the MIs for the high 64-bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}
bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
  // An FMOVDr sets the high 64-bits to zero implicitly, similar to ORR for GPR.
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Let's remove the MIs for the high 64-bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
  MRI->clearKillFlags(OldDef);
  MRI->clearKillFlags(NewDef);
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}
bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MLI = &getAnalysis<MachineLoopInfo>();
  MRI = &MF.getRegInfo();

  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      default:
        break;
      case AArch64::INSERT_SUBREG:
        Changed |= visitINSERT(MI);
        break;
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
      case AArch64::INSvi64lane:
        Changed |= visitINSvi64lane(MI);
        break;
      case AArch64::FMOVDr:
        Changed |= visitFMOVDr(MI);
        break;
      }
    }
  }

  return Changed;
}

FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}