//===-- MVETPAndVPTOptimisationsPass.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass does a few optimisations related to tail predicated loops
/// and MVE VPT blocks before register allocation is performed. For VPT blocks
/// the goal is to maximize the sizes of the blocks that will be created by the
/// MVE VPT Block Insertion pass (which runs after register allocation). For
/// tail predicated loops we transform the loop into something that will
/// hopefully make the backend ARMLowOverheadLoops pass's job easier.
//
//===----------------------------------------------------------------------===//
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MVETailPredUtils.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "arm-mve-vpt-opts"
39 MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden
,
40 cl::desc("Enable merging Loop End and Dec instructions."),

namespace {
class MVETPAndVPTOptimisations : public MachineFunctionPass {
public:
  static char ID;
  const Thumb2InstrInfo *TII;
  MachineRegisterInfo *MRI;

  MVETPAndVPTOptimisations() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &Fn) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineLoopInfo>();
    AU.addPreserved<MachineLoopInfo>();
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return "ARM MVE TailPred and VPT Optimisation Pass";
  }

private:
  bool LowerWhileLoopStart(MachineLoop *ML);
  bool MergeLoopEnd(MachineLoop *ML);
  bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT);
  MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
                                            MachineInstr &Instr,
                                            MachineOperand &User,
                                            Register Target);
  bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
  bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
  bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
  bool ConvertVPSEL(MachineBasicBlock &MBB);
  bool HintDoLoopStartReg(MachineBasicBlock &MBB);
  MachineInstr *CheckForLRUseInPredecessors(MachineBasicBlock *PreHeader,
                                            MachineInstr *LoopStart);
};

char MVETPAndVPTOptimisations::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE,
                      "ARM MVE TailPred and VPT Optimisations pass", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE,
                    "ARM MVE TailPred and VPT Optimisations pass", false, false)
static MachineInstr *LookThroughCOPY(MachineInstr *MI,
                                     MachineRegisterInfo *MRI) {
  while (MI && MI->getOpcode() == TargetOpcode::COPY &&
         MI->getOperand(1).getReg().isVirtual())
    MI = MRI->getVRegDef(MI->getOperand(1).getReg());
  return MI;
}

// Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and
// corresponding PHI that make up a low overhead loop. Only handles 'do' loops
// at the moment, returning a t2DoLoopStart in LoopStart.
static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI,
                               MachineInstr *&LoopStart, MachineInstr *&LoopPhi,
                               MachineInstr *&LoopDec, MachineInstr *&LoopEnd) {
  MachineBasicBlock *Header = ML->getHeader();
  MachineBasicBlock *Latch = ML->getLoopLatch();
  if (!Header || !Latch) {
    LLVM_DEBUG(dbgs() << "  no Loop Latch or Header\n");
    return false;
  }

  // Find the loop end from the terminators.
  LoopEnd = nullptr;
  for (auto &T : Latch->terminators()) {
    if (T.getOpcode() == ARM::t2LoopEnd && T.getOperand(1).getMBB() == Header) {
      LoopEnd = &T;
      break;
    }
    if (T.getOpcode() == ARM::t2LoopEndDec &&
        T.getOperand(2).getMBB() == Header) {
      LoopEnd = &T;
      break;
    }
  }
  if (!LoopEnd) {
    LLVM_DEBUG(dbgs() << "  no LoopEnd\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop end: " << *LoopEnd);

  // Find the dec from the use of the end. There may be copies between
  // instructions. We expect the loop to look like:
  //   $vs = t2DoLoopStart ...
  // loop:
  //   $vp = phi [ $vs ], [ $vd ]
  //   ...
  //   $vd = t2LoopDec $vp
  //   ...
  //   t2LoopEnd $vd, loop
  if (LoopEnd->getOpcode() == ARM::t2LoopEndDec)
    LoopDec = LoopEnd;
  else {
    LoopDec =
        LookThroughCOPY(MRI->getVRegDef(LoopEnd->getOperand(0).getReg()), MRI);
    if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) {
      LLVM_DEBUG(dbgs() << "  didn't find LoopDec where we expected!\n");
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << "  found loop dec: " << *LoopDec);

  LoopPhi =
      LookThroughCOPY(MRI->getVRegDef(LoopDec->getOperand(1).getReg()), MRI);
  if (!LoopPhi || LoopPhi->getOpcode() != TargetOpcode::PHI ||
      LoopPhi->getNumOperands() != 5 ||
      (LoopPhi->getOperand(2).getMBB() != Latch &&
       LoopPhi->getOperand(4).getMBB() != Latch)) {
    LLVM_DEBUG(dbgs() << "  didn't find PHI where we expected!\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop phi: " << *LoopPhi);

  Register StartReg = LoopPhi->getOperand(2).getMBB() == Latch
                          ? LoopPhi->getOperand(3).getReg()
                          : LoopPhi->getOperand(1).getReg();
  LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI);
  if (!LoopStart || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
                     LoopStart->getOpcode() != ARM::t2WhileLoopSetup &&
                     LoopStart->getOpcode() != ARM::t2WhileLoopStartLR)) {
    LLVM_DEBUG(dbgs() << "  didn't find Start where we expected!\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop start: " << *LoopStart);

  return true;
}
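
// Revert a t2WhileLoopSetup back into a flag-setting subtract, and any
// t2WhileLoopStart in the same block into a conditional branch on the result,
// for when the low overhead loop cannot be formed.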
static void RevertWhileLoopSetup(MachineInstr *MI, const TargetInstrInfo *TII) {
  MachineBasicBlock *MBB = MI->getParent();
  assert(MI->getOpcode() == ARM::t2WhileLoopSetup &&
         "Only expected a t2WhileLoopSetup in RevertWhileLoopStart!");

  // Subs
  MachineInstrBuilder MIB =
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
  MIB.add(MI->getOperand(0));
  MIB.add(MI->getOperand(1));
  MIB.addImm(0);
  MIB.addImm(ARMCC::AL);
  MIB.addReg(ARM::NoRegister);
  MIB.addReg(ARM::CPSR, RegState::Define);

  // Attempt to find a t2WhileLoopStart and revert to a t2Bcc.
  for (MachineInstr &I : MBB->terminators()) {
    if (I.getOpcode() == ARM::t2WhileLoopStart) {
      MachineInstrBuilder MIB =
          BuildMI(*MBB, &I, I.getDebugLoc(), TII->get(ARM::t2Bcc));
      MIB.add(I.getOperand(1)); // branch target
      MIB.addImm(ARMCC::EQ);    // condition code
      MIB.addReg(ARM::CPSR);
      I.eraseFromParent();
      break;
    }
  }

  MI->eraseFromParent();
}

// The Hardware Loop insertion and ISel Lowering produce the pseudos for the
// start of a while loop:
//   %a:gprlr = t2WhileLoopSetup %Cnt
//   t2WhileLoopStart %a, %BB
// We want to convert those to a single instruction which, like t2LoopEndDec
// and t2DoLoopStartTP, is both a terminator and produces a value:
//   %a:gprlr = t2WhileLoopStartLR %Cnt, %BB
//
// Otherwise, if we can't, we revert the loop. t2WhileLoopSetup and
// t2WhileLoopStart are not valid past regalloc.
bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop *ML) {
  LLVM_DEBUG(dbgs() << "LowerWhileLoopStart on loop "
                    << ML->getHeader()->getName() << "\n");

  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;

  if (LoopStart->getOpcode() != ARM::t2WhileLoopSetup)
    return false;

  Register LR = LoopStart->getOperand(0).getReg();
  auto WLSIt = find_if(MRI->use_nodbg_instructions(LR), [](auto &MI) {
    return MI.getOpcode() == ARM::t2WhileLoopStart;
  });
  if (!MergeEndDec || WLSIt == MRI->use_instr_nodbg_end()) {
    RevertWhileLoopSetup(LoopStart, TII);
    RevertLoopDec(LoopDec, TII);
    RevertLoopEnd(LoopEnd, TII);
    return true;
  }

  MachineInstrBuilder MI =
      BuildMI(*WLSIt->getParent(), *WLSIt, WLSIt->getDebugLoc(),
              TII->get(ARM::t2WhileLoopStartLR), LR)
          .add(LoopStart->getOperand(1))
          .add(WLSIt->getOperand(1));

  LLVM_DEBUG(dbgs() << "Lowered WhileLoopStart into: " << *MI.getInstr());

  WLSIt->eraseFromParent();
  LoopStart->eraseFromParent();
  return true;
}

// Return true if this instruction is invalid in a low overhead loop, usually
// because it clobbers LR.
static bool IsInvalidTPInstruction(MachineInstr &MI) {
  return MI.isCall() || isLoopStart(MI);
}

// Starting from PreHeader, search for invalid instructions back until the
// LoopStart block is reached. If invalid instructions are found, the loop start
// is reverted from a WhileLoopStart to a DoLoopStart on the same loop. Will
// return the new DLS LoopStart if updated.
MachineInstr *MVETPAndVPTOptimisations::CheckForLRUseInPredecessors(
    MachineBasicBlock *PreHeader, MachineInstr *LoopStart) {
  SmallVector<MachineBasicBlock *> Worklist;
  SmallPtrSet<MachineBasicBlock *, 4> Visited;
  Worklist.push_back(PreHeader);
  Visited.insert(LoopStart->getParent());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();
    if (Visited.count(MBB))
      continue;

    for (MachineInstr &MI : *MBB) {
      if (!IsInvalidTPInstruction(MI))
        continue;

      LLVM_DEBUG(dbgs() << "Found LR use in predecessors, reverting: " << MI);

      // Create a t2DoLoopStart at the end of the preheader.
      MachineInstrBuilder MIB =
          BuildMI(*PreHeader, PreHeader->getFirstTerminator(),
                  LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStart));
      MIB.add(LoopStart->getOperand(0));
      MIB.add(LoopStart->getOperand(1));

      // Make sure to remove the kill flags, to prevent them from being invalid.
      LoopStart->getOperand(1).setIsKill(false);

      // Revert the t2WhileLoopStartLR to a CMP and Br.
      RevertWhileLoopStartLR(LoopStart, TII, ARM::t2Bcc, true);
      return MIB;
    }

    Visited.insert(MBB);
    for (auto *Pred : MBB->predecessors())
      Worklist.push_back(Pred);
  }
  return LoopStart;
}

// This function converts loops with t2LoopEnd and t2LoopDec instructions into
// a single t2LoopEndDec instruction. To do that it needs to make sure that LR
// will be valid to be used for the low overhead loop, which means nothing else
// is using LR (especially calls) and there are no superfluous copies in the
// loop. The t2LoopEndDec is a branching terminator that produces a value (the
// decrement) around the loop edge, which means we need to be careful that they
// will be valid to allocate without any spilling.
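//
// For example (illustrative MIR, not taken from a real test), the pair
//   %dec:gprlr = t2LoopDec %phi:gprlr, 1
//   t2LoopEnd %dec, %bb.loop
// in the latch becomes the single value-producing terminator
//   %dec:gprlr = t2LoopEndDec %phi:gprlr, %bb.loop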
bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
  if (!MergeEndDec)
    return false;

  LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName()
                    << "\n");

  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;

  // Check if there is an illegal instruction (a call) in the low overhead loop
  // and if so revert it now before we get any further. While loops also need
  // to check the preheaders, but can be reverted to a DLS loop if needed.
  auto *PreHeader = ML->getLoopPreheader();
  if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader)
    LoopStart = CheckForLRUseInPredecessors(PreHeader, LoopStart);

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (IsInvalidTPInstruction(MI)) {
        LLVM_DEBUG(dbgs() << "Found LR use in loop, reverting: " << MI);
        if (LoopStart->getOpcode() == ARM::t2DoLoopStart)
          RevertDoLoopStart(LoopStart, TII);
        else
          RevertWhileLoopStartLR(LoopStart, TII);
        RevertLoopDec(LoopDec, TII);
        RevertLoopEnd(LoopEnd, TII);
        return true;
      }
    }
  }

  // Remove any copies from the loop, to ensure the phi that remains is both
  // simpler and contains no extra uses. Because t2LoopEndDec is a terminator
  // that cannot spill, we need to be careful what remains in the loop.
  Register PhiReg = LoopPhi->getOperand(0).getReg();
  Register DecReg = LoopDec->getOperand(0).getReg();
  Register StartReg = LoopStart->getOperand(0).getReg();
  // Ensure the uses are expected, and collect any copies we want to remove.
  SmallVector<MachineInstr *, 4> Copies;
  auto CheckUsers = [&Copies](Register BaseReg,
                              ArrayRef<MachineInstr *> ExpectedUsers,
                              MachineRegisterInfo *MRI) {
    SmallVector<Register, 4> Worklist;
    Worklist.push_back(BaseReg);
    while (!Worklist.empty()) {
      Register Reg = Worklist.pop_back_val();
      for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
        if (count(ExpectedUsers, &MI))
          continue;
        if (MI.getOpcode() != TargetOpcode::COPY ||
            !MI.getOperand(0).getReg().isVirtual()) {
          LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI);
          return false;
        }
        Worklist.push_back(MI.getOperand(0).getReg());
        Copies.push_back(&MI);
      }
    }
    return true;
  };
  if (!CheckUsers(PhiReg, {LoopDec}, MRI) ||
      !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) ||
      !CheckUsers(StartReg, {LoopPhi}, MRI)) {
    // Don't leave a t2WhileLoopStartLR without the t2LoopEndDec.
    if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR) {
      RevertWhileLoopStartLR(LoopStart, TII);
      RevertLoopDec(LoopDec, TII);
      RevertLoopEnd(LoopEnd, TII);
      return true;
    }
    return false;
  }

  MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass);
  MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass);
  MRI->constrainRegClass(DecReg, &ARM::GPRlrRegClass);

  if (LoopPhi->getOperand(2).getMBB() == ML->getLoopLatch()) {
    LoopPhi->getOperand(3).setReg(StartReg);
    LoopPhi->getOperand(1).setReg(DecReg);
  } else {
    LoopPhi->getOperand(1).setReg(StartReg);
    LoopPhi->getOperand(3).setReg(DecReg);
  }

  // Replace the loop dec and loop end with a single instruction.
  MachineInstrBuilder MI =
      BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(),
              TII->get(ARM::t2LoopEndDec), DecReg)
          .addReg(PhiReg)
          .add(LoopEnd->getOperand(1));
  LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr());

  LoopDec->eraseFromParent();
  LoopEnd->eraseFromParent();
  for (auto *MI : Copies)
    MI->eraseFromParent();
  return true;
}

// Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP
// instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP
// instruction, making the backend ARMLowOverheadLoops pass's job of finding
// the VCTP operand much simpler.
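//
// For example (illustrative MIR, not taken from a real test):
//   %lr = t2DoLoopStart %tripcount
//   ...
//   %vpr = MVE_VCTP32 %elements     ; inside the loop
// becomes a loop start that also carries the element count used by the VCTPs:
//   %lr = t2DoLoopStartTP %tripcount, %elements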
bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
                                                   MachineDominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop "
                    << ML->getHeader()->getName() << "\n");

  // Find some loop components including the LoopEnd/Dec/Start, and any VCTPs
  // in the loop.
  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;
  if (LoopDec != LoopEnd || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
                             LoopStart->getOpcode() != ARM::t2WhileLoopStartLR))
    return false;

  SmallVector<MachineInstr *, 4> VCTPs;
  for (MachineBasicBlock *BB : ML->blocks())
    for (MachineInstr &MI : *BB)
      if (isVCTP(&MI))
        VCTPs.push_back(&MI);

  if (VCTPs.empty()) {
    LLVM_DEBUG(dbgs() << "  no VCTPs\n");
    return false;
  }

  // Check all VCTPs are the same.
  MachineInstr *FirstVCTP = *VCTPs.begin();
  for (MachineInstr *VCTP : VCTPs) {
    LLVM_DEBUG(dbgs() << "  with VCTP " << *VCTP);
    if (VCTP->getOpcode() != FirstVCTP->getOpcode() ||
        VCTP->getOperand(0).getReg() != FirstVCTP->getOperand(0).getReg()) {
      LLVM_DEBUG(dbgs() << "  VCTPs are not identical\n");
      return false;
    }
  }

  // Check that the register being used can be set up before the loop. We
  // expect this to be:
  //   $vx = ...
  // loop:
  //   $vp = PHI [ $vx ], [ $vd ]
  //   ...
  //   $vpr = VCTP $vp
  //   ...
  //   $vd = t2SUBri $vp, #n
  Register CountReg = FirstVCTP->getOperand(1).getReg();
  if (!CountReg.isVirtual()) {
    LLVM_DEBUG(dbgs() << "  cannot determine VCTP PHI\n");
    return false;
  }
  MachineInstr *Phi = LookThroughCOPY(MRI->getVRegDef(CountReg), MRI);
  if (!Phi || Phi->getOpcode() != TargetOpcode::PHI ||
      Phi->getNumOperands() != 5 ||
      (Phi->getOperand(2).getMBB() != ML->getLoopLatch() &&
       Phi->getOperand(4).getMBB() != ML->getLoopLatch())) {
    LLVM_DEBUG(dbgs() << "  cannot determine VCTP Count\n");
    return false;
  }
  CountReg = Phi->getOperand(2).getMBB() == ML->getLoopLatch()
                 ? Phi->getOperand(3).getReg()
                 : Phi->getOperand(1).getReg();

  // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of
  // the preheader and add the new CountReg to it. We attempt to place it late
  // in the preheader, but may need to move that earlier based on uses.
  MachineBasicBlock *MBB = LoopStart->getParent();
  MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
  for (MachineInstr &Use :
       MRI->use_instructions(LoopStart->getOperand(0).getReg()))
    if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) ||
        !DT->dominates(ML->getHeader(), Use.getParent())) {
      LLVM_DEBUG(dbgs() << "  InsertPt could not be a terminator!\n");
      return false;
    }

  unsigned NewOpc = LoopStart->getOpcode() == ARM::t2DoLoopStart
                        ? ARM::t2DoLoopStartTP
                        : ARM::t2WhileLoopStartTP;
  MachineInstrBuilder MI =
      BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), TII->get(NewOpc))
          .add(LoopStart->getOperand(0))
          .add(LoopStart->getOperand(1))
          .addReg(CountReg);
  if (NewOpc == ARM::t2WhileLoopStartTP)
    MI.add(LoopStart->getOperand(2));
  LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << "  with "
                    << *MI.getInstr());
  MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);
  LoopStart->eraseFromParent();

  return true;
}

// Returns true if Opcode is any VCMP Opcode.
static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; }

// Returns true if a VCMP with this Opcode can have its operands swapped.
// There are two kinds of VCMP that can't have their operands swapped: Float
// VCMPs, and VCMPr instructions (since the r is always on the right).
static bool CanHaveSwappedOperands(unsigned Opcode) {
  switch (Opcode) {
  default:
    return true;
  case ARM::MVE_VCMPf32:
  case ARM::MVE_VCMPf16:
  case ARM::MVE_VCMPf32r:
  case ARM::MVE_VCMPf16r:
  case ARM::MVE_VCMPi8r:
  case ARM::MVE_VCMPi16r:
  case ARM::MVE_VCMPi32r:
  case ARM::MVE_VCMPu8r:
  case ARM::MVE_VCMPu16r:
  case ARM::MVE_VCMPu32r:
  case ARM::MVE_VCMPs8r:
  case ARM::MVE_VCMPs16r:
  case ARM::MVE_VCMPs32r:
    return false;
  }
}

// Returns the CondCode of a VCMP Instruction.
static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) {
  assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP");
  return ARMCC::CondCodes(Instr.getOperand(3).getImm());
}

// Returns true if Cond is equivalent to a VPNOT instruction on the result of
// Prev. Cond and Prev must be VCMPs.
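// For example (illustrative), a VCMP with the opposite condition code and the
// same operands as Prev (such as an "lt" compare following a "ge" compare of
// the same registers) produces the inverse mask, so it can be rewritten as a
// VPNOT of Prev's result.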
static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) {
  assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode()));

  // Opcodes must match.
  if (Cond.getOpcode() != Prev.getOpcode())
    return false;

  MachineOperand &CondOP1 = Cond.getOperand(1), &CondOP2 = Cond.getOperand(2);
  MachineOperand &PrevOP1 = Prev.getOperand(1), &PrevOP2 = Prev.getOperand(2);

  // If the VCMP has the opposite condition with the same operands, we can
  // replace it with a VPNOT.
  ARMCC::CondCodes ExpectedCode = GetCondCode(Cond);
  ExpectedCode = ARMCC::getOppositeCondition(ExpectedCode);
  if (ExpectedCode == GetCondCode(Prev))
    if (CondOP1.isIdenticalTo(PrevOP1) && CondOP2.isIdenticalTo(PrevOP2))
      return true;

  // Check again with operands swapped if possible.
  if (!CanHaveSwappedOperands(Cond.getOpcode()))
    return false;
  ExpectedCode = ARMCC::getSwappedCondition(ExpectedCode);
  return ExpectedCode == GetCondCode(Prev) && CondOP1.isIdenticalTo(PrevOP2) &&
         CondOP2.isIdenticalTo(PrevOP1);
}

// Returns true if Instr writes to VCCR.
static bool IsWritingToVCCR(MachineInstr &Instr) {
  if (Instr.getNumOperands() == 0)
    return false;
  MachineOperand &Dst = Instr.getOperand(0);
  if (!Dst.isReg())
    return false;
  Register DstReg = Dst.getReg();
  if (!DstReg.isVirtual())
    return false;
  MachineRegisterInfo &RegInfo = Instr.getMF()->getRegInfo();
  const TargetRegisterClass *RegClass = RegInfo.getRegClassOrNull(DstReg);
  return RegClass && (RegClass->getID() == ARM::VCCRRegClassID);
}
592 // <Instr that uses %A ('User' Operand)>
594 // %K = VPNOT %Target
595 // <Instr that uses %K ('User' Operand)>
596 // And returns the newly inserted VPNOT.
597 // This optimization is done in the hopes of preventing spills/reloads of VPR by
598 // reducing the number of VCCR values with overlapping lifetimes.
599 MachineInstr
&MVETPAndVPTOptimisations::ReplaceRegisterUseWithVPNOT(
600 MachineBasicBlock
&MBB
, MachineInstr
&Instr
, MachineOperand
&User
,
602 Register NewResult
= MRI
->createVirtualRegister(MRI
->getRegClass(Target
));
604 MachineInstrBuilder MIBuilder
=
605 BuildMI(MBB
, &Instr
, Instr
.getDebugLoc(), TII
->get(ARM::MVE_VPNOT
))
608 addUnpredicatedMveVpredNOp(MIBuilder
);
610 // Make the user use NewResult instead, and clear its kill flag.
611 User
.setReg(NewResult
);
612 User
.setIsKill(false);
614 LLVM_DEBUG(dbgs() << " Inserting VPNOT (for spill prevention): ";
615 MIBuilder
.getInstr()->dump());
617 return *MIBuilder
.getInstr();

// Moves a VPNOT before its first user if an instruction that uses Reg is found
// in-between the VPNOT and its user.
// Returns true if there is at least one user of the VPNOT in the block.
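//
// For example (illustrative), given
//   %b = VPNOT %a
//   <instr that uses %a>
//   <instr that uses %b>
// the VPNOT is spliced down to sit just before its first user, so that %a and
// %b overlap for as short a range as possible.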
static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Iter,
                                     Register Reg) {
  assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!");
  assert(getVPTInstrPredicate(*Iter) == ARMVCC::None &&
         "The VPNOT cannot be predicated");

  MachineInstr &VPNOT = *Iter;
  Register VPNOTResult = VPNOT.getOperand(0).getReg();
  Register VPNOTOperand = VPNOT.getOperand(1).getReg();

  // Whether the VPNOT will need to be moved, and whether we found a user of
  // the VPNOT.
  bool MustMove = false, HasUser = false;
  MachineOperand *VPNOTOperandKiller = nullptr;
  for (; Iter != MBB.end(); ++Iter) {
    if (MachineOperand *MO =
            Iter->findRegisterUseOperand(VPNOTOperand, /*isKill*/ true)) {
      // If we find the operand that kills the VPNOT's operand, save it.
      VPNOTOperandKiller = MO;
    }

    if (Iter->findRegisterUseOperandIdx(Reg) != -1) {
      MustMove = true;
      continue;
    }

    if (Iter->findRegisterUseOperandIdx(VPNOTResult) == -1)
      continue;

    HasUser = true;
    if (!MustMove)
      break;

    // Move the VPNOT right before Iter.
    LLVM_DEBUG(dbgs() << "Moving: "; VPNOT.dump(); dbgs() << "  Before: ";
               Iter->dump());
    MBB.splice(Iter, &MBB, VPNOT.getIterator());
    // If we move the instr, and its operand was killed earlier, remove the
    // kill flag.
    if (VPNOTOperandKiller)
      VPNOTOperandKiller->setIsKill(false);

    break;
  }

  return HasUser;
}

// This optimisation attempts to reduce the number of overlapping lifetimes of
// VCCR values by replacing uses of old VCCR values with VPNOTs. For example,
// this replaces:
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %Bar = (some op that uses %A)
// with:
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %TMP2:vccr = VPNOT %B
//    %Bar = (some op that uses %TMP2)
bool MVETPAndVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end();
  SmallVector<MachineInstr *, 4> DeadInstructions;
  bool Modified = false;

  while (Iter != End) {
    Register VCCRValue, OppositeVCCRValue;
    // The first loop looks for 2 unpredicated instructions:
    //    %A:vccr = (instr)       ; A is stored in VCCRValue
    //    %B:vccr = VPNOT %A      ; B is stored in OppositeVCCRValue
    for (; Iter != End; ++Iter) {
      // We're only interested in unpredicated instructions that write to VCCR.
      if (!IsWritingToVCCR(*Iter) ||
          getVPTInstrPredicate(*Iter) != ARMVCC::None)
        continue;
      Register Dst = Iter->getOperand(0).getReg();

      // If we already have a VCCRValue, and this is a VPNOT on VCCRValue,
      // we've found what we were looking for.
      if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT &&
          Iter->findRegisterUseOperandIdx(VCCRValue) != -1) {
        // Move the VPNOT closer to its first user if needed, and ignore it if
        // it has no users.
        if (!MoveVPNOTBeforeFirstUser(MBB, Iter, VCCRValue))
          continue;

        OppositeVCCRValue = Dst;
        ++Iter;
        break;
      }

      // Else, just set VCCRValue.
      VCCRValue = Dst;
    }

    // If the first inner loop didn't find anything, stop here.
    if (Iter == End)
      break;

    assert(VCCRValue && OppositeVCCRValue &&
           "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop "
           "stopped before the end of the block!");
    assert(VCCRValue != OppositeVCCRValue &&
           "VCCRValue should not be equal to OppositeVCCRValue!");

    // LastVPNOTResult always contains the same value as OppositeVCCRValue.
    Register LastVPNOTResult = OppositeVCCRValue;

    // This second loop tries to optimize the remaining instructions.
    for (; Iter != End; ++Iter) {
      bool IsInteresting = false;

      if (MachineOperand *MO = Iter->findRegisterUseOperand(VCCRValue)) {
        IsInteresting = true;

        // - If the instruction is a VPNOT, it can be removed, and we can just
        //   replace its uses with LastVPNOTResult.
        // - Else, insert a new VPNOT on LastVPNOTResult to recompute VCCRValue.
        if (Iter->getOpcode() == ARM::MVE_VPNOT) {
          Register Result = Iter->getOperand(0).getReg();

          MRI->replaceRegWith(Result, LastVPNOTResult);
          DeadInstructions.push_back(&*Iter);
          Modified = true;

          LLVM_DEBUG(dbgs()
                     << "Replacing all uses of '" << printReg(Result)
                     << "' with '" << printReg(LastVPNOTResult) << "'\n");
        } else {
          MachineInstr &VPNOT =
              ReplaceRegisterUseWithVPNOT(MBB, *Iter, *MO, LastVPNOTResult);
          Modified = true;

          LastVPNOTResult = VPNOT.getOperand(0).getReg();
          std::swap(VCCRValue, OppositeVCCRValue);

          LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue)
                            << "' with '" << printReg(LastVPNOTResult)
                            << "' in instr: " << *Iter);
        }
      } else {
        // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult
        // instead as they contain the same value.
        if (MachineOperand *MO =
                Iter->findRegisterUseOperand(OppositeVCCRValue)) {
          IsInteresting = true;

          // This is pointless if LastVPNOTResult == OppositeVCCRValue.
          if (LastVPNOTResult != OppositeVCCRValue) {
            LLVM_DEBUG(dbgs() << "Replacing usage of '"
                              << printReg(OppositeVCCRValue) << "' with '"
                              << printReg(LastVPNOTResult) << "' for instr: ";
                       Iter->dump());
            MO->setReg(LastVPNOTResult);
            Modified = true;
          }

          MO->setIsKill(false);
        }

        // If this is an unpredicated VPNOT on
        // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it.
        if (Iter->getOpcode() == ARM::MVE_VPNOT &&
            getVPTInstrPredicate(*Iter) == ARMVCC::None) {
          Register VPNOTOperand = Iter->getOperand(1).getReg();
          if (VPNOTOperand == LastVPNOTResult ||
              VPNOTOperand == OppositeVCCRValue) {
            IsInteresting = true;

            std::swap(VCCRValue, OppositeVCCRValue);
            LastVPNOTResult = Iter->getOperand(0).getReg();
          }
        }
      }

      // If this instruction was not interesting, and it writes to VCCR, stop.
      if (!IsInteresting && IsWritingToVCCR(*Iter))
        break;
    }
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return Modified;
}

// This optimisation replaces VCMPs with VPNOTs when they are equivalent.
bool MVETPAndVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
  SmallVector<MachineInstr *, 4> DeadInstructions;

  // The last VCMP that we have seen and that couldn't be replaced.
  // This is reset when an instruction that writes to VCCR/VPR is found, or when
  // a VCMP is replaced with a VPNOT.
  // We'll only replace VCMPs with VPNOTs when this is not null, and when the
  // current VCMP is the opposite of PrevVCMP.
  MachineInstr *PrevVCMP = nullptr;
  // If we find an instruction that kills the result of PrevVCMP, we save the
  // operand here to remove the kill flag in case we need to use PrevVCMP's
  // result.
  MachineOperand *PrevVCMPResultKiller = nullptr;

  for (MachineInstr &Instr : MBB.instrs()) {
    if (PrevVCMP) {
      if (MachineOperand *MO = Instr.findRegisterUseOperand(
              PrevVCMP->getOperand(0).getReg(), /*isKill*/ true)) {
        // If we come across the instr that kills PrevVCMP's result, record it
        // so we can remove the kill flag later if we need to.
        PrevVCMPResultKiller = MO;
      }
    }

    // Ignore predicated instructions.
    if (getVPTInstrPredicate(Instr) != ARMVCC::None)
      continue;

    // Only look at VCMPs.
    if (!IsVCMP(Instr.getOpcode())) {
      // If the instruction writes to VCCR, forget the previous VCMP.
      if (IsWritingToVCCR(Instr))
        PrevVCMP = nullptr;
      continue;
    }

    if (!PrevVCMP || !IsVPNOTEquivalent(Instr, *PrevVCMP)) {
      PrevVCMP = &Instr;
      continue;
    }

    // The register containing the result of the VCMP that we're going to
    // replace.
    Register PrevVCMPResultReg = PrevVCMP->getOperand(0).getReg();

    // Build a VPNOT to replace the VCMP, reusing its operands.
    MachineInstrBuilder MIBuilder =
        BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
            .add(Instr.getOperand(0))
            .addReg(PrevVCMPResultReg);
    addUnpredicatedMveVpredNOp(MIBuilder);
    LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): ";
               MIBuilder.getInstr()->dump(); dbgs() << "  Removed VCMP: ";
               Instr.dump());

    // If we found an instruction that uses, and kills, PrevVCMP's result,
    // remove the kill flag.
    if (PrevVCMPResultKiller)
      PrevVCMPResultKiller->setIsKill(false);

    // Finally, mark the old VCMP for removal and reset
    // PrevVCMP/PrevVCMPResultKiller.
    DeadInstructions.push_back(&Instr);
    PrevVCMP = nullptr;
    PrevVCMPResultKiller = nullptr;
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return !DeadInstructions.empty();
}

bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
                                                    MachineDominatorTree *DT) {
  // Scan through the block, looking for instructions that use constants moved
  // into VPR that are the negative of one another. These are expected to be
  // COPYs into VCCRRegClass, from a t2MOVi or t2MOVi16. The last seen constant
  // mask is kept, and VPNOTs of it are added or reused as we scan through the
  // function.
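  //
  // For example (illustrative MIR, not from a real test), with two masks that
  // are bitwise inverses of one another:
  //   %m1:rgpr = t2MOVi16 21845    ; 0x5555
  //   %p1:vccr = COPY %m1
  //   %m2:rgpr = t2MOVi16 43690    ; 0xaaaa
  //   %p2:vccr = COPY %m2
  // the second predicate can instead be formed as %p2:vccr = MVE_VPNOT %p1.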
  unsigned LastVPTImm = 0;
  Register LastVPTReg = 0;
  SmallSet<MachineInstr *, 4> DeadInstructions;

  for (MachineInstr &Instr : MBB.instrs()) {
    // Look for predicated MVE instructions.
    int PIdx = llvm::findFirstVPTPredOperandIdx(Instr);
    if (PIdx == -1)
      continue;
    Register VPR = Instr.getOperand(PIdx + 1).getReg();
    if (!VPR.isVirtual())
      continue;

    // From that we are looking for an instruction like %11:vccr = COPY %9:rgpr.
    MachineInstr *Copy = MRI->getVRegDef(VPR);
    if (!Copy || Copy->getOpcode() != TargetOpcode::COPY ||
        !Copy->getOperand(1).getReg().isVirtual() ||
        MRI->getRegClass(Copy->getOperand(1).getReg()) == &ARM::VCCRRegClass) {
      LastVPTReg = 0;
      continue;
    }
    Register GPR = Copy->getOperand(1).getReg();

    // Find the Immediate used by the copy.
    auto getImm = [&](Register GPR) -> unsigned {
      MachineInstr *Def = MRI->getVRegDef(GPR);
      if (Def && (Def->getOpcode() == ARM::t2MOVi ||
                  Def->getOpcode() == ARM::t2MOVi16))
        return Def->getOperand(1).getImm();
      return -1U;
    };
    unsigned Imm = getImm(GPR);
    if (Imm == -1U) {
      LastVPTReg = 0;
      continue;
    }

    unsigned NotImm = ~Imm & 0xffff;
    if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) {
      Instr.getOperand(PIdx + 1).setReg(LastVPTReg);
      if (MRI->use_empty(VPR)) {
        DeadInstructions.insert(Copy);
        if (MRI->hasOneUse(GPR))
          DeadInstructions.insert(MRI->getVRegDef(GPR));
      }
      LLVM_DEBUG(dbgs() << "Reusing predicate: in " << Instr);
    } else if (LastVPTReg != 0 && LastVPTImm == NotImm) {
      // We have found the not of a previous constant. Create a VPNot of the
      // earlier predicate reg and use it instead of the copy.
      Register NewVPR = MRI->createVirtualRegister(&ARM::VCCRRegClass);
      auto VPNot = BuildMI(MBB, &Instr, Instr.getDebugLoc(),
                           TII->get(ARM::MVE_VPNOT), NewVPR)
                       .addReg(LastVPTReg);
      addUnpredicatedMveVpredNOp(VPNot);

      // Use the new register and check if the def is now dead.
      Instr.getOperand(PIdx + 1).setReg(NewVPR);
      if (MRI->use_empty(VPR)) {
        DeadInstructions.insert(Copy);
        if (MRI->hasOneUse(GPR))
          DeadInstructions.insert(MRI->getVRegDef(GPR));
      }
      LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << " to replace use at "
                        << Instr);
      VPR = NewVPR;
    }

    LastVPTImm = Imm;
    LastVPTReg = VPR;
  }

  for (MachineInstr *DI : DeadInstructions)
    DI->eraseFromParent();

  return !DeadInstructions.empty();
}

// Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a
// somewhat blunt approximation to allow tail predication with vpsel
// instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly
// different semantics under tail predication. Until that is modelled we just
// convert to a VMOVT (via a predicated VORR) instead.
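//
// Roughly (illustrative, operand order simplified):
//   %dst = MVE_VPSEL %a, %b, %vpr
// becomes a move of %a predicated "Then" on %vpr, with %b supplying the
// inactive lanes:
//   %dst = MVE_VORR %a, %a, Then, %vpr, %b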
bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
  bool HasVCTP = false;
  SmallVector<MachineInstr *, 4> DeadInstructions;

  for (MachineInstr &MI : MBB.instrs()) {
    if (isVCTP(&MI)) {
      HasVCTP = true;
      continue;
    }

    if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL)
      continue;

    MachineInstrBuilder MIBuilder =
        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::MVE_VORR))
            .add(MI.getOperand(0))
            .add(MI.getOperand(1))
            .add(MI.getOperand(1))
            .addImm(ARMVCC::Then)
            .add(MI.getOperand(4))
            .add(MI.getOperand(2));
    // Silence unused variable warning in release builds.
    (void)MIBuilder;
    LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; MI.dump();
               dbgs() << "  with VMOVT: "; MIBuilder.getInstr()->dump());
    DeadInstructions.push_back(&MI);
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return !DeadInstructions.empty();
}

// Add a register allocation hint for t2DoLoopStart to hint it towards LR, as
// the instruction may be removable as a noop.
bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) {
  bool Changed = false;
  for (MachineInstr &MI : MBB.instrs()) {
    if (MI.getOpcode() != ARM::t2DoLoopStart)
      continue;
    Register R = MI.getOperand(1).getReg();
    MachineFunction *MF = MI.getParent()->getParent();
    MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0);
    Changed = true;
  }
  return Changed;
}

bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
  const ARMSubtarget &STI =
      static_cast<const ARMSubtarget &>(Fn.getSubtarget());

  if (!STI.isThumb2() || !STI.hasLOB())
    return false;

  TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
  MRI = &Fn.getRegInfo();
  MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfo>();
  MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();

  LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
                    << "********** Function: " << Fn.getName() << '\n');

  bool Modified = false;
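  // Optimise the low overhead loop pseudos one loop at a time.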
  for (MachineLoop *ML : MLI->getBase().getLoopsInPreorder()) {
    Modified |= LowerWhileLoopStart(ML);
    Modified |= MergeLoopEnd(ML);
    Modified |= ConvertTailPredLoop(ML, DT);
  }
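
  // Then run the per-block optimisations (regalloc hints and the VPT/VCCR
  // cleanups).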
  for (MachineBasicBlock &MBB : Fn) {
    Modified |= HintDoLoopStartReg(MBB);
    Modified |= ReplaceConstByVPNOTs(MBB, DT);
    Modified |= ReplaceVCMPsByVPNOTs(MBB);
    Modified |= ReduceOldVCCRValueUses(MBB);
    Modified |= ConvertVPSEL(MBB);
  }

  LLVM_DEBUG(dbgs() << "**************************************\n");
  return Modified;
}

/// createMVETPAndVPTOptimisationsPass
FunctionPass *llvm::createMVETPAndVPTOptimisationsPass() {
  return new MVETPAndVPTOptimisations();
}