//===-- MVETPAndVPTOptimisationsPass.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass does a few optimisations related to Tail predicated loops
/// and MVE VPT blocks before register allocation is performed. For VPT blocks
/// the goal is to maximize the sizes of the blocks that will be created by the
/// MVE VPT Block Insertion pass (which runs after register allocation). For
/// tail predicated loops we transform the loop into something that will
/// hopefully make the backend ARMLowOverheadLoops pass's job easier.
///
//===----------------------------------------------------------------------===//
19 #include "ARMSubtarget.h"
20 #include "MVETailPredUtils.h"
21 #include "Thumb2InstrInfo.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/CodeGen/MachineBasicBlock.h"
24 #include "llvm/CodeGen/MachineDominators.h"
25 #include "llvm/CodeGen/MachineFunction.h"
26 #include "llvm/CodeGen/MachineFunctionPass.h"
27 #include "llvm/CodeGen/MachineInstr.h"
28 #include "llvm/CodeGen/MachineLoopInfo.h"
29 #include "llvm/InitializePasses.h"
30 #include "llvm/Support/Debug.h"
35 #define DEBUG_TYPE "arm-mve-vpt-opts"
38 MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden
,
39 cl::desc("Enable merging Loop End and Dec instructions."),
43 SetLRPredicate("arm-set-lr-predicate", cl::Hidden
,
44 cl::desc("Enable setting lr as a predicate in tail predication regions."),
48 class MVETPAndVPTOptimisations
: public MachineFunctionPass
{
51 const Thumb2InstrInfo
*TII
;
52 MachineRegisterInfo
*MRI
;
54 MVETPAndVPTOptimisations() : MachineFunctionPass(ID
) {}
56 bool runOnMachineFunction(MachineFunction
&Fn
) override
;
58 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
59 AU
.addRequired
<MachineLoopInfoWrapperPass
>();
60 AU
.addPreserved
<MachineLoopInfoWrapperPass
>();
61 AU
.addRequired
<MachineDominatorTreeWrapperPass
>();
62 AU
.addPreserved
<MachineDominatorTreeWrapperPass
>();
63 MachineFunctionPass::getAnalysisUsage(AU
);
66 StringRef
getPassName() const override
{
67 return "ARM MVE TailPred and VPT Optimisation Pass";
71 bool LowerWhileLoopStart(MachineLoop
*ML
);
72 bool MergeLoopEnd(MachineLoop
*ML
);
73 bool ConvertTailPredLoop(MachineLoop
*ML
, MachineDominatorTree
*DT
);
74 MachineInstr
&ReplaceRegisterUseWithVPNOT(MachineBasicBlock
&MBB
,
78 bool ReduceOldVCCRValueUses(MachineBasicBlock
&MBB
);
79 bool ReplaceVCMPsByVPNOTs(MachineBasicBlock
&MBB
);
80 bool ReplaceConstByVPNOTs(MachineBasicBlock
&MBB
, MachineDominatorTree
*DT
);
81 bool ConvertVPSEL(MachineBasicBlock
&MBB
);
82 bool HintDoLoopStartReg(MachineBasicBlock
&MBB
);
83 MachineInstr
*CheckForLRUseInPredecessors(MachineBasicBlock
*PreHeader
,
84 MachineInstr
*LoopStart
);
87 char MVETPAndVPTOptimisations::ID
= 0;
89 } // end anonymous namespace
91 INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations
, DEBUG_TYPE
,
92 "ARM MVE TailPred and VPT Optimisations pass", false,
94 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass
)
95 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass
)
96 INITIALIZE_PASS_END(MVETPAndVPTOptimisations
, DEBUG_TYPE
,
97 "ARM MVE TailPred and VPT Optimisations pass", false, false)
99 static MachineInstr
*LookThroughCOPY(MachineInstr
*MI
,
100 MachineRegisterInfo
*MRI
) {
101 while (MI
&& MI
->getOpcode() == TargetOpcode::COPY
&&
102 MI
->getOperand(1).getReg().isVirtual())
103 MI
= MRI
->getVRegDef(MI
->getOperand(1).getReg());
107 // Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and
108 // corresponding PHI that make up a low overhead loop. Only handles 'do' loops
109 // at the moment, returning a t2DoLoopStart in LoopStart.
110 static bool findLoopComponents(MachineLoop
*ML
, MachineRegisterInfo
*MRI
,
111 MachineInstr
*&LoopStart
, MachineInstr
*&LoopPhi
,
112 MachineInstr
*&LoopDec
, MachineInstr
*&LoopEnd
) {
113 MachineBasicBlock
*Header
= ML
->getHeader();
114 MachineBasicBlock
*Latch
= ML
->getLoopLatch();
115 if (!Header
|| !Latch
) {
116 LLVM_DEBUG(dbgs() << " no Loop Latch or Header\n");
120 // Find the loop end from the terminators.
122 for (auto &T
: Latch
->terminators()) {
123 if (T
.getOpcode() == ARM::t2LoopEnd
&& T
.getOperand(1).getMBB() == Header
) {
127 if (T
.getOpcode() == ARM::t2LoopEndDec
&&
128 T
.getOperand(2).getMBB() == Header
) {
134 LLVM_DEBUG(dbgs() << " no LoopEnd\n");
137 LLVM_DEBUG(dbgs() << " found loop end: " << *LoopEnd
);
139 // Find the dec from the use of the end. There may be copies between
140 // instructions. We expect the loop to loop like:
141 // $vs = t2DoLoopStart ...
143 // $vp = phi [ $vs ], [ $vd ]
145 // $vd = t2LoopDec $vp
147 // t2LoopEnd $vd, loop
148 if (LoopEnd
->getOpcode() == ARM::t2LoopEndDec
)
152 LookThroughCOPY(MRI
->getVRegDef(LoopEnd
->getOperand(0).getReg()), MRI
);
153 if (!LoopDec
|| LoopDec
->getOpcode() != ARM::t2LoopDec
) {
154 LLVM_DEBUG(dbgs() << " didn't find LoopDec where we expected!\n");
158 LLVM_DEBUG(dbgs() << " found loop dec: " << *LoopDec
);
161 LookThroughCOPY(MRI
->getVRegDef(LoopDec
->getOperand(1).getReg()), MRI
);
162 if (!LoopPhi
|| LoopPhi
->getOpcode() != TargetOpcode::PHI
||
163 LoopPhi
->getNumOperands() != 5 ||
164 (LoopPhi
->getOperand(2).getMBB() != Latch
&&
165 LoopPhi
->getOperand(4).getMBB() != Latch
)) {
166 LLVM_DEBUG(dbgs() << " didn't find PHI where we expected!\n");
169 LLVM_DEBUG(dbgs() << " found loop phi: " << *LoopPhi
);
171 Register StartReg
= LoopPhi
->getOperand(2).getMBB() == Latch
172 ? LoopPhi
->getOperand(3).getReg()
173 : LoopPhi
->getOperand(1).getReg();
174 LoopStart
= LookThroughCOPY(MRI
->getVRegDef(StartReg
), MRI
);
175 if (!LoopStart
|| (LoopStart
->getOpcode() != ARM::t2DoLoopStart
&&
176 LoopStart
->getOpcode() != ARM::t2WhileLoopSetup
&&
177 LoopStart
->getOpcode() != ARM::t2WhileLoopStartLR
)) {
178 LLVM_DEBUG(dbgs() << " didn't find Start where we expected!\n");
181 LLVM_DEBUG(dbgs() << " found loop start: " << *LoopStart
);
186 static void RevertWhileLoopSetup(MachineInstr
*MI
, const TargetInstrInfo
*TII
) {
187 MachineBasicBlock
*MBB
= MI
->getParent();
188 assert(MI
->getOpcode() == ARM::t2WhileLoopSetup
&&
189 "Only expected a t2WhileLoopSetup in RevertWhileLoopStart!");
192 MachineInstrBuilder MIB
=
193 BuildMI(*MBB
, MI
, MI
->getDebugLoc(), TII
->get(ARM::t2SUBri
));
194 MIB
.add(MI
->getOperand(0));
195 MIB
.add(MI
->getOperand(1));
197 MIB
.addImm(ARMCC::AL
);
198 MIB
.addReg(ARM::NoRegister
);
199 MIB
.addReg(ARM::CPSR
, RegState::Define
);
201 // Attempt to find a t2WhileLoopStart and revert to a t2Bcc.
202 for (MachineInstr
&I
: MBB
->terminators()) {
203 if (I
.getOpcode() == ARM::t2WhileLoopStart
) {
204 MachineInstrBuilder MIB
=
205 BuildMI(*MBB
, &I
, I
.getDebugLoc(), TII
->get(ARM::t2Bcc
));
206 MIB
.add(MI
->getOperand(1)); // branch target
207 MIB
.addImm(ARMCC::EQ
);
208 MIB
.addReg(ARM::CPSR
);
214 MI
->eraseFromParent();
217 // The Hardware Loop insertion and ISel Lowering produce the pseudos for the
218 // start of a while loop:
219 // %a:gprlr = t2WhileLoopSetup %Cnt
220 // t2WhileLoopStart %a, %BB
221 // We want to convert those to a single instruction which, like t2LoopEndDec and
222 // t2DoLoopStartTP is both a terminator and produces a value:
223 // %a:grplr: t2WhileLoopStartLR %Cnt, %BB
225 // Otherwise if we can't, we revert the loop. t2WhileLoopSetup and
226 // t2WhileLoopStart are not valid past regalloc.
227 bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop
*ML
) {
228 LLVM_DEBUG(dbgs() << "LowerWhileLoopStart on loop "
229 << ML
->getHeader()->getName() << "\n");
231 MachineInstr
*LoopEnd
, *LoopPhi
, *LoopStart
, *LoopDec
;
232 if (!findLoopComponents(ML
, MRI
, LoopStart
, LoopPhi
, LoopDec
, LoopEnd
))
235 if (LoopStart
->getOpcode() != ARM::t2WhileLoopSetup
)
238 Register LR
= LoopStart
->getOperand(0).getReg();
239 auto WLSIt
= find_if(MRI
->use_nodbg_instructions(LR
), [](auto &MI
) {
240 return MI
.getOpcode() == ARM::t2WhileLoopStart
;
242 if (!MergeEndDec
|| WLSIt
== MRI
->use_instr_nodbg_end()) {
243 RevertWhileLoopSetup(LoopStart
, TII
);
244 RevertLoopDec(LoopStart
, TII
);
245 RevertLoopEnd(LoopStart
, TII
);
249 MachineInstrBuilder MI
=
250 BuildMI(*WLSIt
->getParent(), *WLSIt
, WLSIt
->getDebugLoc(),
251 TII
->get(ARM::t2WhileLoopStartLR
), LR
)
252 .add(LoopStart
->getOperand(1))
253 .add(WLSIt
->getOperand(1));
255 LLVM_DEBUG(dbgs() << "Lowered WhileLoopStart into: " << *MI
.getInstr());
257 WLSIt
->eraseFromParent();
258 LoopStart
->eraseFromParent();
262 // Return true if this instruction is invalid in a low overhead loop, usually
263 // because it clobbers LR.
264 static bool IsInvalidTPInstruction(MachineInstr
&MI
) {
265 return MI
.isCall() || isLoopStart(MI
);
268 // Starting from PreHeader, search for invalid instructions back until the
269 // LoopStart block is reached. If invalid instructions are found, the loop start
270 // is reverted from a WhileLoopStart to a DoLoopStart on the same loop. Will
271 // return the new DLS LoopStart if updated.
272 MachineInstr
*MVETPAndVPTOptimisations::CheckForLRUseInPredecessors(
273 MachineBasicBlock
*PreHeader
, MachineInstr
*LoopStart
) {
274 SmallVector
<MachineBasicBlock
*> Worklist
;
275 SmallPtrSet
<MachineBasicBlock
*, 4> Visited
;
276 Worklist
.push_back(PreHeader
);
277 Visited
.insert(LoopStart
->getParent());
279 while (!Worklist
.empty()) {
280 MachineBasicBlock
*MBB
= Worklist
.pop_back_val();
281 if (Visited
.count(MBB
))
284 for (MachineInstr
&MI
: *MBB
) {
285 if (!IsInvalidTPInstruction(MI
))
288 LLVM_DEBUG(dbgs() << "Found LR use in predecessors, reverting: " << MI
);
290 // Create a t2DoLoopStart at the end of the preheader.
291 MachineInstrBuilder MIB
=
292 BuildMI(*PreHeader
, PreHeader
->getFirstTerminator(),
293 LoopStart
->getDebugLoc(), TII
->get(ARM::t2DoLoopStart
));
294 MIB
.add(LoopStart
->getOperand(0));
295 MIB
.add(LoopStart
->getOperand(1));
297 // Make sure to remove the kill flags, to prevent them from being invalid.
298 LoopStart
->getOperand(1).setIsKill(false);
300 // Revert the t2WhileLoopStartLR to a CMP and Br.
301 RevertWhileLoopStartLR(LoopStart
, TII
, ARM::t2Bcc
, true);
306 for (auto *Pred
: MBB
->predecessors())
307 Worklist
.push_back(Pred
);
312 // This function converts loops with t2LoopEnd and t2LoopEnd instructions into
313 // a single t2LoopEndDec instruction. To do that it needs to make sure that LR
314 // will be valid to be used for the low overhead loop, which means nothing else
315 // is using LR (especially calls) and there are no superfluous copies in the
316 // loop. The t2LoopEndDec is a branching terminator that produces a value (the
317 // decrement) around the loop edge, which means we need to be careful that they
318 // will be valid to allocate without any spilling.
319 bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop
*ML
) {
323 LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML
->getHeader()->getName()
326 MachineInstr
*LoopEnd
, *LoopPhi
, *LoopStart
, *LoopDec
;
327 if (!findLoopComponents(ML
, MRI
, LoopStart
, LoopPhi
, LoopDec
, LoopEnd
))
330 // Check if there is an illegal instruction (a call) in the low overhead loop
331 // and if so revert it now before we get any further. While loops also need to
332 // check the preheaders, but can be reverted to a DLS loop if needed.
333 auto *PreHeader
= ML
->getLoopPreheader();
334 if (LoopStart
->getOpcode() == ARM::t2WhileLoopStartLR
&& PreHeader
)
335 LoopStart
= CheckForLRUseInPredecessors(PreHeader
, LoopStart
);
337 for (MachineBasicBlock
*MBB
: ML
->blocks()) {
338 for (MachineInstr
&MI
: *MBB
) {
339 if (IsInvalidTPInstruction(MI
)) {
340 LLVM_DEBUG(dbgs() << "Found LR use in loop, reverting: " << MI
);
341 if (LoopStart
->getOpcode() == ARM::t2DoLoopStart
)
342 RevertDoLoopStart(LoopStart
, TII
);
344 RevertWhileLoopStartLR(LoopStart
, TII
);
345 RevertLoopDec(LoopDec
, TII
);
346 RevertLoopEnd(LoopEnd
, TII
);
352 // Remove any copies from the loop, to ensure the phi that remains is both
353 // simpler and contains no extra uses. Because t2LoopEndDec is a terminator
354 // that cannot spill, we need to be careful what remains in the loop.
355 Register PhiReg
= LoopPhi
->getOperand(0).getReg();
356 Register DecReg
= LoopDec
->getOperand(0).getReg();
357 Register StartReg
= LoopStart
->getOperand(0).getReg();
358 // Ensure the uses are expected, and collect any copies we want to remove.
359 SmallVector
<MachineInstr
*, 4> Copies
;
360 auto CheckUsers
= [&Copies
](Register BaseReg
,
361 ArrayRef
<MachineInstr
*> ExpectedUsers
,
362 MachineRegisterInfo
*MRI
) {
363 SmallVector
<Register
, 4> Worklist
;
364 Worklist
.push_back(BaseReg
);
365 while (!Worklist
.empty()) {
366 Register Reg
= Worklist
.pop_back_val();
367 for (MachineInstr
&MI
: MRI
->use_nodbg_instructions(Reg
)) {
368 if (llvm::is_contained(ExpectedUsers
, &MI
))
370 if (MI
.getOpcode() != TargetOpcode::COPY
||
371 !MI
.getOperand(0).getReg().isVirtual()) {
372 LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI
);
375 Worklist
.push_back(MI
.getOperand(0).getReg());
376 Copies
.push_back(&MI
);
381 if (!CheckUsers(PhiReg
, {LoopDec
}, MRI
) ||
382 !CheckUsers(DecReg
, {LoopPhi
, LoopEnd
}, MRI
) ||
383 !CheckUsers(StartReg
, {LoopPhi
}, MRI
)) {
384 // Don't leave a t2WhileLoopStartLR without the LoopDecEnd.
385 if (LoopStart
->getOpcode() == ARM::t2WhileLoopStartLR
) {
386 RevertWhileLoopStartLR(LoopStart
, TII
);
387 RevertLoopDec(LoopDec
, TII
);
388 RevertLoopEnd(LoopEnd
, TII
);
394 MRI
->constrainRegClass(StartReg
, &ARM::GPRlrRegClass
);
395 MRI
->constrainRegClass(PhiReg
, &ARM::GPRlrRegClass
);
396 MRI
->constrainRegClass(DecReg
, &ARM::GPRlrRegClass
);
398 if (LoopPhi
->getOperand(2).getMBB() == ML
->getLoopLatch()) {
399 LoopPhi
->getOperand(3).setReg(StartReg
);
400 LoopPhi
->getOperand(1).setReg(DecReg
);
402 LoopPhi
->getOperand(1).setReg(StartReg
);
403 LoopPhi
->getOperand(3).setReg(DecReg
);
406 SmallVector
<MachineOperand
, 4> Cond
; // For analyzeBranch.
407 MachineBasicBlock
*TBB
= nullptr, *FBB
= nullptr; // For analyzeBranch.
408 if (!TII
->analyzeBranch(*LoopEnd
->getParent(), TBB
, FBB
, Cond
) && !FBB
) {
409 // If the LoopEnd falls through, need to insert a t2B to the fall-through
410 // block so that the non-analyzable t2LoopEndDec doesn't fall through.
411 MachineFunction::iterator MBBI
= ++LoopEnd
->getParent()->getIterator();
412 BuildMI(LoopEnd
->getParent(), DebugLoc(), TII
->get(ARM::t2B
))
414 .add(predOps(ARMCC::AL
));
417 // Replace the loop dec and loop end as a single instruction.
418 MachineInstrBuilder MI
=
419 BuildMI(*LoopEnd
->getParent(), *LoopEnd
, LoopEnd
->getDebugLoc(),
420 TII
->get(ARM::t2LoopEndDec
), DecReg
)
422 .add(LoopEnd
->getOperand(1));
424 LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI
.getInstr());
426 LoopDec
->eraseFromParent();
427 LoopEnd
->eraseFromParent();
428 for (auto *MI
: Copies
)
429 MI
->eraseFromParent();
433 // Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP
434 // instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP
435 // instruction, making the backend ARMLowOverheadLoops passes job of finding the
436 // VCTP operand much simpler.
437 bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop
*ML
,
438 MachineDominatorTree
*DT
) {
439 LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop "
440 << ML
->getHeader()->getName() << "\n");
442 // Find some loop components including the LoopEnd/Dec/Start, and any VCTP's
444 MachineInstr
*LoopEnd
, *LoopPhi
, *LoopStart
, *LoopDec
;
445 if (!findLoopComponents(ML
, MRI
, LoopStart
, LoopPhi
, LoopDec
, LoopEnd
))
447 if (LoopDec
!= LoopEnd
|| (LoopStart
->getOpcode() != ARM::t2DoLoopStart
&&
448 LoopStart
->getOpcode() != ARM::t2WhileLoopStartLR
))
451 SmallVector
<MachineInstr
*, 4> VCTPs
;
452 SmallVector
<MachineInstr
*, 4> MVEInstrs
;
453 for (MachineBasicBlock
*BB
: ML
->blocks()) {
454 for (MachineInstr
&MI
: *BB
)
456 VCTPs
.push_back(&MI
);
457 else if (findFirstVPTPredOperandIdx(MI
) != -1)
458 MVEInstrs
.push_back(&MI
);
462 LLVM_DEBUG(dbgs() << " no VCTPs\n");
466 // Check all VCTPs are the same.
467 MachineInstr
*FirstVCTP
= *VCTPs
.begin();
468 for (MachineInstr
*VCTP
: VCTPs
) {
469 LLVM_DEBUG(dbgs() << " with VCTP " << *VCTP
);
470 if (VCTP
->getOpcode() != FirstVCTP
->getOpcode() ||
471 VCTP
->getOperand(0).getReg() != FirstVCTP
->getOperand(0).getReg()) {
472 LLVM_DEBUG(dbgs() << " VCTP's are not identical\n");
477 // Check for the register being used can be setup before the loop. We expect
481 // $vp = PHI [ $vx ], [ $vd ]
485 // $vd = t2SUBri $vp, #n
487 Register CountReg
= FirstVCTP
->getOperand(1).getReg();
488 if (!CountReg
.isVirtual()) {
489 LLVM_DEBUG(dbgs() << " cannot determine VCTP PHI\n");
492 MachineInstr
*Phi
= LookThroughCOPY(MRI
->getVRegDef(CountReg
), MRI
);
493 if (!Phi
|| Phi
->getOpcode() != TargetOpcode::PHI
||
494 Phi
->getNumOperands() != 5 ||
495 (Phi
->getOperand(2).getMBB() != ML
->getLoopLatch() &&
496 Phi
->getOperand(4).getMBB() != ML
->getLoopLatch())) {
497 LLVM_DEBUG(dbgs() << " cannot determine VCTP Count\n");
500 CountReg
= Phi
->getOperand(2).getMBB() == ML
->getLoopLatch()
501 ? Phi
->getOperand(3).getReg()
502 : Phi
->getOperand(1).getReg();
504 // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of
505 // the preheader and add the new CountReg to it. We attempt to place it late
506 // in the preheader, but may need to move that earlier based on uses.
507 MachineBasicBlock
*MBB
= LoopStart
->getParent();
508 MachineBasicBlock::iterator InsertPt
= MBB
->getFirstTerminator();
509 for (MachineInstr
&Use
:
510 MRI
->use_instructions(LoopStart
->getOperand(0).getReg()))
511 if ((InsertPt
!= MBB
->end() && !DT
->dominates(&*InsertPt
, &Use
)) ||
512 !DT
->dominates(ML
->getHeader(), Use
.getParent())) {
513 LLVM_DEBUG(dbgs() << " InsertPt could not be a terminator!\n");
517 unsigned NewOpc
= LoopStart
->getOpcode() == ARM::t2DoLoopStart
518 ? ARM::t2DoLoopStartTP
519 : ARM::t2WhileLoopStartTP
;
520 MachineInstrBuilder MI
=
521 BuildMI(*MBB
, InsertPt
, LoopStart
->getDebugLoc(), TII
->get(NewOpc
))
522 .add(LoopStart
->getOperand(0))
523 .add(LoopStart
->getOperand(1))
525 if (NewOpc
== ARM::t2WhileLoopStartTP
)
526 MI
.add(LoopStart
->getOperand(2));
527 LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart
<< " with "
529 MRI
->constrainRegClass(CountReg
, &ARM::rGPRRegClass
);
530 LoopStart
->eraseFromParent();
532 if (SetLRPredicate
) {
533 // Each instruction in the loop needs to be using LR as the predicate from
534 // the Phi as the predicate.
535 Register LR
= LoopPhi
->getOperand(0).getReg();
536 for (MachineInstr
*MI
: MVEInstrs
) {
537 int Idx
= findFirstVPTPredOperandIdx(*MI
);
538 MI
->getOperand(Idx
+ 2).setReg(LR
);
545 // Returns true if Opcode is any VCMP Opcode.
546 static bool IsVCMP(unsigned Opcode
) { return VCMPOpcodeToVPT(Opcode
) != 0; }
548 // Returns true if a VCMP with this Opcode can have its operands swapped.
549 // There is 2 kind of VCMP that can't have their operands swapped: Float VCMPs,
550 // and VCMPr instructions (since the r is always on the right).
551 static bool CanHaveSwappedOperands(unsigned Opcode
) {
555 case ARM::MVE_VCMPf32
:
556 case ARM::MVE_VCMPf16
:
557 case ARM::MVE_VCMPf32r
:
558 case ARM::MVE_VCMPf16r
:
559 case ARM::MVE_VCMPi8r
:
560 case ARM::MVE_VCMPi16r
:
561 case ARM::MVE_VCMPi32r
:
562 case ARM::MVE_VCMPu8r
:
563 case ARM::MVE_VCMPu16r
:
564 case ARM::MVE_VCMPu32r
:
565 case ARM::MVE_VCMPs8r
:
566 case ARM::MVE_VCMPs16r
:
567 case ARM::MVE_VCMPs32r
:
572 // Returns the CondCode of a VCMP Instruction.
573 static ARMCC::CondCodes
GetCondCode(MachineInstr
&Instr
) {
574 assert(IsVCMP(Instr
.getOpcode()) && "Inst must be a VCMP");
575 return ARMCC::CondCodes(Instr
.getOperand(3).getImm());
578 // Returns true if Cond is equivalent to a VPNOT instruction on the result of
579 // Prev. Cond and Prev must be VCMPs.
580 static bool IsVPNOTEquivalent(MachineInstr
&Cond
, MachineInstr
&Prev
) {
581 assert(IsVCMP(Cond
.getOpcode()) && IsVCMP(Prev
.getOpcode()));
583 // Opcodes must match.
584 if (Cond
.getOpcode() != Prev
.getOpcode())
587 MachineOperand
&CondOP1
= Cond
.getOperand(1), &CondOP2
= Cond
.getOperand(2);
588 MachineOperand
&PrevOP1
= Prev
.getOperand(1), &PrevOP2
= Prev
.getOperand(2);
590 // If the VCMP has the opposite condition with the same operands, we can
591 // replace it with a VPNOT
592 ARMCC::CondCodes ExpectedCode
= GetCondCode(Cond
);
593 ExpectedCode
= ARMCC::getOppositeCondition(ExpectedCode
);
594 if (ExpectedCode
== GetCondCode(Prev
))
595 if (CondOP1
.isIdenticalTo(PrevOP1
) && CondOP2
.isIdenticalTo(PrevOP2
))
597 // Check again with operands swapped if possible
598 if (!CanHaveSwappedOperands(Cond
.getOpcode()))
600 ExpectedCode
= ARMCC::getSwappedCondition(ExpectedCode
);
601 return ExpectedCode
== GetCondCode(Prev
) && CondOP1
.isIdenticalTo(PrevOP2
) &&
602 CondOP2
.isIdenticalTo(PrevOP1
);
605 // Returns true if Instr writes to VCCR.
606 static bool IsWritingToVCCR(MachineInstr
&Instr
) {
607 if (Instr
.getNumOperands() == 0)
609 MachineOperand
&Dst
= Instr
.getOperand(0);
612 Register DstReg
= Dst
.getReg();
613 if (!DstReg
.isVirtual())
615 MachineRegisterInfo
&RegInfo
= Instr
.getMF()->getRegInfo();
616 const TargetRegisterClass
*RegClass
= RegInfo
.getRegClassOrNull(DstReg
);
617 return RegClass
&& (RegClass
->getID() == ARM::VCCRRegClassID
);
621 // <Instr that uses %A ('User' Operand)>
623 // %K = VPNOT %Target
624 // <Instr that uses %K ('User' Operand)>
625 // And returns the newly inserted VPNOT.
626 // This optimization is done in the hopes of preventing spills/reloads of VPR by
627 // reducing the number of VCCR values with overlapping lifetimes.
628 MachineInstr
&MVETPAndVPTOptimisations::ReplaceRegisterUseWithVPNOT(
629 MachineBasicBlock
&MBB
, MachineInstr
&Instr
, MachineOperand
&User
,
631 Register NewResult
= MRI
->createVirtualRegister(MRI
->getRegClass(Target
));
633 MachineInstrBuilder MIBuilder
=
634 BuildMI(MBB
, &Instr
, Instr
.getDebugLoc(), TII
->get(ARM::MVE_VPNOT
))
637 addUnpredicatedMveVpredNOp(MIBuilder
);
639 // Make the user use NewResult instead, and clear its kill flag.
640 User
.setReg(NewResult
);
641 User
.setIsKill(false);
643 LLVM_DEBUG(dbgs() << " Inserting VPNOT (for spill prevention): ";
644 MIBuilder
.getInstr()->dump());
646 return *MIBuilder
.getInstr();
649 // Moves a VPNOT before its first user if an instruction that uses Reg is found
650 // in-between the VPNOT and its user.
651 // Returns true if there is at least one user of the VPNOT in the block.
652 static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock
&MBB
,
653 MachineBasicBlock::iterator Iter
,
655 assert(Iter
->getOpcode() == ARM::MVE_VPNOT
&& "Not a VPNOT!");
656 assert(getVPTInstrPredicate(*Iter
) == ARMVCC::None
&&
657 "The VPNOT cannot be predicated");
659 MachineInstr
&VPNOT
= *Iter
;
660 Register VPNOTResult
= VPNOT
.getOperand(0).getReg();
661 Register VPNOTOperand
= VPNOT
.getOperand(1).getReg();
663 // Whether the VPNOT will need to be moved, and whether we found a user of the
665 bool MustMove
= false, HasUser
= false;
666 MachineOperand
*VPNOTOperandKiller
= nullptr;
667 for (; Iter
!= MBB
.end(); ++Iter
) {
668 if (MachineOperand
*MO
=
669 Iter
->findRegisterUseOperand(VPNOTOperand
, /*TRI=*/nullptr,
671 // If we find the operand that kills the VPNOTOperand's result, save it.
672 VPNOTOperandKiller
= MO
;
675 if (Iter
->findRegisterUseOperandIdx(Reg
, /*TRI=*/nullptr) != -1) {
680 if (Iter
->findRegisterUseOperandIdx(VPNOTResult
, /*TRI=*/nullptr) == -1)
687 // Move the VPNOT right before Iter
688 LLVM_DEBUG(dbgs() << "Moving: "; VPNOT
.dump(); dbgs() << " Before: ";
690 MBB
.splice(Iter
, &MBB
, VPNOT
.getIterator());
691 // If we move the instr, and its operand was killed earlier, remove the kill
693 if (VPNOTOperandKiller
)
694 VPNOTOperandKiller
->setIsKill(false);
701 // This optimisation attempts to reduce the number of overlapping lifetimes of
702 // VCCR values by replacing uses of old VCCR values with VPNOTs. For example,
704 // %A:vccr = (something)
705 // %B:vccr = VPNOT %A
706 // %Foo = (some op that uses %B)
707 // %Bar = (some op that uses %A)
709 // %A:vccr = (something)
710 // %B:vccr = VPNOT %A
711 // %Foo = (some op that uses %B)
712 // %TMP2:vccr = VPNOT %B
713 // %Bar = (some op that uses %A)
714 bool MVETPAndVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock
&MBB
) {
715 MachineBasicBlock::iterator Iter
= MBB
.begin(), End
= MBB
.end();
716 SmallVector
<MachineInstr
*, 4> DeadInstructions
;
717 bool Modified
= false;
719 while (Iter
!= End
) {
720 Register VCCRValue
, OppositeVCCRValue
;
721 // The first loop looks for 2 unpredicated instructions:
722 // %A:vccr = (instr) ; A is stored in VCCRValue
723 // %B:vccr = VPNOT %A ; B is stored in OppositeVCCRValue
724 for (; Iter
!= End
; ++Iter
) {
725 // We're only interested in unpredicated instructions that write to VCCR.
726 if (!IsWritingToVCCR(*Iter
) ||
727 getVPTInstrPredicate(*Iter
) != ARMVCC::None
)
729 Register Dst
= Iter
->getOperand(0).getReg();
731 // If we already have a VCCRValue, and this is a VPNOT on VCCRValue, we've
732 // found what we were looking for.
733 if (VCCRValue
&& Iter
->getOpcode() == ARM::MVE_VPNOT
&&
734 Iter
->findRegisterUseOperandIdx(VCCRValue
, /*TRI=*/nullptr) != -1) {
735 // Move the VPNOT closer to its first user if needed, and ignore if it
737 if (!MoveVPNOTBeforeFirstUser(MBB
, Iter
, VCCRValue
))
740 OppositeVCCRValue
= Dst
;
745 // Else, just set VCCRValue.
749 // If the first inner loop didn't find anything, stop here.
753 assert(VCCRValue
&& OppositeVCCRValue
&&
754 "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop "
755 "stopped before the end of the block!");
756 assert(VCCRValue
!= OppositeVCCRValue
&&
757 "VCCRValue should not be equal to OppositeVCCRValue!");
759 // LastVPNOTResult always contains the same value as OppositeVCCRValue.
760 Register LastVPNOTResult
= OppositeVCCRValue
;
762 // This second loop tries to optimize the remaining instructions.
763 for (; Iter
!= End
; ++Iter
) {
764 bool IsInteresting
= false;
766 if (MachineOperand
*MO
=
767 Iter
->findRegisterUseOperand(VCCRValue
, /*TRI=*/nullptr)) {
768 IsInteresting
= true;
770 // - If the instruction is a VPNOT, it can be removed, and we can just
771 // replace its uses with LastVPNOTResult.
772 // - Else, insert a new VPNOT on LastVPNOTResult to recompute VCCRValue.
773 if (Iter
->getOpcode() == ARM::MVE_VPNOT
) {
774 Register Result
= Iter
->getOperand(0).getReg();
776 MRI
->replaceRegWith(Result
, LastVPNOTResult
);
777 DeadInstructions
.push_back(&*Iter
);
781 << "Replacing all uses of '" << printReg(Result
)
782 << "' with '" << printReg(LastVPNOTResult
) << "'\n");
784 MachineInstr
&VPNOT
=
785 ReplaceRegisterUseWithVPNOT(MBB
, *Iter
, *MO
, LastVPNOTResult
);
788 LastVPNOTResult
= VPNOT
.getOperand(0).getReg();
789 std::swap(VCCRValue
, OppositeVCCRValue
);
791 LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue
)
792 << "' with '" << printReg(LastVPNOTResult
)
793 << "' in instr: " << *Iter
);
796 // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult
797 // instead as they contain the same value.
798 if (MachineOperand
*MO
= Iter
->findRegisterUseOperand(
799 OppositeVCCRValue
, /*TRI=*/nullptr)) {
800 IsInteresting
= true;
802 // This is pointless if LastVPNOTResult == OppositeVCCRValue.
803 if (LastVPNOTResult
!= OppositeVCCRValue
) {
804 LLVM_DEBUG(dbgs() << "Replacing usage of '"
805 << printReg(OppositeVCCRValue
) << "' with '"
806 << printReg(LastVPNOTResult
) << " for instr: ";
808 MO
->setReg(LastVPNOTResult
);
812 MO
->setIsKill(false);
815 // If this is an unpredicated VPNOT on
816 // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it.
817 if (Iter
->getOpcode() == ARM::MVE_VPNOT
&&
818 getVPTInstrPredicate(*Iter
) == ARMVCC::None
) {
819 Register VPNOTOperand
= Iter
->getOperand(1).getReg();
820 if (VPNOTOperand
== LastVPNOTResult
||
821 VPNOTOperand
== OppositeVCCRValue
) {
822 IsInteresting
= true;
824 std::swap(VCCRValue
, OppositeVCCRValue
);
825 LastVPNOTResult
= Iter
->getOperand(0).getReg();
830 // If this instruction was not interesting, and it writes to VCCR, stop.
831 if (!IsInteresting
&& IsWritingToVCCR(*Iter
))
836 for (MachineInstr
*DeadInstruction
: DeadInstructions
)
837 DeadInstruction
->eraseFromParent();
842 // This optimisation replaces VCMPs with VPNOTs when they are equivalent.
843 bool MVETPAndVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock
&MBB
) {
844 SmallVector
<MachineInstr
*, 4> DeadInstructions
;
846 // The last VCMP that we have seen and that couldn't be replaced.
847 // This is reset when an instruction that writes to VCCR/VPR is found, or when
848 // a VCMP is replaced with a VPNOT.
849 // We'll only replace VCMPs with VPNOTs when this is not null, and when the
850 // current VCMP is the opposite of PrevVCMP.
851 MachineInstr
*PrevVCMP
= nullptr;
852 // If we find an instruction that kills the result of PrevVCMP, we save the
853 // operand here to remove the kill flag in case we need to use PrevVCMP's
855 MachineOperand
*PrevVCMPResultKiller
= nullptr;
857 for (MachineInstr
&Instr
: MBB
.instrs()) {
859 if (MachineOperand
*MO
=
860 Instr
.findRegisterUseOperand(PrevVCMP
->getOperand(0).getReg(),
861 /*TRI=*/nullptr, /*isKill*/ true)) {
862 // If we come accross the instr that kills PrevVCMP's result, record it
863 // so we can remove the kill flag later if we need to.
864 PrevVCMPResultKiller
= MO
;
868 // Ignore predicated instructions.
869 if (getVPTInstrPredicate(Instr
) != ARMVCC::None
)
872 // Only look at VCMPs
873 if (!IsVCMP(Instr
.getOpcode())) {
874 // If the instruction writes to VCCR, forget the previous VCMP.
875 if (IsWritingToVCCR(Instr
))
880 if (!PrevVCMP
|| !IsVPNOTEquivalent(Instr
, *PrevVCMP
)) {
885 // The register containing the result of the VCMP that we're going to
887 Register PrevVCMPResultReg
= PrevVCMP
->getOperand(0).getReg();
889 // Build a VPNOT to replace the VCMP, reusing its operands.
890 MachineInstrBuilder MIBuilder
=
891 BuildMI(MBB
, &Instr
, Instr
.getDebugLoc(), TII
->get(ARM::MVE_VPNOT
))
892 .add(Instr
.getOperand(0))
893 .addReg(PrevVCMPResultReg
);
894 addUnpredicatedMveVpredNOp(MIBuilder
);
895 LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): ";
896 MIBuilder
.getInstr()->dump(); dbgs() << " Removed VCMP: ";
899 // If we found an instruction that uses, and kills PrevVCMP's result,
900 // remove the kill flag.
901 if (PrevVCMPResultKiller
)
902 PrevVCMPResultKiller
->setIsKill(false);
904 // Finally, mark the old VCMP for removal and reset
905 // PrevVCMP/PrevVCMPResultKiller.
906 DeadInstructions
.push_back(&Instr
);
908 PrevVCMPResultKiller
= nullptr;
911 for (MachineInstr
*DeadInstruction
: DeadInstructions
)
912 DeadInstruction
->eraseFromParent();
914 return !DeadInstructions
.empty();
917 bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock
&MBB
,
918 MachineDominatorTree
*DT
) {
919 // Scan through the block, looking for instructions that use constants moves
920 // into VPR that are the negative of one another. These are expected to be
921 // COPY's to VCCRRegClass, from a t2MOVi or t2MOVi16. The last seen constant
922 // mask is kept it or and VPNOT's of it are added or reused as we scan through
924 unsigned LastVPTImm
= 0;
925 Register LastVPTReg
= 0;
926 SmallSet
<MachineInstr
*, 4> DeadInstructions
;
928 for (MachineInstr
&Instr
: MBB
.instrs()) {
929 // Look for predicated MVE instructions.
930 int PIdx
= llvm::findFirstVPTPredOperandIdx(Instr
);
933 Register VPR
= Instr
.getOperand(PIdx
+ 1).getReg();
934 if (!VPR
.isVirtual())
937 // From that we are looking for an instruction like %11:vccr = COPY %9:rgpr.
938 MachineInstr
*Copy
= MRI
->getVRegDef(VPR
);
939 if (!Copy
|| Copy
->getOpcode() != TargetOpcode::COPY
||
940 !Copy
->getOperand(1).getReg().isVirtual() ||
941 MRI
->getRegClass(Copy
->getOperand(1).getReg()) == &ARM::VCCRRegClass
) {
945 Register GPR
= Copy
->getOperand(1).getReg();
947 // Find the Immediate used by the copy.
948 auto getImm
= [&](Register GPR
) -> unsigned {
949 MachineInstr
*Def
= MRI
->getVRegDef(GPR
);
950 if (Def
&& (Def
->getOpcode() == ARM::t2MOVi
||
951 Def
->getOpcode() == ARM::t2MOVi16
))
952 return Def
->getOperand(1).getImm();
955 unsigned Imm
= getImm(GPR
);
961 unsigned NotImm
= ~Imm
& 0xffff;
962 if (LastVPTReg
!= 0 && LastVPTReg
!= VPR
&& LastVPTImm
== Imm
) {
963 MRI
->clearKillFlags(LastVPTReg
);
964 Instr
.getOperand(PIdx
+ 1).setReg(LastVPTReg
);
965 if (MRI
->use_empty(VPR
)) {
966 DeadInstructions
.insert(Copy
);
967 if (MRI
->hasOneUse(GPR
))
968 DeadInstructions
.insert(MRI
->getVRegDef(GPR
));
970 LLVM_DEBUG(dbgs() << "Reusing predicate: in " << Instr
);
972 } else if (LastVPTReg
!= 0 && LastVPTImm
== NotImm
) {
973 // We have found the not of a previous constant. Create a VPNot of the
974 // earlier predicate reg and use it instead of the copy.
975 Register NewVPR
= MRI
->createVirtualRegister(&ARM::VCCRRegClass
);
976 auto VPNot
= BuildMI(MBB
, &Instr
, Instr
.getDebugLoc(),
977 TII
->get(ARM::MVE_VPNOT
), NewVPR
)
979 addUnpredicatedMveVpredNOp(VPNot
);
981 // Use the new register and check if the def is now dead.
982 Instr
.getOperand(PIdx
+ 1).setReg(NewVPR
);
983 if (MRI
->use_empty(VPR
)) {
984 DeadInstructions
.insert(Copy
);
985 if (MRI
->hasOneUse(GPR
))
986 DeadInstructions
.insert(MRI
->getVRegDef(GPR
));
988 LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot
<< " to replace use at "
997 for (MachineInstr
*DI
: DeadInstructions
)
998 DI
->eraseFromParent();
1000 return !DeadInstructions
.empty();
1003 // Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a
1004 // somewhat blunt approximation to allow tail predicated with vpsel
1005 // instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly
1006 // different semantics under tail predication. Until that is modelled we just
1007 // convert to a VMOVT (via a predicated VORR) instead.
1008 bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock
&MBB
) {
1009 bool HasVCTP
= false;
1010 SmallVector
<MachineInstr
*, 4> DeadInstructions
;
1012 for (MachineInstr
&MI
: MBB
.instrs()) {
1018 if (!HasVCTP
|| MI
.getOpcode() != ARM::MVE_VPSEL
)
1021 MachineInstrBuilder MIBuilder
=
1022 BuildMI(MBB
, &MI
, MI
.getDebugLoc(), TII
->get(ARM::MVE_VORR
))
1023 .add(MI
.getOperand(0))
1024 .add(MI
.getOperand(1))
1025 .add(MI
.getOperand(1))
1026 .addImm(ARMVCC::Then
)
1027 .add(MI
.getOperand(4))
1028 .add(MI
.getOperand(5))
1029 .add(MI
.getOperand(2));
1030 // Silence unused variable warning in release builds.
1032 LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; MI
.dump();
1033 dbgs() << " with VMOVT: "; MIBuilder
.getInstr()->dump());
1034 DeadInstructions
.push_back(&MI
);
1037 for (MachineInstr
*DeadInstruction
: DeadInstructions
)
1038 DeadInstruction
->eraseFromParent();
1040 return !DeadInstructions
.empty();
1043 // Add a registry allocation hint for t2DoLoopStart to hint it towards LR, as
1044 // the instruction may be removable as a noop.
1045 bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock
&MBB
) {
1046 bool Changed
= false;
1047 for (MachineInstr
&MI
: MBB
.instrs()) {
1048 if (MI
.getOpcode() != ARM::t2DoLoopStart
)
1050 Register R
= MI
.getOperand(1).getReg();
1051 MachineFunction
*MF
= MI
.getParent()->getParent();
1052 MF
->getRegInfo().setRegAllocationHint(R
, ARMRI::RegLR
, 0);
1058 bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction
&Fn
) {
1059 const ARMSubtarget
&STI
= Fn
.getSubtarget
<ARMSubtarget
>();
1061 if (!STI
.isThumb2() || !STI
.hasLOB())
1064 TII
= static_cast<const Thumb2InstrInfo
*>(STI
.getInstrInfo());
1065 MRI
= &Fn
.getRegInfo();
1066 MachineLoopInfo
*MLI
= &getAnalysis
<MachineLoopInfoWrapperPass
>().getLI();
1067 MachineDominatorTree
*DT
=
1068 &getAnalysis
<MachineDominatorTreeWrapperPass
>().getDomTree();
1070 LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
1071 << "********** Function: " << Fn
.getName() << '\n');
1073 bool Modified
= false;
1074 for (MachineLoop
*ML
: MLI
->getLoopsInPreorder()) {
1075 Modified
|= LowerWhileLoopStart(ML
);
1076 Modified
|= MergeLoopEnd(ML
);
1077 Modified
|= ConvertTailPredLoop(ML
, DT
);
1080 for (MachineBasicBlock
&MBB
: Fn
) {
1081 Modified
|= HintDoLoopStartReg(MBB
);
1082 Modified
|= ReplaceConstByVPNOTs(MBB
, DT
);
1083 Modified
|= ReplaceVCMPsByVPNOTs(MBB
);
1084 Modified
|= ReduceOldVCCRValueUses(MBB
);
1085 Modified
|= ConvertVPSEL(MBB
);
1088 LLVM_DEBUG(dbgs() << "**************************************\n");
1092 /// createMVETPAndVPTOptimisationsPass
1093 FunctionPass
*llvm::createMVETPAndVPTOptimisationsPass() {
1094 return new MVETPAndVPTOptimisations();