[ARM] MVE compare vector splat combine
[llvm-complete.git] / lib / Target / ARM / ARMLowOverheadLoops.cpp
blobdd36702aa48c2e6f48dc8733669ced7d72185e1d
1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
10 /// instructions into machine operations.
11 /// The expectation is that the loop contains three pseudo instructions:
12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
13 /// form should be in the preheader, whereas the while form should be in the
14 /// preheaders only predecessor. TODO: Could DoLoopStart get moved into the
15 /// pre-preheader?
16 /// - t2LoopDec - placed within in the loop body.
17 /// - t2LoopEnd - the loop latch terminator.
18 ///
19 //===----------------------------------------------------------------------===//
21 #include "ARM.h"
22 #include "ARMBaseInstrInfo.h"
23 #include "ARMBaseRegisterInfo.h"
24 #include "ARMBasicBlockInfo.h"
25 #include "ARMSubtarget.h"
26 #include "llvm/CodeGen/MachineFunctionPass.h"
27 #include "llvm/CodeGen/MachineLoopInfo.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
30 using namespace llvm;
32 #define DEBUG_TYPE "arm-low-overhead-loops"
33 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
35 namespace {
37 class ARMLowOverheadLoops : public MachineFunctionPass {
38 const ARMBaseInstrInfo *TII = nullptr;
39 MachineRegisterInfo *MRI = nullptr;
40 std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
42 public:
43 static char ID;
45 ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
47 void getAnalysisUsage(AnalysisUsage &AU) const override {
48 AU.setPreservesCFG();
49 AU.addRequired<MachineLoopInfo>();
50 MachineFunctionPass::getAnalysisUsage(AU);
53 bool runOnMachineFunction(MachineFunction &MF) override;
55 bool ProcessLoop(MachineLoop *ML);
57 bool RevertNonLoops(MachineFunction &MF);
59 void RevertWhile(MachineInstr *MI) const;
61 void RevertLoopDec(MachineInstr *MI) const;
63 void RevertLoopEnd(MachineInstr *MI) const;
65 void Expand(MachineLoop *ML, MachineInstr *Start,
66 MachineInstr *Dec, MachineInstr *End, bool Revert);
68 MachineFunctionProperties getRequiredProperties() const override {
69 return MachineFunctionProperties().set(
70 MachineFunctionProperties::Property::NoVRegs);
73 StringRef getPassName() const override {
74 return ARM_LOW_OVERHEAD_LOOPS_NAME;
79 char ARMLowOverheadLoops::ID = 0;
81 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
82 false, false)
84 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) {
85 if (!static_cast<const ARMSubtarget&>(MF.getSubtarget()).hasLOB())
86 return false;
88 LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n");
90 auto &MLI = getAnalysis<MachineLoopInfo>();
91 MRI = &MF.getRegInfo();
92 TII = static_cast<const ARMBaseInstrInfo*>(
93 MF.getSubtarget().getInstrInfo());
94 BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF));
95 BBUtils->computeAllBlockSizes();
96 BBUtils->adjustBBOffsetsAfter(&MF.front());
98 bool Changed = false;
99 for (auto ML : MLI) {
100 if (!ML->getParentLoop())
101 Changed |= ProcessLoop(ML);
103 Changed |= RevertNonLoops(MF);
104 return Changed;
107 static bool IsLoopStart(MachineInstr &MI) {
108 return MI.getOpcode() == ARM::t2DoLoopStart ||
109 MI.getOpcode() == ARM::t2WhileLoopStart;
112 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
114 bool Changed = false;
116 // Process inner loops first.
117 for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
118 Changed |= ProcessLoop(*I);
120 LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML);
122 // Search the given block for a loop start instruction. If one isn't found,
123 // and there's only one predecessor block, search that one too.
124 std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
125 [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
126 for (auto &MI : *MBB) {
127 if (IsLoopStart(MI))
128 return &MI;
130 if (MBB->pred_size() == 1)
131 return SearchForStart(*MBB->pred_begin());
132 return nullptr;
135 MachineInstr *Start = nullptr;
136 MachineInstr *Dec = nullptr;
137 MachineInstr *End = nullptr;
138 bool Revert = false;
140 // Search the preheader for the start intrinsic, or look through the
141 // predecessors of the header to find exactly one set.iterations intrinsic.
142 // FIXME: I don't see why we shouldn't be supporting multiple predecessors
143 // with potentially multiple set.loop.iterations, so we need to enable this.
144 if (auto *Preheader = ML->getLoopPreheader()) {
145 Start = SearchForStart(Preheader);
146 } else {
147 LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n"
148 << " - Performing manual predecessor search.\n");
149 MachineBasicBlock *Pred = nullptr;
150 for (auto *MBB : ML->getHeader()->predecessors()) {
151 if (!ML->contains(MBB)) {
152 if (Pred) {
153 LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n");
154 Start = nullptr;
155 break;
157 Pred = MBB;
158 Start = SearchForStart(MBB);
163 // Find the low-overhead loop components and decide whether or not to fall
164 // back to a normal loop.
165 for (auto *MBB : reverse(ML->getBlocks())) {
166 for (auto &MI : *MBB) {
167 if (MI.getOpcode() == ARM::t2LoopDec)
168 Dec = &MI;
169 else if (MI.getOpcode() == ARM::t2LoopEnd)
170 End = &MI;
171 else if (IsLoopStart(MI))
172 Start = &MI;
173 else if (MI.getDesc().isCall())
174 // TODO: Though the call will require LE to execute again, does this
175 // mean we should revert? Always executing LE hopefully should be
176 // faster than performing a sub,cmp,br or even subs,br.
177 Revert = true;
179 if (!Dec)
180 continue;
182 // If we find that we load/store LR between LoopDec and LoopEnd, expect
183 // that the decremented value has been spilled to the stack. Because
184 // this value isn't actually going to be produced until the latch, by LE,
185 // we would need to generate a real sub. The value is also likely to be
186 // reloaded for use of LoopEnd - in which in case we'd need to perform
187 // an add because it gets negated again by LE! The other option is to
188 // then generate the other form of LE which doesn't perform the sub.
189 if (MI.mayLoad() || MI.mayStore())
190 Revert =
191 MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR;
194 if (Dec && End && Revert)
195 break;
198 LLVM_DEBUG(if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
199 if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
200 if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;);
202 if (!Start && !Dec && !End) {
203 LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n");
204 return Changed;
205 } else if (!(Start && Dec && End)) {
206 LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find all loop components.\n");
207 return false;
210 if (!End->getOperand(1).isMBB() ||
211 End->getOperand(1).getMBB() != ML->getHeader())
212 report_fatal_error("Expected LoopEnd to target Loop Header");
214 // The WLS and LE instructions have 12-bits for the label offset. WLS
215 // requires a positive offset, while LE uses negative.
216 if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) ||
217 !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) {
218 LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
219 Revert = true;
221 if (Start->getOpcode() == ARM::t2WhileLoopStart &&
222 (BBUtils->getOffsetOf(Start) >
223 BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
224 !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
225 LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
226 Revert = true;
229 Expand(ML, Start, Dec, End, Revert);
230 return true;
233 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
234 // beq that branches to the exit branch.
235 // FIXME: Need to check that we're not trashing the CPSR when generating the
236 // cmp. We could also try to generate a cbz if the value in LR is also in
237 // another low register.
238 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
239 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
240 MachineBasicBlock *MBB = MI->getParent();
241 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
242 TII->get(ARM::t2CMPri));
243 MIB.addReg(ARM::LR);
244 MIB.addImm(0);
245 MIB.addImm(ARMCC::AL);
246 MIB.addReg(ARM::CPSR);
248 // TODO: Try to use tBcc instead
249 MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc));
250 MIB.add(MI->getOperand(1)); // branch target
251 MIB.addImm(ARMCC::EQ); // condition code
252 MIB.addReg(ARM::CPSR);
253 MI->eraseFromParent();
256 // TODO: Check flags so that we can possibly generate a tSubs or tSub.
257 void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
258 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
259 MachineBasicBlock *MBB = MI->getParent();
260 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
261 TII->get(ARM::t2SUBri));
262 MIB.addDef(ARM::LR);
263 MIB.add(MI->getOperand(1));
264 MIB.add(MI->getOperand(2));
265 MIB.addImm(ARMCC::AL);
266 MIB.addReg(0);
267 MIB.addReg(0);
268 MI->eraseFromParent();
271 // Generate a subs, or sub and cmp, and a branch instead of an LE.
272 // FIXME: Need to check that we're not trashing the CPSR when generating
273 // the cmp.
274 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const {
275 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
277 // Create cmp
278 MachineBasicBlock *MBB = MI->getParent();
279 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
280 TII->get(ARM::t2CMPri));
281 MIB.addReg(ARM::LR);
282 MIB.addImm(0);
283 MIB.addImm(ARMCC::AL);
284 MIB.addReg(ARM::CPSR);
286 // TODO Try to use tBcc instead.
287 // Create bne
288 MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc));
289 MIB.add(MI->getOperand(1)); // branch target
290 MIB.addImm(ARMCC::NE); // condition code
291 MIB.addReg(ARM::CPSR);
292 MI->eraseFromParent();
295 void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
296 MachineInstr *Dec, MachineInstr *End,
297 bool Revert) {
299 auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) {
300 // The trip count should already been held in LR since the instructions
301 // within the loop can only read and write to LR. So, there should be a
302 // mov to setup the count. WLS/DLS perform this move, so find the original
303 // and delete it - inserting WLS/DLS in its place.
304 MachineBasicBlock *MBB = Start->getParent();
305 MachineInstr *InsertPt = Start;
306 for (auto &I : MRI->def_instructions(ARM::LR)) {
307 if (I.getParent() != MBB)
308 continue;
310 // Always execute.
311 if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL)
312 continue;
314 // Only handle move reg, if the trip count it will need moving into a reg
315 // before the setup instruction anyway.
316 if (!I.getDesc().isMoveReg() ||
317 !I.getOperand(1).isIdenticalTo(Start->getOperand(0)))
318 continue;
319 InsertPt = &I;
320 break;
323 unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ?
324 ARM::t2DLS : ARM::t2WLS;
325 MachineInstrBuilder MIB =
326 BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
328 MIB.addDef(ARM::LR);
329 MIB.add(Start->getOperand(0));
330 if (Opc == ARM::t2WLS)
331 MIB.add(Start->getOperand(1));
333 if (InsertPt != Start)
334 InsertPt->eraseFromParent();
335 Start->eraseFromParent();
336 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
337 return &*MIB;
340 // Combine the LoopDec and LoopEnd instructions into LE(TP).
341 auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec,
342 MachineInstr *End) {
343 MachineBasicBlock *MBB = End->getParent();
344 MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
345 TII->get(ARM::t2LEUpdate));
346 MIB.addDef(ARM::LR);
347 MIB.add(End->getOperand(0));
348 MIB.add(End->getOperand(1));
349 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
351 End->eraseFromParent();
352 Dec->eraseFromParent();
353 return &*MIB;
356 // TODO: We should be able to automatically remove these branches before we
357 // get here - probably by teaching analyzeBranch about the pseudo
358 // instructions.
359 // If there is an unconditional branch, after I, that just branches to the
360 // next block, remove it.
361 auto RemoveDeadBranch = [](MachineInstr *I) {
362 MachineBasicBlock *BB = I->getParent();
363 MachineInstr *Terminator = &BB->instr_back();
364 if (Terminator->isUnconditionalBranch() && I != Terminator) {
365 MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
366 if (BB->isLayoutSuccessor(Succ)) {
367 LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
368 Terminator->eraseFromParent();
373 if (Revert) {
374 if (Start->getOpcode() == ARM::t2WhileLoopStart)
375 RevertWhile(Start);
376 else
377 Start->eraseFromParent();
378 RevertLoopDec(Dec);
379 RevertLoopEnd(End);
380 } else {
381 Start = ExpandLoopStart(ML, Start);
382 RemoveDeadBranch(Start);
383 End = ExpandLoopEnd(ML, Dec, End);
384 RemoveDeadBranch(End);
388 bool ARMLowOverheadLoops::RevertNonLoops(MachineFunction &MF) {
389 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
390 bool Changed = false;
392 for (auto &MBB : MF) {
393 SmallVector<MachineInstr*, 4> Starts;
394 SmallVector<MachineInstr*, 4> Decs;
395 SmallVector<MachineInstr*, 4> Ends;
397 for (auto &I : MBB) {
398 if (IsLoopStart(I))
399 Starts.push_back(&I);
400 else if (I.getOpcode() == ARM::t2LoopDec)
401 Decs.push_back(&I);
402 else if (I.getOpcode() == ARM::t2LoopEnd)
403 Ends.push_back(&I);
406 if (Starts.empty() && Decs.empty() && Ends.empty())
407 continue;
409 Changed = true;
411 for (auto *Start : Starts) {
412 if (Start->getOpcode() == ARM::t2WhileLoopStart)
413 RevertWhile(Start);
414 else
415 Start->eraseFromParent();
417 for (auto *Dec : Decs)
418 RevertLoopDec(Dec);
420 for (auto *End : Ends)
421 RevertLoopEnd(End);
423 return Changed;
426 FunctionPass *llvm::createARMLowOverheadLoopsPass() {
427 return new ARMLowOverheadLoops();