[X86][BMI] Pull out schedule classes from bmi_andn<> and bmi_bls<>
[llvm-core.git] / lib / Target / ARM / ARMLowOverheadLoops.cpp
blobe1c5a9c3e223b2f3a9591eb4bf84a7d243b3c712
1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
10 /// instructions into machine operations.
11 /// The expectation is that the loop contains three pseudo instructions:
12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
13 /// form should be in the preheader, whereas the while form should be in the
14 /// preheaders only predecessor.
15 /// - t2LoopDec - placed within in the loop body.
16 /// - t2LoopEnd - the loop latch terminator.
17 ///
18 //===----------------------------------------------------------------------===//
20 #include "ARM.h"
21 #include "ARMBaseInstrInfo.h"
22 #include "ARMBaseRegisterInfo.h"
23 #include "ARMBasicBlockInfo.h"
24 #include "ARMSubtarget.h"
25 #include "llvm/CodeGen/MachineFunctionPass.h"
26 #include "llvm/CodeGen/MachineLoopInfo.h"
27 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 using namespace llvm;
31 #define DEBUG_TYPE "arm-low-overhead-loops"
32 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
34 namespace {
36 class ARMLowOverheadLoops : public MachineFunctionPass {
37 MachineFunction *MF = nullptr;
38 const ARMBaseInstrInfo *TII = nullptr;
39 MachineRegisterInfo *MRI = nullptr;
40 std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
42 public:
43 static char ID;
45 ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
47 void getAnalysisUsage(AnalysisUsage &AU) const override {
48 AU.setPreservesCFG();
49 AU.addRequired<MachineLoopInfo>();
50 MachineFunctionPass::getAnalysisUsage(AU);
53 bool runOnMachineFunction(MachineFunction &MF) override;
55 MachineFunctionProperties getRequiredProperties() const override {
56 return MachineFunctionProperties().set(
57 MachineFunctionProperties::Property::NoVRegs);
60 StringRef getPassName() const override {
61 return ARM_LOW_OVERHEAD_LOOPS_NAME;
64 private:
65 bool ProcessLoop(MachineLoop *ML);
67 MachineInstr * IsSafeToDefineLR(MachineInstr *MI);
69 bool RevertNonLoops();
71 void RevertWhile(MachineInstr *MI) const;
73 bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const;
75 void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
77 void Expand(MachineLoop *ML, MachineInstr *Start,
78 MachineInstr *InsertPt, MachineInstr *Dec,
79 MachineInstr *End, bool Revert);
84 char ARMLowOverheadLoops::ID = 0;
86 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
87 false, false)
89 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
90 const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget());
91 if (!ST.hasLOB())
92 return false;
94 MF = &mf;
95 LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
97 auto &MLI = getAnalysis<MachineLoopInfo>();
98 MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
99 MRI = &MF->getRegInfo();
100 TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
101 BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
102 BBUtils->computeAllBlockSizes();
103 BBUtils->adjustBBOffsetsAfter(&MF->front());
105 bool Changed = false;
106 for (auto ML : MLI) {
107 if (!ML->getParentLoop())
108 Changed |= ProcessLoop(ML);
110 Changed |= RevertNonLoops();
111 return Changed;
114 static bool IsLoopStart(MachineInstr &MI) {
115 return MI.getOpcode() == ARM::t2DoLoopStart ||
116 MI.getOpcode() == ARM::t2WhileLoopStart;
119 template<typename T>
120 static MachineInstr* SearchForDef(MachineInstr *Begin, T End, unsigned Reg) {
121 for(auto &MI : make_range(T(Begin), End)) {
122 for (auto &MO : MI.operands()) {
123 if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg)
124 continue;
125 return &MI;
128 return nullptr;
131 static MachineInstr* SearchForUse(MachineInstr *Begin,
132 MachineBasicBlock::iterator End,
133 unsigned Reg) {
134 for(auto &MI : make_range(MachineBasicBlock::iterator(Begin), End)) {
135 for (auto &MO : MI.operands()) {
136 if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg)
137 continue;
138 return &MI;
141 return nullptr;
144 // Is it safe to define LR with DLS/WLS?
145 // LR can defined if it is the operand to start, because it's the same value,
146 // or if it's going to be equivalent to the operand to Start.
147 MachineInstr *ARMLowOverheadLoops::IsSafeToDefineLR(MachineInstr *Start) {
149 auto IsMoveLR = [](MachineInstr *MI, unsigned Reg) {
150 return MI->getOpcode() == ARM::tMOVr &&
151 MI->getOperand(0).getReg() == ARM::LR &&
152 MI->getOperand(1).getReg() == Reg &&
153 MI->getOperand(2).getImm() == ARMCC::AL;
156 MachineBasicBlock *MBB = Start->getParent();
157 unsigned CountReg = Start->getOperand(0).getReg();
158 // Walk forward and backward in the block to find the closest instructions
159 // that define LR. Then also filter them out if they're not a mov lr.
160 MachineInstr *PredLRDef = SearchForDef(Start, MBB->rend(), ARM::LR);
161 if (PredLRDef && !IsMoveLR(PredLRDef, CountReg))
162 PredLRDef = nullptr;
164 MachineInstr *SuccLRDef = SearchForDef(Start, MBB->end(), ARM::LR);
165 if (SuccLRDef && !IsMoveLR(SuccLRDef, CountReg))
166 SuccLRDef = nullptr;
168 // We've either found one, two or none mov lr instructions... Now figure out
169 // if they are performing the equilvant mov that the Start instruction will.
170 // Do this by scanning forward and backward to see if there's a def of the
171 // register holding the count value. If we find a suitable def, return it as
172 // the insert point. Later, if InsertPt != Start, then we can remove the
173 // redundant instruction.
174 if (SuccLRDef) {
175 MachineBasicBlock::iterator End(SuccLRDef);
176 if (!SearchForDef(Start, End, CountReg)) {
177 return SuccLRDef;
178 } else
179 SuccLRDef = nullptr;
181 if (PredLRDef) {
182 MachineBasicBlock::reverse_iterator End(PredLRDef);
183 if (!SearchForDef(Start, End, CountReg)) {
184 return PredLRDef;
185 } else
186 PredLRDef = nullptr;
189 // We can define LR because LR already contains the same value.
190 if (Start->getOperand(0).getReg() == ARM::LR)
191 return Start;
193 // We've found no suitable LR def and Start doesn't use LR directly. Can we
194 // just define LR anyway?
195 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
196 LivePhysRegs LiveRegs(*TRI);
197 LiveRegs.addLiveOuts(*MBB);
199 // Not if we've haven't found a suitable mov and LR is live out.
200 if (LiveRegs.contains(ARM::LR))
201 return nullptr;
203 // If LR is not live out, we can insert the instruction if nothing else
204 // uses LR after it.
205 if (!SearchForUse(Start, MBB->end(), ARM::LR))
206 return Start;
208 LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find suitable insertion point for"
209 << " LR\n");
210 return nullptr;
213 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
215 bool Changed = false;
217 // Process inner loops first.
218 for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
219 Changed |= ProcessLoop(*I);
221 LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML);
223 // Search the given block for a loop start instruction. If one isn't found,
224 // and there's only one predecessor block, search that one too.
225 std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
226 [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
227 for (auto &MI : *MBB) {
228 if (IsLoopStart(MI))
229 return &MI;
231 if (MBB->pred_size() == 1)
232 return SearchForStart(*MBB->pred_begin());
233 return nullptr;
236 MachineInstr *Start = nullptr;
237 MachineInstr *Dec = nullptr;
238 MachineInstr *End = nullptr;
239 bool Revert = false;
241 // Search the preheader for the start intrinsic, or look through the
242 // predecessors of the header to find exactly one set.iterations intrinsic.
243 // FIXME: I don't see why we shouldn't be supporting multiple predecessors
244 // with potentially multiple set.loop.iterations, so we need to enable this.
245 if (auto *Preheader = ML->getLoopPreheader()) {
246 Start = SearchForStart(Preheader);
247 } else {
248 LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n"
249 << " - Performing manual predecessor search.\n");
250 MachineBasicBlock *Pred = nullptr;
251 for (auto *MBB : ML->getHeader()->predecessors()) {
252 if (!ML->contains(MBB)) {
253 if (Pred) {
254 LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n");
255 Start = nullptr;
256 break;
258 Pred = MBB;
259 Start = SearchForStart(MBB);
264 // Find the low-overhead loop components and decide whether or not to fall
265 // back to a normal loop.
266 for (auto *MBB : reverse(ML->getBlocks())) {
267 for (auto &MI : *MBB) {
268 if (MI.getOpcode() == ARM::t2LoopDec)
269 Dec = &MI;
270 else if (MI.getOpcode() == ARM::t2LoopEnd)
271 End = &MI;
272 else if (IsLoopStart(MI))
273 Start = &MI;
274 else if (MI.getDesc().isCall()) {
275 // TODO: Though the call will require LE to execute again, does this
276 // mean we should revert? Always executing LE hopefully should be
277 // faster than performing a sub,cmp,br or even subs,br.
278 Revert = true;
279 LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
282 if (!Dec || End)
283 continue;
285 // If we find that LR has been written or read between LoopDec and
286 // LoopEnd, expect that the decremented value is being used else where.
287 // Because this value isn't actually going to be produced until the
288 // latch, by LE, we would need to generate a real sub. The value is also
289 // likely to be copied/reloaded for use of LoopEnd - in which in case
290 // we'd need to perform an add because it gets subtracted again by LE!
291 // The other option is to then generate the other form of LE which doesn't
292 // perform the sub.
293 for (auto &MO : MI.operands()) {
294 if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() &&
295 MO.getReg() == ARM::LR) {
296 LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI);
297 Revert = true;
298 break;
303 if (Dec && End && Revert)
304 break;
307 LLVM_DEBUG(if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
308 if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
309 if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;);
311 if (!Start && !Dec && !End) {
312 LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n");
313 return Changed;
314 } else if (!(Start && Dec && End)) {
315 LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find all loop components.\n");
316 return false;
319 if (!End->getOperand(1).isMBB())
320 report_fatal_error("Expected LoopEnd to target basic block");
322 // TODO Maybe there's cases where the target doesn't have to be the header,
323 // but for now be safe and revert.
324 if (End->getOperand(1).getMBB() != ML->getHeader()) {
325 LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n");
326 Revert = true;
329 // The WLS and LE instructions have 12-bits for the label offset. WLS
330 // requires a positive offset, while LE uses negative.
331 if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) ||
332 !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) {
333 LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
334 Revert = true;
336 if (Start->getOpcode() == ARM::t2WhileLoopStart &&
337 (BBUtils->getOffsetOf(Start) >
338 BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
339 !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
340 LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
341 Revert = true;
344 MachineInstr *InsertPt = Revert ? nullptr : IsSafeToDefineLR(Start);
345 if (!InsertPt) {
346 LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
347 Revert = true;
348 } else
349 LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);
351 Expand(ML, Start, InsertPt, Dec, End, Revert);
352 return true;
355 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
356 // beq that branches to the exit branch.
357 // TODO: We could also try to generate a cbz if the value in LR is also in
358 // another low register.
359 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
360 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
361 MachineBasicBlock *MBB = MI->getParent();
362 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
363 TII->get(ARM::t2CMPri));
364 MIB.add(MI->getOperand(0));
365 MIB.addImm(0);
366 MIB.addImm(ARMCC::AL);
367 MIB.addReg(ARM::NoRegister);
369 MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
370 unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
371 ARM::tBcc : ARM::t2Bcc;
373 MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
374 MIB.add(MI->getOperand(1)); // branch target
375 MIB.addImm(ARMCC::EQ); // condition code
376 MIB.addReg(ARM::CPSR);
377 MI->eraseFromParent();
380 bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI,
381 bool AllowFlags) const {
382 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
383 MachineBasicBlock *MBB = MI->getParent();
385 // If nothing uses or defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
386 bool SetFlags = false;
387 if (AllowFlags) {
388 if (auto *Def = SearchForDef(MI, MBB->end(), ARM::CPSR)) {
389 if (!SearchForUse(MI, MBB->end(), ARM::CPSR) &&
390 Def->getOpcode() == ARM::t2LoopEnd)
391 SetFlags = true;
395 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
396 TII->get(ARM::t2SUBri));
397 MIB.addDef(ARM::LR);
398 MIB.add(MI->getOperand(1));
399 MIB.add(MI->getOperand(2));
400 MIB.addImm(ARMCC::AL);
401 MIB.addReg(0);
403 if (SetFlags) {
404 MIB.addReg(ARM::CPSR);
405 MIB->getOperand(5).setIsDef(true);
406 } else
407 MIB.addReg(0);
409 MI->eraseFromParent();
410 return SetFlags;
413 // Generate a subs, or sub and cmp, and a branch instead of an LE.
414 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
415 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
417 MachineBasicBlock *MBB = MI->getParent();
418 // Create cmp
419 if (!SkipCmp) {
420 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
421 TII->get(ARM::t2CMPri));
422 MIB.addReg(ARM::LR);
423 MIB.addImm(0);
424 MIB.addImm(ARMCC::AL);
425 MIB.addReg(ARM::NoRegister);
428 MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
429 unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
430 ARM::tBcc : ARM::t2Bcc;
432 // Create bne
433 MachineInstrBuilder MIB =
434 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
435 MIB.add(MI->getOperand(1)); // branch target
436 MIB.addImm(ARMCC::NE); // condition code
437 MIB.addReg(ARM::CPSR);
438 MI->eraseFromParent();
441 void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
442 MachineInstr *InsertPt,
443 MachineInstr *Dec, MachineInstr *End,
444 bool Revert) {
446 auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start,
447 MachineInstr *InsertPt) {
448 MachineBasicBlock *MBB = InsertPt->getParent();
449 unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ?
450 ARM::t2DLS : ARM::t2WLS;
451 MachineInstrBuilder MIB =
452 BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
454 MIB.addDef(ARM::LR);
455 MIB.add(Start->getOperand(0));
456 if (Opc == ARM::t2WLS)
457 MIB.add(Start->getOperand(1));
459 if (InsertPt != Start)
460 InsertPt->eraseFromParent();
461 Start->eraseFromParent();
462 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
463 return &*MIB;
466 // Combine the LoopDec and LoopEnd instructions into LE(TP).
467 auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec,
468 MachineInstr *End) {
469 MachineBasicBlock *MBB = End->getParent();
470 MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
471 TII->get(ARM::t2LEUpdate));
472 MIB.addDef(ARM::LR);
473 MIB.add(End->getOperand(0));
474 MIB.add(End->getOperand(1));
475 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
477 End->eraseFromParent();
478 Dec->eraseFromParent();
479 return &*MIB;
482 // TODO: We should be able to automatically remove these branches before we
483 // get here - probably by teaching analyzeBranch about the pseudo
484 // instructions.
485 // If there is an unconditional branch, after I, that just branches to the
486 // next block, remove it.
487 auto RemoveDeadBranch = [](MachineInstr *I) {
488 MachineBasicBlock *BB = I->getParent();
489 MachineInstr *Terminator = &BB->instr_back();
490 if (Terminator->isUnconditionalBranch() && I != Terminator) {
491 MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
492 if (BB->isLayoutSuccessor(Succ)) {
493 LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
494 Terminator->eraseFromParent();
499 if (Revert) {
500 if (Start->getOpcode() == ARM::t2WhileLoopStart)
501 RevertWhile(Start);
502 else
503 Start->eraseFromParent();
504 bool FlagsAlreadySet = RevertLoopDec(Dec, true);
505 RevertLoopEnd(End, FlagsAlreadySet);
506 } else {
507 Start = ExpandLoopStart(ML, Start, InsertPt);
508 RemoveDeadBranch(Start);
509 End = ExpandLoopEnd(ML, Dec, End);
510 RemoveDeadBranch(End);
514 bool ARMLowOverheadLoops::RevertNonLoops() {
515 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
516 bool Changed = false;
518 for (auto &MBB : *MF) {
519 SmallVector<MachineInstr*, 4> Starts;
520 SmallVector<MachineInstr*, 4> Decs;
521 SmallVector<MachineInstr*, 4> Ends;
523 for (auto &I : MBB) {
524 if (IsLoopStart(I))
525 Starts.push_back(&I);
526 else if (I.getOpcode() == ARM::t2LoopDec)
527 Decs.push_back(&I);
528 else if (I.getOpcode() == ARM::t2LoopEnd)
529 Ends.push_back(&I);
532 if (Starts.empty() && Decs.empty() && Ends.empty())
533 continue;
535 Changed = true;
537 for (auto *Start : Starts) {
538 if (Start->getOpcode() == ARM::t2WhileLoopStart)
539 RevertWhile(Start);
540 else
541 Start->eraseFromParent();
543 for (auto *Dec : Decs)
544 RevertLoopDec(Dec);
546 for (auto *End : Ends)
547 RevertLoopEnd(End);
549 return Changed;
552 FunctionPass *llvm::createARMLowOverheadLoopsPass() {
553 return new ARMLowOverheadLoops();