1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
11 //===----------------------------------------------------------------------===//
13 #include "AArch64InstrInfo.h"
14 #include "AArch64MachineFunctionInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "MCTargetDesc/AArch64AddressingModes.h"
17 #include "Utils/AArch64BaseInfo.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/CodeGen/MachineFunction.h"
24 #include "llvm/CodeGen/MachineInstr.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineMemOperand.h"
27 #include "llvm/CodeGen/MachineOperand.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 #include "llvm/CodeGen/MachineModuleInfo.h"
30 #include "llvm/CodeGen/StackMaps.h"
31 #include "llvm/CodeGen/TargetRegisterInfo.h"
32 #include "llvm/CodeGen/TargetSubtargetInfo.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/GlobalValue.h"
35 #include "llvm/MC/MCInst.h"
36 #include "llvm/MC/MCInstrDesc.h"
37 #include "llvm/Support/Casting.h"
38 #include "llvm/Support/CodeGen.h"
39 #include "llvm/Support/CommandLine.h"
40 #include "llvm/Support/Compiler.h"
41 #include "llvm/Support/ErrorHandling.h"
42 #include "llvm/Support/MathExtras.h"
43 #include "llvm/Target/TargetMachine.h"
44 #include "llvm/Target/TargetOptions.h"
45 #include <cassert>
46 #include <cstdint>
47 #include <iterator>
48 #include <utility>
50 using namespace llvm;
52 #define GET_INSTRINFO_CTOR_DTOR
53 #include "AArch64GenInstrInfo.inc"
55 static cl::opt<unsigned> TBZDisplacementBits(
56 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
57 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
59 static cl::opt<unsigned> CBZDisplacementBits(
60 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
61 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
63 static cl::opt<unsigned>
64 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
65 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
67 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
68 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
69 AArch64::CATCHRET),
70 RI(STI.getTargetTriple()), Subtarget(STI) {}
72 /// GetInstSize - Return the number of bytes of code the specified
73 /// instruction may be. This returns the maximum number of bytes.
74 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
75 const MachineBasicBlock &MBB = *MI.getParent();
76 const MachineFunction *MF = MBB.getParent();
77 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
79 if (MI.getOpcode() == AArch64::INLINEASM)
80 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
82 // FIXME: We currently only handle pseudoinstructions that don't get expanded
83 // before the assembly printer.
84 unsigned NumBytes = 0;
85 const MCInstrDesc &Desc = MI.getDesc();
86 switch (Desc.getOpcode()) {
87 default:
88 // Anything not explicitly designated otherwise is a normal 4-byte insn.
89 NumBytes = 4;
90 break;
91 case TargetOpcode::DBG_VALUE:
92 case TargetOpcode::EH_LABEL:
93 case TargetOpcode::IMPLICIT_DEF:
94 case TargetOpcode::KILL:
95 NumBytes = 0;
96 break;
97 case TargetOpcode::STACKMAP:
98 // The upper bound for a stackmap intrinsic is the full length of its shadow
99 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
100 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
101 break;
102 case TargetOpcode::PATCHPOINT:
103 // The size of the patchpoint intrinsic is the number of bytes requested
104 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
105 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
106 break;
107 case AArch64::TLSDESC_CALLSEQ:
108 // This gets lowered to an instruction sequence which takes 16 bytes
109 NumBytes = 16;
110 break;
111 case AArch64::JumpTableDest32:
112 case AArch64::JumpTableDest16:
113 case AArch64::JumpTableDest8:
114 NumBytes = 12;
115 break;
116 case AArch64::SPACE:
117 NumBytes = MI.getOperand(1).getImm();
118 break;
121 return NumBytes;
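// Illustrative note (sketch, not exhaustive): a STACKMAP asking for 8 shadow
// bytes reports 8 here, typically emitted as two 4-byte NOPs of padding,
// which is why the STACKMAP/PATCHPOINT asserts above require a multiple of 4.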
124 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
125 SmallVectorImpl<MachineOperand> &Cond) {
126 // Block ends with fall-through condbranch.
127 switch (LastInst->getOpcode()) {
128 default:
129 llvm_unreachable("Unknown branch instruction?");
130 case AArch64::Bcc:
131 Target = LastInst->getOperand(1).getMBB();
132 Cond.push_back(LastInst->getOperand(0));
133 break;
134 case AArch64::CBZW:
135 case AArch64::CBZX:
136 case AArch64::CBNZW:
137 case AArch64::CBNZX:
138 Target = LastInst->getOperand(1).getMBB();
139 Cond.push_back(MachineOperand::CreateImm(-1));
140 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
141 Cond.push_back(LastInst->getOperand(0));
142 break;
143 case AArch64::TBZW:
144 case AArch64::TBZX:
145 case AArch64::TBNZW:
146 case AArch64::TBNZX:
147 Target = LastInst->getOperand(2).getMBB();
148 Cond.push_back(MachineOperand::CreateImm(-1));
149 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
150 Cond.push_back(LastInst->getOperand(0));
151 Cond.push_back(LastInst->getOperand(1));
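// Illustrative summary: parseCondBranch leaves Cond in one of three shapes,
// which the rest of this file distinguishes by Cond.size():
//   {cc}                    for Bcc
//   {-1, opcode, reg}       for CB[N]Z{W,X}
//   {-1, opcode, reg, bit}  for TB[N]Z{W,X}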
155 static unsigned getBranchDisplacementBits(unsigned Opc) {
156 switch (Opc) {
157 default:
158 llvm_unreachable("unexpected opcode!");
159 case AArch64::B:
160 return 64;
161 case AArch64::TBNZW:
162 case AArch64::TBZW:
163 case AArch64::TBNZX:
164 case AArch64::TBZX:
165 return TBZDisplacementBits;
166 case AArch64::CBNZW:
167 case AArch64::CBZW:
168 case AArch64::CBNZX:
169 case AArch64::CBZX:
170 return CBZDisplacementBits;
171 case AArch64::Bcc:
172 return BCCDisplacementBits;
176 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
177 int64_t BrOffset) const {
178 unsigned Bits = getBranchDisplacementBits(BranchOp);
179 assert(Bits >= 3 && "max branch displacement must be enough to jump "
180 "over conditional branch expansion");
181 return isIntN(Bits, BrOffset / 4);
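// With the default option values above, this gives TB[N]Z a reach of roughly
// +/-32 KiB (14 bits of 4-byte units) and CB[N]Z/Bcc roughly +/-1 MiB
// (19 bits of 4-byte units); the exact limits come from isIntN() on the
// scaled offset.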
184 MachineBasicBlock *
185 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
186 switch (MI.getOpcode()) {
187 default:
188 llvm_unreachable("unexpected opcode!");
189 case AArch64::B:
190 return MI.getOperand(0).getMBB();
191 case AArch64::TBZW:
192 case AArch64::TBNZW:
193 case AArch64::TBZX:
194 case AArch64::TBNZX:
195 return MI.getOperand(2).getMBB();
196 case AArch64::CBZW:
197 case AArch64::CBNZW:
198 case AArch64::CBZX:
199 case AArch64::CBNZX:
200 case AArch64::Bcc:
201 return MI.getOperand(1).getMBB();
205 // Branch analysis.
206 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
207 MachineBasicBlock *&TBB,
208 MachineBasicBlock *&FBB,
209 SmallVectorImpl<MachineOperand> &Cond,
210 bool AllowModify) const {
211 // If the block has no terminators, it just falls into the block after it.
212 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
213 if (I == MBB.end())
214 return false;
216 if (!isUnpredicatedTerminator(*I))
217 return false;
219 // Get the last instruction in the block.
220 MachineInstr *LastInst = &*I;
222 // If there is only one terminator instruction, process it.
223 unsigned LastOpc = LastInst->getOpcode();
224 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
225 if (isUncondBranchOpcode(LastOpc)) {
226 TBB = LastInst->getOperand(0).getMBB();
227 return false;
229 if (isCondBranchOpcode(LastOpc)) {
230 // Block ends with fall-through condbranch.
231 parseCondBranch(LastInst, TBB, Cond);
232 return false;
234 return true; // Can't handle indirect branch.
237 // Get the instruction before it if it is a terminator.
238 MachineInstr *SecondLastInst = &*I;
239 unsigned SecondLastOpc = SecondLastInst->getOpcode();
241 // If AllowModify is true and the block ends with two or more unconditional
242 // branches, delete all but the first unconditional branch.
243 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
244 while (isUncondBranchOpcode(SecondLastOpc)) {
245 LastInst->eraseFromParent();
246 LastInst = SecondLastInst;
247 LastOpc = LastInst->getOpcode();
248 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
249 // Return now; the only terminator is an unconditional branch.
250 TBB = LastInst->getOperand(0).getMBB();
251 return false;
252 } else {
253 SecondLastInst = &*I;
254 SecondLastOpc = SecondLastInst->getOpcode();
259 // If there are three terminators, we don't know what sort of block this is.
260 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
261 return true;
263 // If the block ends with a B and a Bcc, handle it.
264 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
265 parseCondBranch(SecondLastInst, TBB, Cond);
266 FBB = LastInst->getOperand(0).getMBB();
267 return false;
270 // If the block ends with two unconditional branches, handle it. The second
271 // one is not executed, so remove it.
272 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
273 TBB = SecondLastInst->getOperand(0).getMBB();
274 I = LastInst;
275 if (AllowModify)
276 I->eraseFromParent();
277 return false;
280 // ...likewise if it ends with an indirect branch followed by an unconditional
281 // branch.
282 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
283 I = LastInst;
284 if (AllowModify)
285 I->eraseFromParent();
286 return true;
289 // Otherwise, can't handle this.
290 return true;
293 bool AArch64InstrInfo::reverseBranchCondition(
294 SmallVectorImpl<MachineOperand> &Cond) const {
295 if (Cond[0].getImm() != -1) {
296 // Regular Bcc
297 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
298 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
299 } else {
300 // Folded compare-and-branch
301 switch (Cond[1].getImm()) {
302 default:
303 llvm_unreachable("Unknown conditional branch!");
304 case AArch64::CBZW:
305 Cond[1].setImm(AArch64::CBNZW);
306 break;
307 case AArch64::CBNZW:
308 Cond[1].setImm(AArch64::CBZW);
309 break;
310 case AArch64::CBZX:
311 Cond[1].setImm(AArch64::CBNZX);
312 break;
313 case AArch64::CBNZX:
314 Cond[1].setImm(AArch64::CBZX);
315 break;
316 case AArch64::TBZW:
317 Cond[1].setImm(AArch64::TBNZW);
318 break;
319 case AArch64::TBNZW:
320 Cond[1].setImm(AArch64::TBZW);
321 break;
322 case AArch64::TBZX:
323 Cond[1].setImm(AArch64::TBNZX);
324 break;
325 case AArch64::TBNZX:
326 Cond[1].setImm(AArch64::TBZX);
327 break;
331 return false;
334 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
335 int *BytesRemoved) const {
336 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
337 if (I == MBB.end())
338 return 0;
340 if (!isUncondBranchOpcode(I->getOpcode()) &&
341 !isCondBranchOpcode(I->getOpcode()))
342 return 0;
344 // Remove the branch.
345 I->eraseFromParent();
347 I = MBB.end();
349 if (I == MBB.begin()) {
350 if (BytesRemoved)
351 *BytesRemoved = 4;
352 return 1;
354 --I;
355 if (!isCondBranchOpcode(I->getOpcode())) {
356 if (BytesRemoved)
357 *BytesRemoved = 4;
358 return 1;
361 // Remove the branch.
362 I->eraseFromParent();
363 if (BytesRemoved)
364 *BytesRemoved = 8;
366 return 2;
369 void AArch64InstrInfo::instantiateCondBranch(
370 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
371 ArrayRef<MachineOperand> Cond) const {
372 if (Cond[0].getImm() != -1) {
373 // Regular Bcc
374 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
375 } else {
376 // Folded compare-and-branch
377 // Note that we use add() rather than addReg() to preserve the register operand flags.
378 const MachineInstrBuilder MIB =
379 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
380 if (Cond.size() > 3)
381 MIB.addImm(Cond[3].getImm());
382 MIB.addMBB(TBB);
386 unsigned AArch64InstrInfo::insertBranch(
387 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
388 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
389 // Shouldn't be a fall through.
390 assert(TBB && "insertBranch must not be told to insert a fallthrough");
392 if (!FBB) {
393 if (Cond.empty()) // Unconditional branch?
394 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
395 else
396 instantiateCondBranch(MBB, DL, TBB, Cond);
398 if (BytesAdded)
399 *BytesAdded = 4;
401 return 1;
404 // Two-way conditional branch.
405 instantiateCondBranch(MBB, DL, TBB, Cond);
406 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
408 if (BytesAdded)
409 *BytesAdded = 8;
411 return 2;
414 // Find the original register that VReg is copied from.
415 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
416 while (TargetRegisterInfo::isVirtualRegister(VReg)) {
417 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
418 if (!DefMI->isFullCopy())
419 return VReg;
420 VReg = DefMI->getOperand(1).getReg();
422 return VReg;
425 // Determine if VReg is defined by an instruction that can be folded into a
426 // csel instruction. If so, return the folded opcode, and the replacement
427 // register.
428 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
429 unsigned *NewVReg = nullptr) {
430 VReg = removeCopies(MRI, VReg);
431 if (!TargetRegisterInfo::isVirtualRegister(VReg))
432 return 0;
434 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
435 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
436 unsigned Opc = 0;
437 unsigned SrcOpNum = 0;
438 switch (DefMI->getOpcode()) {
439 case AArch64::ADDSXri:
440 case AArch64::ADDSWri:
441 // if NZCV is used, do not fold.
442 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
443 return 0;
444 // fall-through to ADDXri and ADDWri.
445 LLVM_FALLTHROUGH;
446 case AArch64::ADDXri:
447 case AArch64::ADDWri:
448 // add x, 1 -> csinc.
449 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
450 DefMI->getOperand(3).getImm() != 0)
451 return 0;
452 SrcOpNum = 1;
453 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
454 break;
456 case AArch64::ORNXrr:
457 case AArch64::ORNWrr: {
458 // not x -> csinv, represented as orn dst, xzr, src.
459 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
460 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
461 return 0;
462 SrcOpNum = 2;
463 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
464 break;
467 case AArch64::SUBSXrr:
468 case AArch64::SUBSWrr:
469 // if NZCV is used, do not fold.
470 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
471 return 0;
472 // fall-through to SUBXrr and SUBWrr.
473 LLVM_FALLTHROUGH;
474 case AArch64::SUBXrr:
475 case AArch64::SUBWrr: {
476 // neg x -> csneg, represented as sub dst, xzr, src.
477 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
478 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
479 return 0;
480 SrcOpNum = 2;
481 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
482 break;
484 default:
485 return 0;
487 assert(Opc && SrcOpNum && "Missing parameters");
489 if (NewVReg)
490 *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
491 return Opc;
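// Illustrative sketches (simplified) of the folds recognized above, for the
// case where the folded value feeds the "true" operand of a csel:
//   %t = ADDWri %a, 1, 0  + csel %t, %f, cc  -->  CSINCWr %f, %a, !cc
//   %t = ORNWrr wzr, %a   + csel %t, %f, cc  -->  CSINVWr %f, %a, !cc
//   %t = SUBWrr wzr, %a   + csel %t, %f, cc  -->  CSNEGWr %f, %a, !cc
// The caller (insertSelect below) inverts the condition in that case so the
// folded operation lands on the csel's second (false) operand.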
494 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
495 ArrayRef<MachineOperand> Cond,
496 unsigned TrueReg, unsigned FalseReg,
497 int &CondCycles, int &TrueCycles,
498 int &FalseCycles) const {
499 // Check register classes.
500 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
501 const TargetRegisterClass *RC =
502 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
503 if (!RC)
504 return false;
506 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
507 unsigned ExtraCondLat = Cond.size() != 1;
509 // GPRs are handled by csel.
510 // FIXME: Fold in x+1, -x, and ~x when applicable.
511 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
512 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
513 // Single-cycle csel, csinc, csinv, and csneg.
514 CondCycles = 1 + ExtraCondLat;
515 TrueCycles = FalseCycles = 1;
516 if (canFoldIntoCSel(MRI, TrueReg))
517 TrueCycles = 0;
518 else if (canFoldIntoCSel(MRI, FalseReg))
519 FalseCycles = 0;
520 return true;
523 // Scalar floating point is handled by fcsel.
524 // FIXME: Form fabs, fmin, and fmax when applicable.
525 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
526 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
527 CondCycles = 5 + ExtraCondLat;
528 TrueCycles = FalseCycles = 2;
529 return true;
532 // Can't do vectors.
533 return false;
536 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
537 MachineBasicBlock::iterator I,
538 const DebugLoc &DL, unsigned DstReg,
539 ArrayRef<MachineOperand> Cond,
540 unsigned TrueReg, unsigned FalseReg) const {
541 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
543 // Parse the condition code, see parseCondBranch() above.
544 AArch64CC::CondCode CC;
545 switch (Cond.size()) {
546 default:
547 llvm_unreachable("Unknown condition opcode in Cond");
548 case 1: // b.cc
549 CC = AArch64CC::CondCode(Cond[0].getImm());
550 break;
551 case 3: { // cbz/cbnz
552 // We must insert a compare against 0.
553 bool Is64Bit;
554 switch (Cond[1].getImm()) {
555 default:
556 llvm_unreachable("Unknown branch opcode in Cond");
557 case AArch64::CBZW:
558 Is64Bit = false;
559 CC = AArch64CC::EQ;
560 break;
561 case AArch64::CBZX:
562 Is64Bit = true;
563 CC = AArch64CC::EQ;
564 break;
565 case AArch64::CBNZW:
566 Is64Bit = false;
567 CC = AArch64CC::NE;
568 break;
569 case AArch64::CBNZX:
570 Is64Bit = true;
571 CC = AArch64CC::NE;
572 break;
574 unsigned SrcReg = Cond[2].getReg();
575 if (Is64Bit) {
576 // cmp reg, #0 is actually subs xzr, reg, #0.
577 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
578 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
579 .addReg(SrcReg)
580 .addImm(0)
581 .addImm(0);
582 } else {
583 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
584 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
585 .addReg(SrcReg)
586 .addImm(0)
587 .addImm(0);
589 break;
591 case 4: { // tbz/tbnz
592 // We must insert a tst instruction.
593 switch (Cond[1].getImm()) {
594 default:
595 llvm_unreachable("Unknown branch opcode in Cond");
596 case AArch64::TBZW:
597 case AArch64::TBZX:
598 CC = AArch64CC::EQ;
599 break;
600 case AArch64::TBNZW:
601 case AArch64::TBNZX:
602 CC = AArch64CC::NE;
603 break;
605 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
606 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
607 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
608 .addReg(Cond[2].getReg())
609 .addImm(
610 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
611 else
612 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
613 .addReg(Cond[2].getReg())
614 .addImm(
615 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
616 break;
620 unsigned Opc = 0;
621 const TargetRegisterClass *RC = nullptr;
622 bool TryFold = false;
623 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
624 RC = &AArch64::GPR64RegClass;
625 Opc = AArch64::CSELXr;
626 TryFold = true;
627 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
628 RC = &AArch64::GPR32RegClass;
629 Opc = AArch64::CSELWr;
630 TryFold = true;
631 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
632 RC = &AArch64::FPR64RegClass;
633 Opc = AArch64::FCSELDrrr;
634 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
635 RC = &AArch64::FPR32RegClass;
636 Opc = AArch64::FCSELSrrr;
638 assert(RC && "Unsupported regclass");
640 // Try folding simple instructions into the csel.
641 if (TryFold) {
642 unsigned NewVReg = 0;
643 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
644 if (FoldedOpc) {
645 // The folded opcodes csinc, csinv and csneg apply the operation to
646 // FalseReg, so we need to invert the condition.
647 CC = AArch64CC::getInvertedCondCode(CC);
648 TrueReg = FalseReg;
649 } else
650 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
652 // Fold the operation. Leave any dead instructions for DCE to clean up.
653 if (FoldedOpc) {
654 FalseReg = NewVReg;
655 Opc = FoldedOpc;
656 // This extends the live range of NewVReg.
657 MRI.clearKillFlags(NewVReg);
661 // Pull all virtual registers into the appropriate class.
662 MRI.constrainRegClass(TrueReg, RC);
663 MRI.constrainRegClass(FalseReg, RC);
665 // Insert the csel.
666 BuildMI(MBB, I, DL, get(Opc), DstReg)
667 .addReg(TrueReg)
668 .addReg(FalseReg)
669 .addImm(CC);
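// Illustrative end-to-end sketch: selecting on a CB[N]Z-style condition such
// as {-1, CBZW, %w1} produces
//   SUBSWri wzr, %w1, 0, 0     ; materialize the compare against zero
//   CSELWr  %dst, %t, %f, eq   ; (ne for CBNZW)
// with csinc/csinv/csneg substituted when one of the inputs folds as above.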
672 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
673 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
674 uint64_t Imm = MI.getOperand(1).getImm();
675 uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
676 uint64_t Encoding;
677 return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
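// For example, MOVi32imm #0x00ff00ff can be re-materialized as a single ORR
// with a logical immediate (a repeating bit pattern), while
// MOVi32imm #0x12345678 cannot and is instead expanded with MOVZ/MOVK.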
680 // FIXME: this implementation should be micro-architecture dependent, so a
681 // micro-architecture target hook should be introduced here in future.
682 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
683 if (!Subtarget.hasCustomCheapAsMoveHandling())
684 return MI.isAsCheapAsAMove();
686 const unsigned Opcode = MI.getOpcode();
688 // Firstly, check cases gated by features.
690 if (Subtarget.hasZeroCycleZeroingFP()) {
691 if (Opcode == AArch64::FMOVH0 ||
692 Opcode == AArch64::FMOVS0 ||
693 Opcode == AArch64::FMOVD0)
694 return true;
697 if (Subtarget.hasZeroCycleZeroingGP()) {
698 if (Opcode == TargetOpcode::COPY &&
699 (MI.getOperand(1).getReg() == AArch64::WZR ||
700 MI.getOperand(1).getReg() == AArch64::XZR))
701 return true;
704 // Secondly, check cases specific to sub-targets.
706 if (Subtarget.hasExynosCheapAsMoveHandling()) {
707 if (isExynosCheapAsMove(MI))
708 return true;
710 return MI.isAsCheapAsAMove();
713 // Finally, check generic cases.
715 switch (Opcode) {
716 default:
717 return false;
719 // add/sub with immediate and no shift
720 case AArch64::ADDWri:
721 case AArch64::ADDXri:
722 case AArch64::SUBWri:
723 case AArch64::SUBXri:
724 return (MI.getOperand(3).getImm() == 0);
726 // logical ops on immediate
727 case AArch64::ANDWri:
728 case AArch64::ANDXri:
729 case AArch64::EORWri:
730 case AArch64::EORXri:
731 case AArch64::ORRWri:
732 case AArch64::ORRXri:
733 return true;
735 // logical ops on register without shift
736 case AArch64::ANDWrr:
737 case AArch64::ANDXrr:
738 case AArch64::BICWrr:
739 case AArch64::BICXrr:
740 case AArch64::EONWrr:
741 case AArch64::EONXrr:
742 case AArch64::EORWrr:
743 case AArch64::EORXrr:
744 case AArch64::ORNWrr:
745 case AArch64::ORNXrr:
746 case AArch64::ORRWrr:
747 case AArch64::ORRXrr:
748 return true;
750 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
751 // ORRXri, it is as cheap as MOV
752 case AArch64::MOVi32imm:
753 return canBeExpandedToORR(MI, 32);
754 case AArch64::MOVi64imm:
755 return canBeExpandedToORR(MI, 64);
758 llvm_unreachable("Unknown opcode to check as cheap as a move!");
761 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
762 switch (MI.getOpcode()) {
763 default:
764 return false;
766 case AArch64::ADDWrs:
767 case AArch64::ADDXrs:
768 case AArch64::ADDSWrs:
769 case AArch64::ADDSXrs: {
770 unsigned Imm = MI.getOperand(3).getImm();
771 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
772 if (ShiftVal == 0)
773 return true;
774 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
777 case AArch64::ADDWrx:
778 case AArch64::ADDXrx:
779 case AArch64::ADDXrx64:
780 case AArch64::ADDSWrx:
781 case AArch64::ADDSXrx:
782 case AArch64::ADDSXrx64: {
783 unsigned Imm = MI.getOperand(3).getImm();
784 switch (AArch64_AM::getArithExtendType(Imm)) {
785 default:
786 return false;
787 case AArch64_AM::UXTB:
788 case AArch64_AM::UXTH:
789 case AArch64_AM::UXTW:
790 case AArch64_AM::UXTX:
791 return AArch64_AM::getArithShiftValue(Imm) <= 4;
795 case AArch64::SUBWrs:
796 case AArch64::SUBSWrs: {
797 unsigned Imm = MI.getOperand(3).getImm();
798 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
799 return ShiftVal == 0 ||
800 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
803 case AArch64::SUBXrs:
804 case AArch64::SUBSXrs: {
805 unsigned Imm = MI.getOperand(3).getImm();
806 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
807 return ShiftVal == 0 ||
808 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
811 case AArch64::SUBWrx:
812 case AArch64::SUBXrx:
813 case AArch64::SUBXrx64:
814 case AArch64::SUBSWrx:
815 case AArch64::SUBSXrx:
816 case AArch64::SUBSXrx64: {
817 unsigned Imm = MI.getOperand(3).getImm();
818 switch (AArch64_AM::getArithExtendType(Imm)) {
819 default:
820 return false;
821 case AArch64_AM::UXTB:
822 case AArch64_AM::UXTH:
823 case AArch64_AM::UXTW:
824 case AArch64_AM::UXTX:
825 return AArch64_AM::getArithShiftValue(Imm) == 0;
829 case AArch64::LDRBBroW:
830 case AArch64::LDRBBroX:
831 case AArch64::LDRBroW:
832 case AArch64::LDRBroX:
833 case AArch64::LDRDroW:
834 case AArch64::LDRDroX:
835 case AArch64::LDRHHroW:
836 case AArch64::LDRHHroX:
837 case AArch64::LDRHroW:
838 case AArch64::LDRHroX:
839 case AArch64::LDRQroW:
840 case AArch64::LDRQroX:
841 case AArch64::LDRSBWroW:
842 case AArch64::LDRSBWroX:
843 case AArch64::LDRSBXroW:
844 case AArch64::LDRSBXroX:
845 case AArch64::LDRSHWroW:
846 case AArch64::LDRSHWroX:
847 case AArch64::LDRSHXroW:
848 case AArch64::LDRSHXroX:
849 case AArch64::LDRSWroW:
850 case AArch64::LDRSWroX:
851 case AArch64::LDRSroW:
852 case AArch64::LDRSroX:
853 case AArch64::LDRWroW:
854 case AArch64::LDRWroX:
855 case AArch64::LDRXroW:
856 case AArch64::LDRXroX:
857 case AArch64::PRFMroW:
858 case AArch64::PRFMroX:
859 case AArch64::STRBBroW:
860 case AArch64::STRBBroX:
861 case AArch64::STRBroW:
862 case AArch64::STRBroX:
863 case AArch64::STRDroW:
864 case AArch64::STRDroX:
865 case AArch64::STRHHroW:
866 case AArch64::STRHHroX:
867 case AArch64::STRHroW:
868 case AArch64::STRHroX:
869 case AArch64::STRQroW:
870 case AArch64::STRQroX:
871 case AArch64::STRSroW:
872 case AArch64::STRSroX:
873 case AArch64::STRWroW:
874 case AArch64::STRWroX:
875 case AArch64::STRXroW:
876 case AArch64::STRXroX: {
877 unsigned IsSigned = MI.getOperand(3).getImm();
878 return !IsSigned;
883 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
884 unsigned Opc = MI.getOpcode();
885 switch (Opc) {
886 default:
887 return false;
888 case AArch64::SEH_StackAlloc:
889 case AArch64::SEH_SaveFPLR:
890 case AArch64::SEH_SaveFPLR_X:
891 case AArch64::SEH_SaveReg:
892 case AArch64::SEH_SaveReg_X:
893 case AArch64::SEH_SaveRegP:
894 case AArch64::SEH_SaveRegP_X:
895 case AArch64::SEH_SaveFReg:
896 case AArch64::SEH_SaveFReg_X:
897 case AArch64::SEH_SaveFRegP:
898 case AArch64::SEH_SaveFRegP_X:
899 case AArch64::SEH_SetFP:
900 case AArch64::SEH_AddFP:
901 case AArch64::SEH_Nop:
902 case AArch64::SEH_PrologEnd:
903 case AArch64::SEH_EpilogStart:
904 case AArch64::SEH_EpilogEnd:
905 return true;
909 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
910 unsigned &SrcReg, unsigned &DstReg,
911 unsigned &SubIdx) const {
912 switch (MI.getOpcode()) {
913 default:
914 return false;
915 case AArch64::SBFMXri: // aka sxtw
916 case AArch64::UBFMXri: // aka uxtw
917 // Check for the 32 -> 64 bit extension case, these instructions can do
918 // much more.
919 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
920 return false;
921 // This is a signed or unsigned 32 -> 64 bit extension.
922 SrcReg = MI.getOperand(1).getReg();
923 DstReg = MI.getOperand(0).getReg();
924 SubIdx = AArch64::sub_32;
925 return true;
929 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
930 MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const {
931 const TargetRegisterInfo *TRI = &getRegisterInfo();
932 MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
933 int64_t OffsetA = 0, OffsetB = 0;
934 unsigned WidthA = 0, WidthB = 0;
936 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
937 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
939 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
940 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
941 return false;
943 // Retrieve the base register, the offset from the base and the width. Width
944 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
945 // the bases are identical, and the offset of the lower memory access plus
946 // its width does not extend beyond the start of the higher memory access,
947 // then the memory accesses are disjoint.
948 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
949 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
950 if (BaseOpA->isIdenticalTo(*BaseOpB)) {
951 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
952 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
953 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
954 if (LowOffset + LowWidth <= HighOffset)
955 return true;
958 return false;
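// Worked example of the check above: ldr x2, [x0] (offset 0, width 8) and
// ldr x3, [x0, #8] (offset 8, width 8) are disjoint because 0 + 8 <= 8;
// with offsets 0 and 4 the ranges would overlap and we conservatively
// return false.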
961 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
962 const MachineBasicBlock *MBB,
963 const MachineFunction &MF) const {
964 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
965 return true;
966 switch (MI.getOpcode()) {
967 case AArch64::HINT:
968 // CSDB hints are scheduling barriers.
969 if (MI.getOperand(0).getImm() == 0x14)
970 return true;
971 break;
972 case AArch64::DSB:
973 case AArch64::ISB:
974 // DSB and ISB also are scheduling barriers.
975 return true;
976 default:;
978 return isSEHInstruction(MI);
981 /// analyzeCompare - For a comparison instruction, return the source registers
982 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
983 /// Return true if the comparison instruction can be analyzed.
984 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
985 unsigned &SrcReg2, int &CmpMask,
986 int &CmpValue) const {
987 // The first operand can be a frame index where we'd normally expect a
988 // register.
989 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
990 if (!MI.getOperand(1).isReg())
991 return false;
993 switch (MI.getOpcode()) {
994 default:
995 break;
996 case AArch64::SUBSWrr:
997 case AArch64::SUBSWrs:
998 case AArch64::SUBSWrx:
999 case AArch64::SUBSXrr:
1000 case AArch64::SUBSXrs:
1001 case AArch64::SUBSXrx:
1002 case AArch64::ADDSWrr:
1003 case AArch64::ADDSWrs:
1004 case AArch64::ADDSWrx:
1005 case AArch64::ADDSXrr:
1006 case AArch64::ADDSXrs:
1007 case AArch64::ADDSXrx:
1008 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1009 SrcReg = MI.getOperand(1).getReg();
1010 SrcReg2 = MI.getOperand(2).getReg();
1011 CmpMask = ~0;
1012 CmpValue = 0;
1013 return true;
1014 case AArch64::SUBSWri:
1015 case AArch64::ADDSWri:
1016 case AArch64::SUBSXri:
1017 case AArch64::ADDSXri:
1018 SrcReg = MI.getOperand(1).getReg();
1019 SrcReg2 = 0;
1020 CmpMask = ~0;
1021 // FIXME: CmpValue is only ever compared against zero, so reduce the immediate to 0 or 1 here.
1022 CmpValue = MI.getOperand(2).getImm() != 0;
1023 return true;
1024 case AArch64::ANDSWri:
1025 case AArch64::ANDSXri:
1026 // ANDS does not use the same encoding scheme as the other xxxS
1027 // instructions.
1028 SrcReg = MI.getOperand(1).getReg();
1029 SrcReg2 = 0;
1030 CmpMask = ~0;
1031 // FIXME: The return value type of decodeLogicalImmediate is uint64_t,
1032 // while the type of CmpValue is int. When converting uint64_t to int,
1033 // the high 32 bits of uint64_t will be lost.
1034 // In fact it causes a bug in spec2006-483.xalancbmk
1035 // CmpValue is only used to compare with zero in OptimizeCompareInstr
1036 CmpValue = AArch64_AM::decodeLogicalImmediate(
1037 MI.getOperand(2).getImm(),
1038 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
1039 return true;
1042 return false;
1045 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1046 MachineBasicBlock *MBB = Instr.getParent();
1047 assert(MBB && "Can't get MachineBasicBlock here");
1048 MachineFunction *MF = MBB->getParent();
1049 assert(MF && "Can't get MachineFunction here");
1050 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1051 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1052 MachineRegisterInfo *MRI = &MF->getRegInfo();
1054 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1055 ++OpIdx) {
1056 MachineOperand &MO = Instr.getOperand(OpIdx);
1057 const TargetRegisterClass *OpRegCstraints =
1058 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1060 // If there's no constraint, there's nothing to do.
1061 if (!OpRegCstraints)
1062 continue;
1063 // If the operand is a frame index, there's nothing to do here.
1064 // A frame index operand will resolve correctly during PEI.
1065 if (MO.isFI())
1066 continue;
1068 assert(MO.isReg() &&
1069 "Operand has register constraints without being a register!");
1071 unsigned Reg = MO.getReg();
1072 if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
1073 if (!OpRegCstraints->contains(Reg))
1074 return false;
1075 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1076 !MRI->constrainRegClass(Reg, OpRegCstraints))
1077 return false;
1080 return true;
1083 /// Return the opcode that does not set flags when possible - otherwise
1084 /// return the original opcode. The caller is responsible to do the actual
1085 /// substitution and legality checking.
1086 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1087 // Don't convert all compare instructions, because for some the zero register
1088 // encoding becomes the sp register.
1089 bool MIDefinesZeroReg = false;
1090 if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1091 MIDefinesZeroReg = true;
1093 switch (MI.getOpcode()) {
1094 default:
1095 return MI.getOpcode();
1096 case AArch64::ADDSWrr:
1097 return AArch64::ADDWrr;
1098 case AArch64::ADDSWri:
1099 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1100 case AArch64::ADDSWrs:
1101 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1102 case AArch64::ADDSWrx:
1103 return AArch64::ADDWrx;
1104 case AArch64::ADDSXrr:
1105 return AArch64::ADDXrr;
1106 case AArch64::ADDSXri:
1107 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1108 case AArch64::ADDSXrs:
1109 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1110 case AArch64::ADDSXrx:
1111 return AArch64::ADDXrx;
1112 case AArch64::SUBSWrr:
1113 return AArch64::SUBWrr;
1114 case AArch64::SUBSWri:
1115 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1116 case AArch64::SUBSWrs:
1117 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1118 case AArch64::SUBSWrx:
1119 return AArch64::SUBWrx;
1120 case AArch64::SUBSXrr:
1121 return AArch64::SUBXrr;
1122 case AArch64::SUBSXri:
1123 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1124 case AArch64::SUBSXrs:
1125 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1126 case AArch64::SUBSXrx:
1127 return AArch64::SUBXrx;
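// Example of the zero-register caveat above: SUBSWri wzr, %w1, 0, 0 (a plain
// "cmp") keeps its flag-setting opcode, because in the non-S encoding a
// destination register number of 31 would mean WSP rather than WZR.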
1131 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1133 /// True when condition flags are accessed (either by writing or reading)
1134 /// on the instruction trace starting at From and ending at To.
1136 /// Note: If From and To are from different blocks it is assumed that the
1137 /// condition flags are accessed on the path.
1138 static bool areCFlagsAccessedBetweenInstrs(
1139 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1140 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1141 // Early exit if To is at the beginning of the BB.
1142 if (To == To->getParent()->begin())
1143 return true;
1145 // Check whether the instructions are in the same basic block
1146 // If not, assume the condition flags might get modified somewhere.
1147 if (To->getParent() != From->getParent())
1148 return true;
1150 // From must be above To.
1151 assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
1152 [From](MachineInstr &MI) {
1153 return MI.getIterator() == From;
1154 }) != To->getParent()->rend());
1156 // We iterate backward starting \p To until we hit \p From.
1157 for (--To; To != From; --To) {
1158 const MachineInstr &Instr = *To;
1160 if (((AccessToCheck & AK_Write) &&
1161 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1162 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1163 return true;
1165 return false;
1168 /// Try to optimize a compare instruction. A compare instruction is an
1169 /// instruction which produces AArch64::NZCV. It is a pure compare
1170 /// instruction only when there are no uses of its destination register.
1173 /// The following steps are tried in order:
1174 /// 1. Convert CmpInstr into an unconditional version.
1175 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1176 /// condition code or an instruction which can be converted into such an
1177 /// instruction.
1178 /// Only comparison with zero is supported.
1179 bool AArch64InstrInfo::optimizeCompareInstr(
1180 MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
1181 int CmpValue, const MachineRegisterInfo *MRI) const {
1182 assert(CmpInstr.getParent());
1183 assert(MRI);
1185 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1186 int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1187 if (DeadNZCVIdx != -1) {
1188 if (CmpInstr.definesRegister(AArch64::WZR) ||
1189 CmpInstr.definesRegister(AArch64::XZR)) {
1190 CmpInstr.eraseFromParent();
1191 return true;
1193 unsigned Opc = CmpInstr.getOpcode();
1194 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1195 if (NewOpc == Opc)
1196 return false;
1197 const MCInstrDesc &MCID = get(NewOpc);
1198 CmpInstr.setDesc(MCID);
1199 CmpInstr.RemoveOperand(DeadNZCVIdx);
1200 bool succeeded = UpdateOperandRegClass(CmpInstr);
1201 (void)succeeded;
1202 assert(succeeded && "Some operands reg class are incompatible!");
1203 return true;
1206 // Continue only if we have an "ri" form where the immediate is zero.
1207 // FIXME: CmpValue has already been converted to 0 or 1 in the
1208 // analyzeCompare function.
1209 assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
1210 if (CmpValue != 0 || SrcReg2 != 0)
1211 return false;
1213 // CmpInstr is a pure compare instruction only if its destination register is not used.
1214 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1215 return false;
1217 return substituteCmpToZero(CmpInstr, SrcReg, MRI);
1220 /// Get opcode of S version of Instr.
1221 /// If Instr is already the S version, its opcode is returned.
1222 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
1223 /// version or we are not interested in it.
1224 static unsigned sForm(MachineInstr &Instr) {
1225 switch (Instr.getOpcode()) {
1226 default:
1227 return AArch64::INSTRUCTION_LIST_END;
1229 case AArch64::ADDSWrr:
1230 case AArch64::ADDSWri:
1231 case AArch64::ADDSXrr:
1232 case AArch64::ADDSXri:
1233 case AArch64::SUBSWrr:
1234 case AArch64::SUBSWri:
1235 case AArch64::SUBSXrr:
1236 case AArch64::SUBSXri:
1237 return Instr.getOpcode();
1239 case AArch64::ADDWrr:
1240 return AArch64::ADDSWrr;
1241 case AArch64::ADDWri:
1242 return AArch64::ADDSWri;
1243 case AArch64::ADDXrr:
1244 return AArch64::ADDSXrr;
1245 case AArch64::ADDXri:
1246 return AArch64::ADDSXri;
1247 case AArch64::ADCWr:
1248 return AArch64::ADCSWr;
1249 case AArch64::ADCXr:
1250 return AArch64::ADCSXr;
1251 case AArch64::SUBWrr:
1252 return AArch64::SUBSWrr;
1253 case AArch64::SUBWri:
1254 return AArch64::SUBSWri;
1255 case AArch64::SUBXrr:
1256 return AArch64::SUBSXrr;
1257 case AArch64::SUBXri:
1258 return AArch64::SUBSXri;
1259 case AArch64::SBCWr:
1260 return AArch64::SBCSWr;
1261 case AArch64::SBCXr:
1262 return AArch64::SBCSXr;
1263 case AArch64::ANDWri:
1264 return AArch64::ANDSWri;
1265 case AArch64::ANDXri:
1266 return AArch64::ANDSXri;
1270 /// Check if AArch64::NZCV should be alive in successors of MBB.
1271 static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
1272 for (auto *BB : MBB->successors())
1273 if (BB->isLiveIn(AArch64::NZCV))
1274 return true;
1275 return false;
1278 namespace {
1280 struct UsedNZCV {
1281 bool N = false;
1282 bool Z = false;
1283 bool C = false;
1284 bool V = false;
1286 UsedNZCV() = default;
1288 UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
1289 this->N |= UsedFlags.N;
1290 this->Z |= UsedFlags.Z;
1291 this->C |= UsedFlags.C;
1292 this->V |= UsedFlags.V;
1293 return *this;
1297 } // end anonymous namespace
1299 /// Find a condition code used by the instruction.
1300 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1301 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1302 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1303 switch (Instr.getOpcode()) {
1304 default:
1305 return AArch64CC::Invalid;
1307 case AArch64::Bcc: {
1308 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1309 assert(Idx >= 2);
1310 return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
1313 case AArch64::CSINVWr:
1314 case AArch64::CSINVXr:
1315 case AArch64::CSINCWr:
1316 case AArch64::CSINCXr:
1317 case AArch64::CSELWr:
1318 case AArch64::CSELXr:
1319 case AArch64::CSNEGWr:
1320 case AArch64::CSNEGXr:
1321 case AArch64::FCSELSrrr:
1322 case AArch64::FCSELDrrr: {
1323 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1324 assert(Idx >= 1);
1325 return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
1330 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1331 assert(CC != AArch64CC::Invalid);
1332 UsedNZCV UsedFlags;
1333 switch (CC) {
1334 default:
1335 break;
1337 case AArch64CC::EQ: // Z set
1338 case AArch64CC::NE: // Z clear
1339 UsedFlags.Z = true;
1340 break;
1342 case AArch64CC::HI: // Z clear and C set
1343 case AArch64CC::LS: // Z set or C clear
1344 UsedFlags.Z = true;
1345 LLVM_FALLTHROUGH;
1346 case AArch64CC::HS: // C set
1347 case AArch64CC::LO: // C clear
1348 UsedFlags.C = true;
1349 break;
1351 case AArch64CC::MI: // N set
1352 case AArch64CC::PL: // N clear
1353 UsedFlags.N = true;
1354 break;
1356 case AArch64CC::VS: // V set
1357 case AArch64CC::VC: // V clear
1358 UsedFlags.V = true;
1359 break;
1361 case AArch64CC::GT: // Z clear, N and V the same
1362 case AArch64CC::LE: // Z set, N and V differ
1363 UsedFlags.Z = true;
1364 LLVM_FALLTHROUGH;
1365 case AArch64CC::GE: // N and V the same
1366 case AArch64CC::LT: // N and V differ
1367 UsedFlags.N = true;
1368 UsedFlags.V = true;
1369 break;
1371 return UsedFlags;
1374 static bool isADDSRegImm(unsigned Opcode) {
1375 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1378 static bool isSUBSRegImm(unsigned Opcode) {
1379 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1382 /// Check if CmpInstr can be substituted by MI.
1384 /// CmpInstr can be substituted when:
1385 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1386 /// - and, MI and CmpInstr are from the same MachineBB
1387 /// - and, condition flags are not alive in successors of the CmpInstr parent
1388 /// - and, if MI opcode is the S form there must be no defs of flags between
1389 /// MI and CmpInstr
1390 /// or if MI opcode is not the S form there must be neither defs of flags
1391 /// nor uses of flags between MI and CmpInstr.
1392 /// - and C/V flags are not used after CmpInstr
1393 static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
1394 const TargetRegisterInfo *TRI) {
1395 assert(MI);
1396 assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
1397 assert(CmpInstr);
1399 const unsigned CmpOpcode = CmpInstr->getOpcode();
1400 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1401 return false;
1403 if (MI->getParent() != CmpInstr->getParent())
1404 return false;
1406 if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
1407 return false;
1409 AccessKind AccessToCheck = AK_Write;
1410 if (sForm(*MI) != MI->getOpcode())
1411 AccessToCheck = AK_All;
1412 if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
1413 return false;
1415 UsedNZCV NZCVUsedAfterCmp;
1416 for (auto I = std::next(CmpInstr->getIterator()),
1417 E = CmpInstr->getParent()->instr_end();
1418 I != E; ++I) {
1419 const MachineInstr &Instr = *I;
1420 if (Instr.readsRegister(AArch64::NZCV, TRI)) {
1421 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1422 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1423 return false;
1424 NZCVUsedAfterCmp |= getUsedNZCV(CC);
1427 if (Instr.modifiesRegister(AArch64::NZCV, TRI))
1428 break;
1431 return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
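// The C/V restriction exists because the S-form of the defining instruction
// (e.g. an ADDS/SUBS of two registers) and an explicit "SUBS ..., #0" agree
// on the N and Z flags but may compute different C and V values.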
1434 /// Substitute an instruction comparing to zero with another instruction
1435 /// which produces needed condition flags.
1437 /// Return true on success.
1438 bool AArch64InstrInfo::substituteCmpToZero(
1439 MachineInstr &CmpInstr, unsigned SrcReg,
1440 const MachineRegisterInfo *MRI) const {
1441 assert(MRI);
1442 // Get the unique definition of SrcReg.
1443 MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
1444 if (!MI)
1445 return false;
1447 const TargetRegisterInfo *TRI = &getRegisterInfo();
1449 unsigned NewOpc = sForm(*MI);
1450 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1451 return false;
1453 if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
1454 return false;
1456 // Update the instruction to set NZCV.
1457 MI->setDesc(get(NewOpc));
1458 CmpInstr.eraseFromParent();
1459 bool succeeded = UpdateOperandRegClass(*MI);
1460 (void)succeeded;
1461 assert(succeeded && "Some operands reg class are incompatible!");
1462 MI->addRegisterDefined(AArch64::NZCV, TRI);
1463 return true;
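// Illustrative sketch of the substitution (simplified MIR, register names
// are placeholders):
//   %3 = SUBWrr %1, %2
//   dead %4 = SUBSWri %3, 0, 0, implicit-def $nzcv
//   Bcc ne, %bb.2, implicit $nzcv
// becomes, after the compare is erased:
//   %3 = SUBSWrr %1, %2, implicit-def $nzcv
//   Bcc ne, %bb.2, implicit $nzcv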
1466 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1467 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1468 MI.getOpcode() != AArch64::CATCHRET)
1469 return false;
1471 MachineBasicBlock &MBB = *MI.getParent();
1472 DebugLoc DL = MI.getDebugLoc();
1474 if (MI.getOpcode() == AArch64::CATCHRET) {
1475 // Skip to the first instruction before the epilog.
1476 const TargetInstrInfo *TII =
1477 MBB.getParent()->getSubtarget().getInstrInfo();
1478 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1479 auto MBBI = MachineBasicBlock::iterator(MI);
1480 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1481 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1482 FirstEpilogSEH != MBB.begin())
1483 FirstEpilogSEH = std::prev(FirstEpilogSEH);
1484 if (FirstEpilogSEH != MBB.begin())
1485 FirstEpilogSEH = std::next(FirstEpilogSEH);
1486 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1487 .addReg(AArch64::X0, RegState::Define)
1488 .addMBB(TargetMBB);
1489 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1490 .addReg(AArch64::X0, RegState::Define)
1491 .addReg(AArch64::X0)
1492 .addMBB(TargetMBB)
1493 .addImm(0);
1494 return true;
1497 unsigned Reg = MI.getOperand(0).getReg();
1498 const GlobalValue *GV =
1499 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1500 const TargetMachine &TM = MBB.getParent()->getTarget();
1501 unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1502 const unsigned char MO_NC = AArch64II::MO_NC;
1504 if ((OpFlags & AArch64II::MO_GOT) != 0) {
1505 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
1506 .addGlobalAddress(GV, 0, OpFlags);
1507 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1508 .addReg(Reg, RegState::Kill)
1509 .addImm(0)
1510 .addMemOperand(*MI.memoperands_begin());
1511 } else if (TM.getCodeModel() == CodeModel::Large) {
1512 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1513 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
1514 .addImm(0);
1515 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1516 .addReg(Reg, RegState::Kill)
1517 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
1518 .addImm(16);
1519 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1520 .addReg(Reg, RegState::Kill)
1521 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
1522 .addImm(32);
1523 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1524 .addReg(Reg, RegState::Kill)
1525 .addGlobalAddress(GV, 0, AArch64II::MO_G3)
1526 .addImm(48);
1527 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1528 .addReg(Reg, RegState::Kill)
1529 .addImm(0)
1530 .addMemOperand(*MI.memoperands_begin());
1531 } else if (TM.getCodeModel() == CodeModel::Tiny) {
1532 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
1533 .addGlobalAddress(GV, 0, OpFlags);
1534 } else {
1535 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
1536 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
1537 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
1538 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1539 .addReg(Reg, RegState::Kill)
1540 .addGlobalAddress(GV, 0, LoFlags)
1541 .addMemOperand(*MI.memoperands_begin());
1544 MBB.erase(MI);
1546 return true;
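// Illustrative expansion of LOAD_STACK_GUARD for the default (small) code
// model without GOT indirection, assuming a typical __stack_chk_guard global:
//   adrp xN, __stack_chk_guard
//   ldr  xN, [xN, :lo12:__stack_chk_guard]
// The GOT and large-code-model paths above instead go through LOADgot or a
// MOVZ/MOVK sequence followed by an LDRXui.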
1549 // Return true if this instruction simply sets its single destination register
1550 // to zero. This is equivalent to a register rename of the zero-register.
1551 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
1552 switch (MI.getOpcode()) {
1553 default:
1554 break;
1555 case AArch64::MOVZWi:
1556 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
1557 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
1558 assert(MI.getDesc().getNumOperands() == 3 &&
1559 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
1560 return true;
1562 break;
1563 case AArch64::ANDWri: // and Rd, Rzr, #imm
1564 return MI.getOperand(1).getReg() == AArch64::WZR;
1565 case AArch64::ANDXri:
1566 return MI.getOperand(1).getReg() == AArch64::XZR;
1567 case TargetOpcode::COPY:
1568 return MI.getOperand(1).getReg() == AArch64::WZR;
1570 return false;
1573 // Return true if this instruction simply renames a general register without
1574 // modifying bits.
1575 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
1576 switch (MI.getOpcode()) {
1577 default:
1578 break;
1579 case TargetOpcode::COPY: {
1580 // GPR32 copies will be lowered to ORRXrs
1581 unsigned DstReg = MI.getOperand(0).getReg();
1582 return (AArch64::GPR32RegClass.contains(DstReg) ||
1583 AArch64::GPR64RegClass.contains(DstReg));
1585 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
1586 if (MI.getOperand(1).getReg() == AArch64::XZR) {
1587 assert(MI.getDesc().getNumOperands() == 4 &&
1588 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
1589 return true;
1591 break;
1592 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
1593 if (MI.getOperand(2).getImm() == 0) {
1594 assert(MI.getDesc().getNumOperands() == 4 &&
1595 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
1596 return true;
1598 break;
1600 return false;
1603 // Return true if this instruction simply renames a general register without
1604 // modifying bits.
1605 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
1606 switch (MI.getOpcode()) {
1607 default:
1608 break;
1609 case TargetOpcode::COPY: {
1610 // FPR64 copies will be lowered to ORR.16b
1611 unsigned DstReg = MI.getOperand(0).getReg();
1612 return (AArch64::FPR64RegClass.contains(DstReg) ||
1613 AArch64::FPR128RegClass.contains(DstReg));
1615 case AArch64::ORRv16i8:
1616 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
1617 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
1618 "invalid ORRv16i8 operands");
1619 return true;
1621 break;
1623 return false;
1626 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
1627 int &FrameIndex) const {
1628 switch (MI.getOpcode()) {
1629 default:
1630 break;
1631 case AArch64::LDRWui:
1632 case AArch64::LDRXui:
1633 case AArch64::LDRBui:
1634 case AArch64::LDRHui:
1635 case AArch64::LDRSui:
1636 case AArch64::LDRDui:
1637 case AArch64::LDRQui:
1638 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1639 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1640 FrameIndex = MI.getOperand(1).getIndex();
1641 return MI.getOperand(0).getReg();
1643 break;
1646 return 0;
1649 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
1650 int &FrameIndex) const {
1651 switch (MI.getOpcode()) {
1652 default:
1653 break;
1654 case AArch64::STRWui:
1655 case AArch64::STRXui:
1656 case AArch64::STRBui:
1657 case AArch64::STRHui:
1658 case AArch64::STRSui:
1659 case AArch64::STRDui:
1660 case AArch64::STRQui:
1661 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1662 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1663 FrameIndex = MI.getOperand(1).getIndex();
1664 return MI.getOperand(0).getReg();
1666 break;
1668 return 0;
1671 /// Check all MachineMemOperands for a hint to suppress pairing.
1672 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
1673 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1674 return MMO->getFlags() & MOSuppressPair;
1678 /// Set a flag on the first MachineMemOperand to suppress pairing.
1679 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
1680 if (MI.memoperands_empty())
1681 return;
1682 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
1685 /// Check all MachineMemOperands for a hint that the load/store is strided.
1686 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
1687 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1688 return MMO->getFlags() & MOStridedAccess;
1692 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
1693 switch (Opc) {
1694 default:
1695 return false;
1696 case AArch64::STURSi:
1697 case AArch64::STURDi:
1698 case AArch64::STURQi:
1699 case AArch64::STURBBi:
1700 case AArch64::STURHHi:
1701 case AArch64::STURWi:
1702 case AArch64::STURXi:
1703 case AArch64::LDURSi:
1704 case AArch64::LDURDi:
1705 case AArch64::LDURQi:
1706 case AArch64::LDURWi:
1707 case AArch64::LDURXi:
1708 case AArch64::LDURSWi:
1709 case AArch64::LDURHHi:
1710 case AArch64::LDURBBi:
1711 case AArch64::LDURSBWi:
1712 case AArch64::LDURSHWi:
1713 return true;
1717 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
1718 switch (MI.getOpcode()) {
1719 default:
1720 return false;
1721 // Scaled instructions.
1722 case AArch64::STRSui:
1723 case AArch64::STRDui:
1724 case AArch64::STRQui:
1725 case AArch64::STRXui:
1726 case AArch64::STRWui:
1727 case AArch64::LDRSui:
1728 case AArch64::LDRDui:
1729 case AArch64::LDRQui:
1730 case AArch64::LDRXui:
1731 case AArch64::LDRWui:
1732 case AArch64::LDRSWui:
1733 // Unscaled instructions.
1734 case AArch64::STURSi:
1735 case AArch64::STURDi:
1736 case AArch64::STURQi:
1737 case AArch64::STURWi:
1738 case AArch64::STURXi:
1739 case AArch64::LDURSi:
1740 case AArch64::LDURDi:
1741 case AArch64::LDURQi:
1742 case AArch64::LDURWi:
1743 case AArch64::LDURXi:
1744 case AArch64::LDURSWi:
1745 return true;
1749 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
1750 bool &Is64Bit) {
1751 switch (Opc) {
1752 default:
1753 llvm_unreachable("Opcode has no flag setting equivalent!");
1754 // 32-bit cases:
1755 case AArch64::ADDWri:
1756 Is64Bit = false;
1757 return AArch64::ADDSWri;
1758 case AArch64::ADDWrr:
1759 Is64Bit = false;
1760 return AArch64::ADDSWrr;
1761 case AArch64::ADDWrs:
1762 Is64Bit = false;
1763 return AArch64::ADDSWrs;
1764 case AArch64::ADDWrx:
1765 Is64Bit = false;
1766 return AArch64::ADDSWrx;
1767 case AArch64::ANDWri:
1768 Is64Bit = false;
1769 return AArch64::ANDSWri;
1770 case AArch64::ANDWrr:
1771 Is64Bit = false;
1772 return AArch64::ANDSWrr;
1773 case AArch64::ANDWrs:
1774 Is64Bit = false;
1775 return AArch64::ANDSWrs;
1776 case AArch64::BICWrr:
1777 Is64Bit = false;
1778 return AArch64::BICSWrr;
1779 case AArch64::BICWrs:
1780 Is64Bit = false;
1781 return AArch64::BICSWrs;
1782 case AArch64::SUBWri:
1783 Is64Bit = false;
1784 return AArch64::SUBSWri;
1785 case AArch64::SUBWrr:
1786 Is64Bit = false;
1787 return AArch64::SUBSWrr;
1788 case AArch64::SUBWrs:
1789 Is64Bit = false;
1790 return AArch64::SUBSWrs;
1791 case AArch64::SUBWrx:
1792 Is64Bit = false;
1793 return AArch64::SUBSWrx;
1794 // 64-bit cases:
1795 case AArch64::ADDXri:
1796 Is64Bit = true;
1797 return AArch64::ADDSXri;
1798 case AArch64::ADDXrr:
1799 Is64Bit = true;
1800 return AArch64::ADDSXrr;
1801 case AArch64::ADDXrs:
1802 Is64Bit = true;
1803 return AArch64::ADDSXrs;
1804 case AArch64::ADDXrx:
1805 Is64Bit = true;
1806 return AArch64::ADDSXrx;
1807 case AArch64::ANDXri:
1808 Is64Bit = true;
1809 return AArch64::ANDSXri;
1810 case AArch64::ANDXrr:
1811 Is64Bit = true;
1812 return AArch64::ANDSXrr;
1813 case AArch64::ANDXrs:
1814 Is64Bit = true;
1815 return AArch64::ANDSXrs;
1816 case AArch64::BICXrr:
1817 Is64Bit = true;
1818 return AArch64::BICSXrr;
1819 case AArch64::BICXrs:
1820 Is64Bit = true;
1821 return AArch64::BICSXrs;
1822 case AArch64::SUBXri:
1823 Is64Bit = true;
1824 return AArch64::SUBSXri;
1825 case AArch64::SUBXrr:
1826 Is64Bit = true;
1827 return AArch64::SUBSXrr;
1828 case AArch64::SUBXrs:
1829 Is64Bit = true;
1830 return AArch64::SUBSXrs;
1831 case AArch64::SUBXrx:
1832 Is64Bit = true;
1833 return AArch64::SUBSXrx;
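// Illustrative use: rewriting an ADDWri as its ADDSWri form makes the add
// itself define NZCV, which can allow a separate compare of the same result to
// be removed; Is64Bit reports whether the returned opcode is the X-register
// form.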
1837 // Is this a candidate for ld/st merging or pairing? For example, we don't
1838 // touch volatiles or load/stores that have a hint to avoid pair formation.
1839 bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
1840 // If this is a volatile load/store, don't mess with it.
1841 if (MI.hasOrderedMemoryRef())
1842 return false;
1844 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
1845 assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
1846 "Expected a reg or frame index operand.");
1847 if (!MI.getOperand(2).isImm())
1848 return false;
1850 // Can't merge/pair if the instruction modifies the base register.
1851 // e.g., ldr x0, [x0]
1852 // This case will never occur with an FI base.
1853 if (MI.getOperand(1).isReg()) {
1854 unsigned BaseReg = MI.getOperand(1).getReg();
1855 const TargetRegisterInfo *TRI = &getRegisterInfo();
1856 if (MI.modifiesRegister(BaseReg, TRI))
1857 return false;
1860 // Check if this load/store has a hint to avoid pair formation.
1861 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
1862 if (isLdStPairSuppressed(MI))
1863 return false;
1865 // On some CPUs quad load/store pairs are slower than two single load/stores.
1866 if (Subtarget.isPaired128Slow()) {
1867 switch (MI.getOpcode()) {
1868 default:
1869 break;
1870 case AArch64::LDURQi:
1871 case AArch64::STURQi:
1872 case AArch64::LDRQui:
1873 case AArch64::STRQui:
1874 return false;
1878 return true;
1881 bool AArch64InstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
1882 MachineOperand *&BaseOp,
1883 int64_t &Offset,
1884 const TargetRegisterInfo *TRI) const {
1885 unsigned Width;
1886 return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
1889 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
1890 MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset,
1891 unsigned &Width, const TargetRegisterInfo *TRI) const {
1892 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
1893 // Handle only loads/stores with base register followed by immediate offset.
1894 if (LdSt.getNumExplicitOperands() == 3) {
1895 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
1896 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
1897 !LdSt.getOperand(2).isImm())
1898 return false;
1899 } else if (LdSt.getNumExplicitOperands() == 4) {
1900 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
1901 if (!LdSt.getOperand(1).isReg() ||
1902 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
1903 !LdSt.getOperand(3).isImm())
1904 return false;
1905 } else
1906 return false;
1908 // Get the scaling factor for the instruction and set the width for the
1909 // instruction.
1910 unsigned Scale = 0;
1911 int64_t Dummy1, Dummy2;
1913 // If this returns false, then it's an instruction we don't want to handle.
1914 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
1915 return false;
1917 // Compute the offset. Offset is calculated as the immediate operand
1918 // multiplied by the scaling factor. Unscaled instructions have scaling factor
1919 // set to 1.
1920 if (LdSt.getNumExplicitOperands() == 3) {
1921 BaseOp = &LdSt.getOperand(1);
1922 Offset = LdSt.getOperand(2).getImm() * Scale;
1923 } else {
1924 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
1925 BaseOp = &LdSt.getOperand(2);
1926 Offset = LdSt.getOperand(3).getImm() * Scale;
1929 assert((BaseOp->isReg() || BaseOp->isFI()) &&
1930 "getMemOperandWithOffset only supports base "
1931 "operands of type register or frame index.");
1933 return true;
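// Worked example (scales taken from getMemOpInfo below): for
//   "ldr x1, [x0, #16]"     (LDRXui, Scale = 8, imm operand = 2)
// BaseOp is the x0 operand and Offset = 2 * 8 = 16 bytes; for the paired
//   "ldp x1, x2, [x0, #32]" (LDPXi, Scale = 8, imm operand = 4)
// BaseOp is operand 2 and Offset = 4 * 8 = 32 bytes.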
1936 MachineOperand &
1937 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
1938 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
1939 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
1940 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
1941 return OfsOp;
1944 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
1945 unsigned &Width, int64_t &MinOffset,
1946 int64_t &MaxOffset) const {
1947 switch (Opcode) {
1948 // Not a memory operation or something we want to handle.
1949 default:
1950 Scale = Width = 0;
1951 MinOffset = MaxOffset = 0;
1952 return false;
1953 case AArch64::STRWpost:
1954 case AArch64::LDRWpost:
1955 Width = 32;
1956 Scale = 4;
1957 MinOffset = -256;
1958 MaxOffset = 255;
1959 break;
1960 case AArch64::LDURQi:
1961 case AArch64::STURQi:
1962 Width = 16;
1963 Scale = 1;
1964 MinOffset = -256;
1965 MaxOffset = 255;
1966 break;
1967 case AArch64::LDURXi:
1968 case AArch64::LDURDi:
1969 case AArch64::STURXi:
1970 case AArch64::STURDi:
1971 Width = 8;
1972 Scale = 1;
1973 MinOffset = -256;
1974 MaxOffset = 255;
1975 break;
1976 case AArch64::LDURWi:
1977 case AArch64::LDURSi:
1978 case AArch64::LDURSWi:
1979 case AArch64::STURWi:
1980 case AArch64::STURSi:
1981 Width = 4;
1982 Scale = 1;
1983 MinOffset = -256;
1984 MaxOffset = 255;
1985 break;
1986 case AArch64::LDURHi:
1987 case AArch64::LDURHHi:
1988 case AArch64::LDURSHXi:
1989 case AArch64::LDURSHWi:
1990 case AArch64::STURHi:
1991 case AArch64::STURHHi:
1992 Width = 2;
1993 Scale = 1;
1994 MinOffset = -256;
1995 MaxOffset = 255;
1996 break;
1997 case AArch64::LDURBi:
1998 case AArch64::LDURBBi:
1999 case AArch64::LDURSBXi:
2000 case AArch64::LDURSBWi:
2001 case AArch64::STURBi:
2002 case AArch64::STURBBi:
2003 Width = 1;
2004 Scale = 1;
2005 MinOffset = -256;
2006 MaxOffset = 255;
2007 break;
2008 case AArch64::LDPQi:
2009 case AArch64::LDNPQi:
2010 case AArch64::STPQi:
2011 case AArch64::STNPQi:
2012 Scale = 16;
2013 Width = 32;
2014 MinOffset = -64;
2015 MaxOffset = 63;
2016 break;
2017 case AArch64::LDRQui:
2018 case AArch64::STRQui:
2019 Scale = Width = 16;
2020 MinOffset = 0;
2021 MaxOffset = 4095;
2022 break;
2023 case AArch64::LDPXi:
2024 case AArch64::LDPDi:
2025 case AArch64::LDNPXi:
2026 case AArch64::LDNPDi:
2027 case AArch64::STPXi:
2028 case AArch64::STPDi:
2029 case AArch64::STNPXi:
2030 case AArch64::STNPDi:
2031 Scale = 8;
2032 Width = 16;
2033 MinOffset = -64;
2034 MaxOffset = 63;
2035 break;
2036 case AArch64::LDRXui:
2037 case AArch64::LDRDui:
2038 case AArch64::STRXui:
2039 case AArch64::STRDui:
2040 Scale = Width = 8;
2041 MinOffset = 0;
2042 MaxOffset = 4095;
2043 break;
2044 case AArch64::LDPWi:
2045 case AArch64::LDPSi:
2046 case AArch64::LDNPWi:
2047 case AArch64::LDNPSi:
2048 case AArch64::STPWi:
2049 case AArch64::STPSi:
2050 case AArch64::STNPWi:
2051 case AArch64::STNPSi:
2052 Scale = 4;
2053 Width = 8;
2054 MinOffset = -64;
2055 MaxOffset = 63;
2056 break;
2057 case AArch64::LDRWui:
2058 case AArch64::LDRSui:
2059 case AArch64::LDRSWui:
2060 case AArch64::STRWui:
2061 case AArch64::STRSui:
2062 Scale = Width = 4;
2063 MinOffset = 0;
2064 MaxOffset = 4095;
2065 break;
2066 case AArch64::LDRHui:
2067 case AArch64::LDRHHui:
2068 case AArch64::STRHui:
2069 case AArch64::STRHHui:
2070 Scale = Width = 2;
2071 MinOffset = 0;
2072 MaxOffset = 4095;
2073 break;
2074 case AArch64::LDRBui:
2075 case AArch64::LDRBBui:
2076 case AArch64::STRBui:
2077 case AArch64::STRBBui:
2078 Scale = Width = 1;
2079 MinOffset = 0;
2080 MaxOffset = 4095;
2081 break;
2084 return true;
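// Rough summary of the byte ranges implied by the table above:
//   scaled   (e.g. LDRXui): imm 0..4095 scaled by the access size, i.e.
//                           0..32760 bytes for an 8-byte access;
//   unscaled (e.g. LDURXi): imm -256..255, always in bytes;
//   paired   (e.g. LDPXi):  imm -64..63 in element-size units, i.e.
//                           -512..504 bytes for 8-byte elements.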
2087 static unsigned getOffsetStride(unsigned Opc) {
2088 switch (Opc) {
2089 default:
2090 return 0;
2091 case AArch64::LDURQi:
2092 case AArch64::STURQi:
2093 return 16;
2094 case AArch64::LDURXi:
2095 case AArch64::LDURDi:
2096 case AArch64::STURXi:
2097 case AArch64::STURDi:
2098 return 8;
2099 case AArch64::LDURWi:
2100 case AArch64::LDURSi:
2101 case AArch64::LDURSWi:
2102 case AArch64::STURWi:
2103 case AArch64::STURSi:
2104 return 4;
2108 // Scale the unscaled offsets. Returns false if the unscaled offset can't be
2109 // scaled.
2110 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
2111 unsigned OffsetStride = getOffsetStride(Opc);
2112 if (OffsetStride == 0)
2113 return false;
2114 // If the byte-offset isn't a multiple of the stride, we can't scale this
2115 // offset.
2116 if (Offset % OffsetStride != 0)
2117 return false;
2119 // Convert the byte-offset used by unscaled into an "element" offset used
2120 // by the scaled pair load/store instructions.
2121 Offset /= OffsetStride;
2122 return true;
2125 // Unscale the scaled offsets. Returns false if the scaled offset can't be
2126 // unscaled.
2127 static bool unscaleOffset(unsigned Opc, int64_t &Offset) {
2128 unsigned OffsetStride = getOffsetStride(Opc);
2129 if (OffsetStride == 0)
2130 return false;
2132 // Convert the "element" offset used by scaled pair load/store instructions
2133 // into the byte-offset used by unscaled.
2134 Offset *= OffsetStride;
2135 return true;
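// Worked example: STURXi has a stride of 8, so a byte offset of 24 scales to
// element offset 3, while a byte offset of 20 is rejected (not a multiple of
// 8); unscaleOffset() is the inverse and turns element offset 3 back into 24.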
2138 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
2139 if (FirstOpc == SecondOpc)
2140 return true;
2141 // We can also pair sign-ext and zero-ext instructions.
2142 switch (FirstOpc) {
2143 default:
2144 return false;
2145 case AArch64::LDRWui:
2146 case AArch64::LDURWi:
2147 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
2148 case AArch64::LDRSWui:
2149 case AArch64::LDURSWi:
2150 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
2152 // These instructions can't be paired based on their opcodes.
2153 return false;
2156 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
2157 int64_t Offset1, unsigned Opcode1, int FI2,
2158 int64_t Offset2, unsigned Opcode2) {
2159 // Accesses through fixed stack object frame indices may access a different
2160 // fixed stack slot. Check that the object offsets + offsets match.
2161 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
2162 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
2163 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
2164 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
2165 // Get the byte-offset from the object offset.
2166 if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2))
2167 return false;
2168 ObjectOffset1 += Offset1;
2169 ObjectOffset2 += Offset2;
2170 // Get the "element" index in the object.
2171 if (!scaleOffset(Opcode1, ObjectOffset1) ||
2172 !scaleOffset(Opcode2, ObjectOffset2))
2173 return false;
2174 return ObjectOffset1 + 1 == ObjectOffset2;
2177 return FI1 == FI2;
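// Illustrative example: two fixed-stack accesses, an LDURXi on FI#1 (object
// offset 0) and an LDURXi on FI#2 (object offset 8), both with an element
// offset of 0, resolve to byte offsets 0 and 8 and hence element indices 0
// and 1, so they are treated as adjacent even though the frame indices differ.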
2180 /// Detect opportunities for ldp/stp formation.
2182 /// Only called for LdSt for which getMemOperandWithOffset returns true.
2183 bool AArch64InstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
2184 MachineOperand &BaseOp2,
2185 unsigned NumLoads) const {
2186 MachineInstr &FirstLdSt = *BaseOp1.getParent();
2187 MachineInstr &SecondLdSt = *BaseOp2.getParent();
2188 if (BaseOp1.getType() != BaseOp2.getType())
2189 return false;
2191 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
2192 "Only base registers and frame indices are supported.");
2194 // Check for both base regs and base FI.
2195 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
2196 return false;
2198 // Only cluster up to a single pair.
2199 if (NumLoads > 1)
2200 return false;
2202 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
2203 return false;
2205 // Can we pair these instructions based on their opcodes?
2206 unsigned FirstOpc = FirstLdSt.getOpcode();
2207 unsigned SecondOpc = SecondLdSt.getOpcode();
2208 if (!canPairLdStOpc(FirstOpc, SecondOpc))
2209 return false;
2211 // Can't merge volatiles or load/stores that have a hint to avoid pair
2212 // formation, for example.
2213 if (!isCandidateToMergeOrPair(FirstLdSt) ||
2214 !isCandidateToMergeOrPair(SecondLdSt))
2215 return false;
2217 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
2218 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
2219 if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
2220 return false;
2222 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
2223 if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
2224 return false;
2226 // Pairwise instructions have a 7-bit signed offset field.
2227 if (Offset1 > 63 || Offset1 < -64)
2228 return false;
2230 // The caller should already have ordered First/SecondLdSt by offset.
2231 // Note: this guarantee does not hold when the bases are distinct frame indices.
2232 if (BaseOp1.isFI()) {
2233 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) &&
2234 "Caller should have ordered offsets.");
2236 const MachineFrameInfo &MFI =
2237 FirstLdSt.getParent()->getParent()->getFrameInfo();
2238 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
2239 BaseOp2.getIndex(), Offset2, SecondOpc);
2242 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
2243 "Caller should have ordered offsets.");
2245 return Offset1 + 1 == Offset2;
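// Illustrative example: "ldr x1, [x0]" and "ldr x2, [x0, #8]" (LDRXui with
// immediates 0 and 1) share a base and have consecutive element offsets, so
// they are reported as clusterable; the load/store optimizer can later turn
// them into "ldp x1, x2, [x0]".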
2248 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
2249 unsigned Reg, unsigned SubIdx,
2250 unsigned State,
2251 const TargetRegisterInfo *TRI) {
2252 if (!SubIdx)
2253 return MIB.addReg(Reg, State);
2255 if (TargetRegisterInfo::isPhysicalRegister(Reg))
2256 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
2257 return MIB.addReg(Reg, State, SubIdx);
2260 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
2261 unsigned NumRegs) {
2262 // We really want the positive remainder mod 32 here, which happens to be
2263 // easily obtainable with a mask.
2264 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
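// Worked example: copying the triple d1_d2_d3 into d2_d3_d4 gives
// (DestEncoding - SrcEncoding) & 0x1f == 1, which is < 3, so a forward copy
// would overwrite d2 before it is read as a source; copyPhysRegTuple below
// therefore walks the sub-registers in reverse in that case.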
2267 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
2268 MachineBasicBlock::iterator I,
2269 const DebugLoc &DL, unsigned DestReg,
2270 unsigned SrcReg, bool KillSrc,
2271 unsigned Opcode,
2272 ArrayRef<unsigned> Indices) const {
2273 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
2274 const TargetRegisterInfo *TRI = &getRegisterInfo();
2275 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2276 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2277 unsigned NumRegs = Indices.size();
2279 int SubReg = 0, End = NumRegs, Incr = 1;
2280 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
2281 SubReg = NumRegs - 1;
2282 End = -1;
2283 Incr = -1;
2286 for (; SubReg != End; SubReg += Incr) {
2287 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2288 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2289 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
2290 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2294 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
2295 MachineBasicBlock::iterator I,
2296 DebugLoc DL, unsigned DestReg,
2297 unsigned SrcReg, bool KillSrc,
2298 unsigned Opcode, unsigned ZeroReg,
2299 llvm::ArrayRef<unsigned> Indices) const {
2300 const TargetRegisterInfo *TRI = &getRegisterInfo();
2301 unsigned NumRegs = Indices.size();
2303 #ifndef NDEBUG
2304 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2305 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2306 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
2307 "GPR reg sequences should not be able to overlap");
2308 #endif
2310 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
2311 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2312 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2313 MIB.addReg(ZeroReg);
2314 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2315 MIB.addImm(0);
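// Each iteration above emits "orr xDst, xzr, xSrc, lsl #0" (or the equivalent
// W-register form), i.e. a plain move of one half of the sequential pair; the
// halves can be copied in ascending order because, per the assert above, GPR
// pairs never partially overlap.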
2319 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
2320 MachineBasicBlock::iterator I,
2321 const DebugLoc &DL, unsigned DestReg,
2322 unsigned SrcReg, bool KillSrc) const {
2323 if (AArch64::GPR32spRegClass.contains(DestReg) &&
2324 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
2325 const TargetRegisterInfo *TRI = &getRegisterInfo();
2327 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
2328 // If either operand is WSP, expand to ADD #0.
2329 if (Subtarget.hasZeroCycleRegMove()) {
2330 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
2331 unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
2332 &AArch64::GPR64spRegClass);
2333 unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
2334 &AArch64::GPR64spRegClass);
2335 // This instruction is reading and writing X registers. This may upset
2336 // the register scavenger and machine verifier, so we need to indicate
2337 // that we are reading an undefined value from SrcRegX, but a proper
2338 // value from SrcReg.
2339 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
2340 .addReg(SrcRegX, RegState::Undef)
2341 .addImm(0)
2342 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
2343 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2344 } else {
2345 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
2346 .addReg(SrcReg, getKillRegState(KillSrc))
2347 .addImm(0)
2348 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2350 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
2351 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
2352 .addImm(0)
2353 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2354 } else {
2355 if (Subtarget.hasZeroCycleRegMove()) {
2356 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
2357 unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
2358 &AArch64::GPR64spRegClass);
2359 unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
2360 &AArch64::GPR64spRegClass);
2361 // This instruction is reading and writing X registers. This may upset
2362 // the register scavenger and machine verifier, so we need to indicate
2363 // that we are reading an undefined value from SrcRegX, but a proper
2364 // value from SrcReg.
2365 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
2366 .addReg(AArch64::XZR)
2367 .addReg(SrcRegX, RegState::Undef)
2368 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2369 } else {
2370 // Otherwise, expand to ORR WZR.
2371 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
2372 .addReg(AArch64::WZR)
2373 .addReg(SrcReg, getKillRegState(KillSrc));
2376 return;
2379 if (AArch64::GPR64spRegClass.contains(DestReg) &&
2380 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
2381 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
2382 // If either operand is SP, expand to ADD #0.
2383 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
2384 .addReg(SrcReg, getKillRegState(KillSrc))
2385 .addImm(0)
2386 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2387 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
2388 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
2389 .addImm(0)
2390 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2391 } else {
2392 // Otherwise, expand to ORR XZR.
2393 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
2394 .addReg(AArch64::XZR)
2395 .addReg(SrcReg, getKillRegState(KillSrc));
2397 return;
2400 // Copy a DDDD register quad by copying the individual sub-registers.
2401 if (AArch64::DDDDRegClass.contains(DestReg) &&
2402 AArch64::DDDDRegClass.contains(SrcReg)) {
2403 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2404 AArch64::dsub2, AArch64::dsub3};
2405 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2406 Indices);
2407 return;
2410 // Copy a DDD register triple by copying the individual sub-registers.
2411 if (AArch64::DDDRegClass.contains(DestReg) &&
2412 AArch64::DDDRegClass.contains(SrcReg)) {
2413 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2414 AArch64::dsub2};
2415 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2416 Indices);
2417 return;
2420 // Copy a DD register pair by copying the individual sub-registers.
2421 if (AArch64::DDRegClass.contains(DestReg) &&
2422 AArch64::DDRegClass.contains(SrcReg)) {
2423 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
2424 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2425 Indices);
2426 return;
2429 // Copy a QQQQ register quad by copying the individual sub-registers.
2430 if (AArch64::QQQQRegClass.contains(DestReg) &&
2431 AArch64::QQQQRegClass.contains(SrcReg)) {
2432 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2433 AArch64::qsub2, AArch64::qsub3};
2434 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2435 Indices);
2436 return;
2439 // Copy a QQQ register triple by copying the individual sub-registers.
2440 if (AArch64::QQQRegClass.contains(DestReg) &&
2441 AArch64::QQQRegClass.contains(SrcReg)) {
2442 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2443 AArch64::qsub2};
2444 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2445 Indices);
2446 return;
2449 // Copy a QQ register pair by copying the individual sub-registers.
2450 if (AArch64::QQRegClass.contains(DestReg) &&
2451 AArch64::QQRegClass.contains(SrcReg)) {
2452 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
2453 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2454 Indices);
2455 return;
2458 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
2459 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
2460 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
2461 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
2462 AArch64::XZR, Indices);
2463 return;
2466 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
2467 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
2468 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
2469 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
2470 AArch64::WZR, Indices);
2471 return;
2474 if (AArch64::FPR128RegClass.contains(DestReg) &&
2475 AArch64::FPR128RegClass.contains(SrcReg)) {
2476 if (Subtarget.hasNEON()) {
2477 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2478 .addReg(SrcReg)
2479 .addReg(SrcReg, getKillRegState(KillSrc));
2480 } else {
2481 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
2482 .addReg(AArch64::SP, RegState::Define)
2483 .addReg(SrcReg, getKillRegState(KillSrc))
2484 .addReg(AArch64::SP)
2485 .addImm(-16);
2486 BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
2487 .addReg(AArch64::SP, RegState::Define)
2488 .addReg(DestReg, RegState::Define)
2489 .addReg(AArch64::SP)
2490 .addImm(16);
2492 return;
2495 if (AArch64::FPR64RegClass.contains(DestReg) &&
2496 AArch64::FPR64RegClass.contains(SrcReg)) {
2497 if (Subtarget.hasNEON()) {
2498 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
2499 &AArch64::FPR128RegClass);
2500 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
2501 &AArch64::FPR128RegClass);
2502 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2503 .addReg(SrcReg)
2504 .addReg(SrcReg, getKillRegState(KillSrc));
2505 } else {
2506 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
2507 .addReg(SrcReg, getKillRegState(KillSrc));
2509 return;
2512 if (AArch64::FPR32RegClass.contains(DestReg) &&
2513 AArch64::FPR32RegClass.contains(SrcReg)) {
2514 if (Subtarget.hasNEON()) {
2515 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
2516 &AArch64::FPR128RegClass);
2517 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
2518 &AArch64::FPR128RegClass);
2519 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2520 .addReg(SrcReg)
2521 .addReg(SrcReg, getKillRegState(KillSrc));
2522 } else {
2523 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2524 .addReg(SrcReg, getKillRegState(KillSrc));
2526 return;
2529 if (AArch64::FPR16RegClass.contains(DestReg) &&
2530 AArch64::FPR16RegClass.contains(SrcReg)) {
2531 if (Subtarget.hasNEON()) {
2532 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2533 &AArch64::FPR128RegClass);
2534 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2535 &AArch64::FPR128RegClass);
2536 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2537 .addReg(SrcReg)
2538 .addReg(SrcReg, getKillRegState(KillSrc));
2539 } else {
2540 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2541 &AArch64::FPR32RegClass);
2542 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2543 &AArch64::FPR32RegClass);
2544 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2545 .addReg(SrcReg, getKillRegState(KillSrc));
2547 return;
2550 if (AArch64::FPR8RegClass.contains(DestReg) &&
2551 AArch64::FPR8RegClass.contains(SrcReg)) {
2552 if (Subtarget.hasNEON()) {
2553 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2554 &AArch64::FPR128RegClass);
2555 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2556 &AArch64::FPR128RegClass);
2557 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2558 .addReg(SrcReg)
2559 .addReg(SrcReg, getKillRegState(KillSrc));
2560 } else {
2561 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2562 &AArch64::FPR32RegClass);
2563 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2564 &AArch64::FPR32RegClass);
2565 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2566 .addReg(SrcReg, getKillRegState(KillSrc));
2568 return;
2571 // Copies between GPR64 and FPR64.
2572 if (AArch64::FPR64RegClass.contains(DestReg) &&
2573 AArch64::GPR64RegClass.contains(SrcReg)) {
2574 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
2575 .addReg(SrcReg, getKillRegState(KillSrc));
2576 return;
2578 if (AArch64::GPR64RegClass.contains(DestReg) &&
2579 AArch64::FPR64RegClass.contains(SrcReg)) {
2580 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
2581 .addReg(SrcReg, getKillRegState(KillSrc));
2582 return;
2584 // Copies between GPR32 and FPR32.
2585 if (AArch64::FPR32RegClass.contains(DestReg) &&
2586 AArch64::GPR32RegClass.contains(SrcReg)) {
2587 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
2588 .addReg(SrcReg, getKillRegState(KillSrc));
2589 return;
2591 if (AArch64::GPR32RegClass.contains(DestReg) &&
2592 AArch64::FPR32RegClass.contains(SrcReg)) {
2593 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
2594 .addReg(SrcReg, getKillRegState(KillSrc));
2595 return;
2598 if (DestReg == AArch64::NZCV) {
2599 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
2600 BuildMI(MBB, I, DL, get(AArch64::MSR))
2601 .addImm(AArch64SysReg::NZCV)
2602 .addReg(SrcReg, getKillRegState(KillSrc))
2603 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
2604 return;
2607 if (SrcReg == AArch64::NZCV) {
2608 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
2609 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
2610 .addImm(AArch64SysReg::NZCV)
2611 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
2612 return;
2615 llvm_unreachable("unimplemented reg-to-reg copy");
2618 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
2619 MachineBasicBlock &MBB,
2620 MachineBasicBlock::iterator InsertBefore,
2621 const MCInstrDesc &MCID,
2622 unsigned SrcReg, bool IsKill,
2623 unsigned SubIdx0, unsigned SubIdx1, int FI,
2624 MachineMemOperand *MMO) {
2625 unsigned SrcReg0 = SrcReg;
2626 unsigned SrcReg1 = SrcReg;
2627 if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
2628 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
2629 SubIdx0 = 0;
2630 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
2631 SubIdx1 = 0;
2633 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2634 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
2635 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
2636 .addFrameIndex(FI)
2637 .addImm(0)
2638 .addMemOperand(MMO);
2641 void AArch64InstrInfo::storeRegToStackSlot(
2642 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
2643 bool isKill, int FI, const TargetRegisterClass *RC,
2644 const TargetRegisterInfo *TRI) const {
2645 MachineFunction &MF = *MBB.getParent();
2646 MachineFrameInfo &MFI = MF.getFrameInfo();
2647 unsigned Align = MFI.getObjectAlignment(FI);
2649 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
2650 MachineMemOperand *MMO = MF.getMachineMemOperand(
2651 PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
2652 unsigned Opc = 0;
2653 bool Offset = true;
2654 switch (TRI->getSpillSize(*RC)) {
2655 case 1:
2656 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
2657 Opc = AArch64::STRBui;
2658 break;
2659 case 2:
2660 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
2661 Opc = AArch64::STRHui;
2662 break;
2663 case 4:
2664 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
2665 Opc = AArch64::STRWui;
2666 if (TargetRegisterInfo::isVirtualRegister(SrcReg))
2667 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
2668 else
2669 assert(SrcReg != AArch64::WSP);
2670 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
2671 Opc = AArch64::STRSui;
2672 break;
2673 case 8:
2674 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
2675 Opc = AArch64::STRXui;
2676 if (TargetRegisterInfo::isVirtualRegister(SrcReg))
2677 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
2678 else
2679 assert(SrcReg != AArch64::SP);
2680 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
2681 Opc = AArch64::STRDui;
2682 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
2683 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
2684 get(AArch64::STPWi), SrcReg, isKill,
2685 AArch64::sube32, AArch64::subo32, FI, MMO);
2686 return;
2688 break;
2689 case 16:
2690 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
2691 Opc = AArch64::STRQui;
2692 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
2693 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2694 Opc = AArch64::ST1Twov1d;
2695 Offset = false;
2696 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
2697 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
2698 get(AArch64::STPXi), SrcReg, isKill,
2699 AArch64::sube64, AArch64::subo64, FI, MMO);
2700 return;
2702 break;
2703 case 24:
2704 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
2705 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2706 Opc = AArch64::ST1Threev1d;
2707 Offset = false;
2709 break;
2710 case 32:
2711 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
2712 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2713 Opc = AArch64::ST1Fourv1d;
2714 Offset = false;
2715 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
2716 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2717 Opc = AArch64::ST1Twov2d;
2718 Offset = false;
2720 break;
2721 case 48:
2722 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
2723 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2724 Opc = AArch64::ST1Threev2d;
2725 Offset = false;
2727 break;
2728 case 64:
2729 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
2730 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2731 Opc = AArch64::ST1Fourv2d;
2732 Offset = false;
2734 break;
2736 assert(Opc && "Unknown register class");
2738 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
2739 .addReg(SrcReg, getKillRegState(isKill))
2740 .addFrameIndex(FI);
2742 if (Offset)
2743 MI.addImm(0);
2744 MI.addMemOperand(MMO);
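// Illustrative spills produced above: a GPR64 register spills as
//   STRXui %src, %stack.FI, 0
// an XSeqPairs register spills both halves at once via STPXi of its
// sube64/subo64 sub-registers, and the vector tuple classes fall back to ST1
// forms, which take no immediate offset (hence the Offset flag).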
2747 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
2748 MachineBasicBlock &MBB,
2749 MachineBasicBlock::iterator InsertBefore,
2750 const MCInstrDesc &MCID,
2751 unsigned DestReg, unsigned SubIdx0,
2752 unsigned SubIdx1, int FI,
2753 MachineMemOperand *MMO) {
2754 unsigned DestReg0 = DestReg;
2755 unsigned DestReg1 = DestReg;
2756 bool IsUndef = true;
2757 if (TargetRegisterInfo::isPhysicalRegister(DestReg)) {
2758 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
2759 SubIdx0 = 0;
2760 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
2761 SubIdx1 = 0;
2762 IsUndef = false;
2764 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2765 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
2766 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
2767 .addFrameIndex(FI)
2768 .addImm(0)
2769 .addMemOperand(MMO);
2772 void AArch64InstrInfo::loadRegFromStackSlot(
2773 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
2774 int FI, const TargetRegisterClass *RC,
2775 const TargetRegisterInfo *TRI) const {
2776 MachineFunction &MF = *MBB.getParent();
2777 MachineFrameInfo &MFI = MF.getFrameInfo();
2778 unsigned Align = MFI.getObjectAlignment(FI);
2779 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
2780 MachineMemOperand *MMO = MF.getMachineMemOperand(
2781 PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
2783 unsigned Opc = 0;
2784 bool Offset = true;
2785 switch (TRI->getSpillSize(*RC)) {
2786 case 1:
2787 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
2788 Opc = AArch64::LDRBui;
2789 break;
2790 case 2:
2791 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
2792 Opc = AArch64::LDRHui;
2793 break;
2794 case 4:
2795 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
2796 Opc = AArch64::LDRWui;
2797 if (TargetRegisterInfo::isVirtualRegister(DestReg))
2798 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
2799 else
2800 assert(DestReg != AArch64::WSP);
2801 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
2802 Opc = AArch64::LDRSui;
2803 break;
2804 case 8:
2805 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
2806 Opc = AArch64::LDRXui;
2807 if (TargetRegisterInfo::isVirtualRegister(DestReg))
2808 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
2809 else
2810 assert(DestReg != AArch64::SP);
2811 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
2812 Opc = AArch64::LDRDui;
2813 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
2814 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
2815 get(AArch64::LDPWi), DestReg, AArch64::sube32,
2816 AArch64::subo32, FI, MMO);
2817 return;
2819 break;
2820 case 16:
2821 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
2822 Opc = AArch64::LDRQui;
2823 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
2824 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2825 Opc = AArch64::LD1Twov1d;
2826 Offset = false;
2827 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
2828 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
2829 get(AArch64::LDPXi), DestReg, AArch64::sube64,
2830 AArch64::subo64, FI, MMO);
2831 return;
2833 break;
2834 case 24:
2835 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
2836 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2837 Opc = AArch64::LD1Threev1d;
2838 Offset = false;
2840 break;
2841 case 32:
2842 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
2843 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2844 Opc = AArch64::LD1Fourv1d;
2845 Offset = false;
2846 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
2847 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2848 Opc = AArch64::LD1Twov2d;
2849 Offset = false;
2851 break;
2852 case 48:
2853 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
2854 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2855 Opc = AArch64::LD1Threev2d;
2856 Offset = false;
2858 break;
2859 case 64:
2860 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
2861 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2862 Opc = AArch64::LD1Fourv2d;
2863 Offset = false;
2865 break;
2867 assert(Opc && "Unknown register class");
2869 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
2870 .addReg(DestReg, getDefRegState(true))
2871 .addFrameIndex(FI);
2872 if (Offset)
2873 MI.addImm(0);
2874 MI.addMemOperand(MMO);
2877 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
2878 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
2879 unsigned DestReg, unsigned SrcReg, int Offset,
2880 const TargetInstrInfo *TII,
2881 MachineInstr::MIFlag Flag, bool SetNZCV,
2882 bool NeedsWinCFI) {
2883 if (DestReg == SrcReg && Offset == 0)
2884 return;
2886 assert((DestReg != AArch64::SP || Offset % 16 == 0) &&
2887 "SP increment/decrement not 16-byte aligned");
2889 bool isSub = Offset < 0;
2890 if (isSub)
2891 Offset = -Offset;
2893 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
2894 // scratch register. If DestReg is a virtual register, use it as the
2895 // scratch register; otherwise, create a new virtual register (to be
2896 // replaced by the scavenger at the end of PEI). That case can be optimized
2897 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
2898 // register can be loaded with offset%8 and the add/sub can use an extending
2899 // instruction with LSL#3.
2900 // Currently the function handles any offsets but generates a poor sequence
2901 // of code.
2902 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
2904 unsigned Opc;
2905 if (SetNZCV)
2906 Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri;
2907 else
2908 Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri;
2909 const unsigned MaxEncoding = 0xfff;
2910 const unsigned ShiftSize = 12;
2911 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
2912 while (((unsigned)Offset) >= (1 << ShiftSize)) {
2913 unsigned ThisVal;
2914 if (((unsigned)Offset) > MaxEncodableValue) {
2915 ThisVal = MaxEncodableValue;
2916 } else {
2917 ThisVal = Offset & MaxEncodableValue;
2919 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
2920 "Encoding cannot handle value that big");
2921 BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
2922 .addReg(SrcReg)
2923 .addImm(ThisVal >> ShiftSize)
2924 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
2925 .setMIFlag(Flag);
2927 if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP)
2928 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
2929 .addImm(ThisVal)
2930 .setMIFlag(Flag);
2932 SrcReg = DestReg;
2933 Offset -= ThisVal;
2934 if (Offset == 0)
2935 return;
2937 BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
2938 .addReg(SrcReg)
2939 .addImm(Offset)
2940 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
2941 .setMIFlag(Flag);
2943 if (NeedsWinCFI) {
2944 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
2945 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
2946 if (Offset == 0)
2947 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).
2948 setMIFlag(Flag);
2949 else
2950 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)).
2951 addImm(Offset).setMIFlag(Flag);
2952 } else if (DestReg == AArch64::SP) {
2953 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)).
2954 addImm(Offset).setMIFlag(Flag);
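// Worked example of the splitting loop above (WinCFI directives aside):
// decrementing SP by 8200 bytes is emitted as
//   sub sp, sp, #2, lsl #12   ; 8192
//   sub sp, sp, #8
// because each ADD/SUB immediate is limited to 12 bits, optionally shifted
// left by 12.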
2959 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
2960 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
2961 MachineBasicBlock::iterator InsertPt, int FrameIndex,
2962 LiveIntervals *LIS) const {
2963 // This is a bit of a hack. Consider this instruction:
2965 // %0 = COPY %sp; GPR64all:%0
2967 // We explicitly chose GPR64all for the virtual register so such a copy might
2968 // be eliminated by RegisterCoalescer. However, that may not be possible, and
2969 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
2970 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
2972 // To prevent that, we are going to constrain the %0 register class here.
2974 // <rdar://problem/11522048>
2976 if (MI.isFullCopy()) {
2977 unsigned DstReg = MI.getOperand(0).getReg();
2978 unsigned SrcReg = MI.getOperand(1).getReg();
2979 if (SrcReg == AArch64::SP &&
2980 TargetRegisterInfo::isVirtualRegister(DstReg)) {
2981 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
2982 return nullptr;
2984 if (DstReg == AArch64::SP &&
2985 TargetRegisterInfo::isVirtualRegister(SrcReg)) {
2986 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
2987 return nullptr;
2991 // Handle the case where a copy is being spilled or filled but the source
2992 // and destination register class don't match. For example:
2994 // %0 = COPY %xzr; GPR64common:%0
2996 // In this case we can still safely fold away the COPY and generate the
2997 // following spill code:
2999 // STRXui %xzr, %stack.0
3001 // This also eliminates spilled cross register class COPYs (e.g. between x and
3002 // d regs) of the same size. For example:
3004 // %0 = COPY %1; GPR64:%0, FPR64:%1
3006 // will be filled as
3008 // LDRDui %0, fi<#0>
3010 // instead of
3012 // LDRXui %Temp, fi<#0>
3013 // %0 = FMOV %Temp
3015 if (MI.isCopy() && Ops.size() == 1 &&
3016 // Make sure we're only folding the explicit COPY defs/uses.
3017 (Ops[0] == 0 || Ops[0] == 1)) {
3018 bool IsSpill = Ops[0] == 0;
3019 bool IsFill = !IsSpill;
3020 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
3021 const MachineRegisterInfo &MRI = MF.getRegInfo();
3022 MachineBasicBlock &MBB = *MI.getParent();
3023 const MachineOperand &DstMO = MI.getOperand(0);
3024 const MachineOperand &SrcMO = MI.getOperand(1);
3025 unsigned DstReg = DstMO.getReg();
3026 unsigned SrcReg = SrcMO.getReg();
3027 // This is slightly expensive to compute for physical regs since
3028 // getMinimalPhysRegClass is slow.
3029 auto getRegClass = [&](unsigned Reg) {
3030 return TargetRegisterInfo::isVirtualRegister(Reg)
3031 ? MRI.getRegClass(Reg)
3032 : TRI.getMinimalPhysRegClass(Reg);
3035 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
3036 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
3037 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
3038 "Mismatched register size in non subreg COPY");
3039 if (IsSpill)
3040 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
3041 getRegClass(SrcReg), &TRI);
3042 else
3043 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
3044 getRegClass(DstReg), &TRI);
3045 return &*--InsertPt;
3048 // Handle cases like spilling def of:
3050 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
3052 // where the physical register source can be widened and stored to the full
3053 // virtual reg destination stack slot, in this case producing:
3055 // STRXui %xzr, %stack.0
3057 if (IsSpill && DstMO.isUndef() &&
3058 TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
3059 assert(SrcMO.getSubReg() == 0 &&
3060 "Unexpected subreg on physical register");
3061 const TargetRegisterClass *SpillRC;
3062 unsigned SpillSubreg;
3063 switch (DstMO.getSubReg()) {
3064 default:
3065 SpillRC = nullptr;
3066 break;
3067 case AArch64::sub_32:
3068 case AArch64::ssub:
3069 if (AArch64::GPR32RegClass.contains(SrcReg)) {
3070 SpillRC = &AArch64::GPR64RegClass;
3071 SpillSubreg = AArch64::sub_32;
3072 } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
3073 SpillRC = &AArch64::FPR64RegClass;
3074 SpillSubreg = AArch64::ssub;
3075 } else
3076 SpillRC = nullptr;
3077 break;
3078 case AArch64::dsub:
3079 if (AArch64::FPR64RegClass.contains(SrcReg)) {
3080 SpillRC = &AArch64::FPR128RegClass;
3081 SpillSubreg = AArch64::dsub;
3082 } else
3083 SpillRC = nullptr;
3084 break;
3087 if (SpillRC)
3088 if (unsigned WidenedSrcReg =
3089 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
3090 storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
3091 FrameIndex, SpillRC, &TRI);
3092 return &*--InsertPt;
3096 // Handle cases like filling use of:
3098 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
3100 // where we can load the full virtual reg source stack slot, into the subreg
3101 // destination, in this case producing:
3103 // LDRWui %0:sub_32<def,read-undef>, %stack.0
3105 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
3106 const TargetRegisterClass *FillRC;
3107 switch (DstMO.getSubReg()) {
3108 default:
3109 FillRC = nullptr;
3110 break;
3111 case AArch64::sub_32:
3112 FillRC = &AArch64::GPR32RegClass;
3113 break;
3114 case AArch64::ssub:
3115 FillRC = &AArch64::FPR32RegClass;
3116 break;
3117 case AArch64::dsub:
3118 FillRC = &AArch64::FPR64RegClass;
3119 break;
3122 if (FillRC) {
3123 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
3124 TRI.getRegSizeInBits(*FillRC) &&
3125 "Mismatched regclass size on folded subreg COPY");
3126 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
3127 MachineInstr &LoadMI = *--InsertPt;
3128 MachineOperand &LoadDst = LoadMI.getOperand(0);
3129 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
3130 LoadDst.setSubReg(DstMO.getSubReg());
3131 LoadDst.setIsUndef();
3132 return &LoadMI;
3137 // Cannot fold.
3138 return nullptr;
3141 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
3142 bool *OutUseUnscaledOp,
3143 unsigned *OutUnscaledOp,
3144 int *EmittableOffset) {
3145 int Scale = 1;
3146 bool IsSigned = false;
3147 // ImmIdx defaults to 2 and is overridden below for the opcodes where it is not.
3148 unsigned ImmIdx = 2;
3149 unsigned UnscaledOp = 0;
3150 // Set output values in case of early exit.
3151 if (EmittableOffset)
3152 *EmittableOffset = 0;
3153 if (OutUseUnscaledOp)
3154 *OutUseUnscaledOp = false;
3155 if (OutUnscaledOp)
3156 *OutUnscaledOp = 0;
3157 switch (MI.getOpcode()) {
3158 default:
3159 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3160 // Vector spills/fills can't take an immediate offset.
3161 case AArch64::LD1Twov2d:
3162 case AArch64::LD1Threev2d:
3163 case AArch64::LD1Fourv2d:
3164 case AArch64::LD1Twov1d:
3165 case AArch64::LD1Threev1d:
3166 case AArch64::LD1Fourv1d:
3167 case AArch64::ST1Twov2d:
3168 case AArch64::ST1Threev2d:
3169 case AArch64::ST1Fourv2d:
3170 case AArch64::ST1Twov1d:
3171 case AArch64::ST1Threev1d:
3172 case AArch64::ST1Fourv1d:
3173 return AArch64FrameOffsetCannotUpdate;
3174 case AArch64::PRFMui:
3175 Scale = 8;
3176 UnscaledOp = AArch64::PRFUMi;
3177 break;
3178 case AArch64::LDRXui:
3179 Scale = 8;
3180 UnscaledOp = AArch64::LDURXi;
3181 break;
3182 case AArch64::LDRWui:
3183 Scale = 4;
3184 UnscaledOp = AArch64::LDURWi;
3185 break;
3186 case AArch64::LDRBui:
3187 Scale = 1;
3188 UnscaledOp = AArch64::LDURBi;
3189 break;
3190 case AArch64::LDRHui:
3191 Scale = 2;
3192 UnscaledOp = AArch64::LDURHi;
3193 break;
3194 case AArch64::LDRSui:
3195 Scale = 4;
3196 UnscaledOp = AArch64::LDURSi;
3197 break;
3198 case AArch64::LDRDui:
3199 Scale = 8;
3200 UnscaledOp = AArch64::LDURDi;
3201 break;
3202 case AArch64::LDRQui:
3203 Scale = 16;
3204 UnscaledOp = AArch64::LDURQi;
3205 break;
3206 case AArch64::LDRBBui:
3207 Scale = 1;
3208 UnscaledOp = AArch64::LDURBBi;
3209 break;
3210 case AArch64::LDRHHui:
3211 Scale = 2;
3212 UnscaledOp = AArch64::LDURHHi;
3213 break;
3214 case AArch64::LDRSBXui:
3215 Scale = 1;
3216 UnscaledOp = AArch64::LDURSBXi;
3217 break;
3218 case AArch64::LDRSBWui:
3219 Scale = 1;
3220 UnscaledOp = AArch64::LDURSBWi;
3221 break;
3222 case AArch64::LDRSHXui:
3223 Scale = 2;
3224 UnscaledOp = AArch64::LDURSHXi;
3225 break;
3226 case AArch64::LDRSHWui:
3227 Scale = 2;
3228 UnscaledOp = AArch64::LDURSHWi;
3229 break;
3230 case AArch64::LDRSWui:
3231 Scale = 4;
3232 UnscaledOp = AArch64::LDURSWi;
3233 break;
3235 case AArch64::STRXui:
3236 Scale = 8;
3237 UnscaledOp = AArch64::STURXi;
3238 break;
3239 case AArch64::STRWui:
3240 Scale = 4;
3241 UnscaledOp = AArch64::STURWi;
3242 break;
3243 case AArch64::STRBui:
3244 Scale = 1;
3245 UnscaledOp = AArch64::STURBi;
3246 break;
3247 case AArch64::STRHui:
3248 Scale = 2;
3249 UnscaledOp = AArch64::STURHi;
3250 break;
3251 case AArch64::STRSui:
3252 Scale = 4;
3253 UnscaledOp = AArch64::STURSi;
3254 break;
3255 case AArch64::STRDui:
3256 Scale = 8;
3257 UnscaledOp = AArch64::STURDi;
3258 break;
3259 case AArch64::STRQui:
3260 Scale = 16;
3261 UnscaledOp = AArch64::STURQi;
3262 break;
3263 case AArch64::STRBBui:
3264 Scale = 1;
3265 UnscaledOp = AArch64::STURBBi;
3266 break;
3267 case AArch64::STRHHui:
3268 Scale = 2;
3269 UnscaledOp = AArch64::STURHHi;
3270 break;
3272 case AArch64::LDPXi:
3273 case AArch64::LDPDi:
3274 case AArch64::STPXi:
3275 case AArch64::STPDi:
3276 case AArch64::LDNPXi:
3277 case AArch64::LDNPDi:
3278 case AArch64::STNPXi:
3279 case AArch64::STNPDi:
3280 ImmIdx = 3;
3281 IsSigned = true;
3282 Scale = 8;
3283 break;
3284 case AArch64::LDPQi:
3285 case AArch64::STPQi:
3286 case AArch64::LDNPQi:
3287 case AArch64::STNPQi:
3288 ImmIdx = 3;
3289 IsSigned = true;
3290 Scale = 16;
3291 break;
3292 case AArch64::LDPWi:
3293 case AArch64::LDPSi:
3294 case AArch64::STPWi:
3295 case AArch64::STPSi:
3296 case AArch64::LDNPWi:
3297 case AArch64::LDNPSi:
3298 case AArch64::STNPWi:
3299 case AArch64::STNPSi:
3300 ImmIdx = 3;
3301 IsSigned = true;
3302 Scale = 4;
3303 break;
3305 case AArch64::LDURXi:
3306 case AArch64::LDURWi:
3307 case AArch64::LDURBi:
3308 case AArch64::LDURHi:
3309 case AArch64::LDURSi:
3310 case AArch64::LDURDi:
3311 case AArch64::LDURQi:
3312 case AArch64::LDURHHi:
3313 case AArch64::LDURBBi:
3314 case AArch64::LDURSBXi:
3315 case AArch64::LDURSBWi:
3316 case AArch64::LDURSHXi:
3317 case AArch64::LDURSHWi:
3318 case AArch64::LDURSWi:
3319 case AArch64::STURXi:
3320 case AArch64::STURWi:
3321 case AArch64::STURBi:
3322 case AArch64::STURHi:
3323 case AArch64::STURSi:
3324 case AArch64::STURDi:
3325 case AArch64::STURQi:
3326 case AArch64::STURBBi:
3327 case AArch64::STURHHi:
3328 Scale = 1;
3329 break;
3332 Offset += MI.getOperand(ImmIdx).getImm() * Scale;
3334 bool useUnscaledOp = false;
3335 // If the offset is not a multiple of the scale, rewrite the instruction to
3336 // use the unscaled variant instead. Likewise if the offset is negative and
3337 // an unscaled op is available.
3338 if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0))
3339 useUnscaledOp = true;
3341 // Use an unscaled addressing mode if the instruction has a negative offset
3342 // (or if the instruction is already using an unscaled addressing mode).
3343 unsigned MaskBits;
3344 if (IsSigned) {
3345 // ldp/stp instructions.
3346 MaskBits = 7;
3347 Offset /= Scale;
3348 } else if (UnscaledOp == 0 || useUnscaledOp) {
3349 MaskBits = 9;
3350 IsSigned = true;
3351 Scale = 1;
3352 } else {
3353 MaskBits = 12;
3354 IsSigned = false;
3355 Offset /= Scale;
3358 // Attempt to fold address computation.
3359 int MaxOff = (1 << (MaskBits - IsSigned)) - 1;
3360 int MinOff = (IsSigned ? (-MaxOff - 1) : 0);
3361 if (Offset >= MinOff && Offset <= MaxOff) {
3362 if (EmittableOffset)
3363 *EmittableOffset = Offset;
3364 Offset = 0;
3365 } else {
3366 int NewOff = Offset < 0 ? MinOff : MaxOff;
3367 if (EmittableOffset)
3368 *EmittableOffset = NewOff;
3369 Offset = (Offset - NewOff) * Scale;
3371 if (OutUseUnscaledOp)
3372 *OutUseUnscaledOp = useUnscaledOp;
3373 if (OutUnscaledOp)
3374 *OutUnscaledOp = UnscaledOp;
3375 return AArch64FrameOffsetCanUpdate |
3376 (Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
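// Worked example: an LDRXui whose frame offset resolves to -16 cannot use the
// scaled (unsigned) form, so useUnscaledOp selects LDURXi; -16 fits the signed
// 9-bit range, *EmittableOffset becomes -16, and the result is
// CanUpdate | IsLegal. For an offset of 40 the scaled form is kept, since
// 40 / 8 = 5 fits the unsigned 12-bit field.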
3379 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
3380 unsigned FrameReg, int &Offset,
3381 const AArch64InstrInfo *TII) {
3382 unsigned Opcode = MI.getOpcode();
3383 unsigned ImmIdx = FrameRegIdx + 1;
3385 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
3386 Offset += MI.getOperand(ImmIdx).getImm();
3387 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
3388 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
3389 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
3390 MI.eraseFromParent();
3391 Offset = 0;
3392 return true;
3395 int NewOffset;
3396 unsigned UnscaledOp;
3397 bool UseUnscaledOp;
3398 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
3399 &UnscaledOp, &NewOffset);
3400 if (Status & AArch64FrameOffsetCanUpdate) {
3401 if (Status & AArch64FrameOffsetIsLegal)
3402 // Replace the FrameIndex with FrameReg.
3403 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
3404 if (UseUnscaledOp)
3405 MI.setDesc(TII->get(UnscaledOp));
3407 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
3408 return Offset == 0;
3411 return false;
3414 void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
3415 NopInst.setOpcode(AArch64::HINT);
3416 NopInst.addOperand(MCOperand::createImm(0));
3419 // AArch64 supports MachineCombiner.
3420 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
3422 // True when Opc sets the NZCV flags.
3423 static bool isCombineInstrSettingFlag(unsigned Opc) {
3424 switch (Opc) {
3425 case AArch64::ADDSWrr:
3426 case AArch64::ADDSWri:
3427 case AArch64::ADDSXrr:
3428 case AArch64::ADDSXri:
3429 case AArch64::SUBSWrr:
3430 case AArch64::SUBSXrr:
3431 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3432 case AArch64::SUBSWri:
3433 case AArch64::SUBSXri:
3434 return true;
3435 default:
3436 break;
3438 return false;
3441 // 32b Opcodes that can be combined with a MUL
3442 static bool isCombineInstrCandidate32(unsigned Opc) {
3443 switch (Opc) {
3444 case AArch64::ADDWrr:
3445 case AArch64::ADDWri:
3446 case AArch64::SUBWrr:
3447 case AArch64::ADDSWrr:
3448 case AArch64::ADDSWri:
3449 case AArch64::SUBSWrr:
3450 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3451 case AArch64::SUBWri:
3452 case AArch64::SUBSWri:
3453 return true;
3454 default:
3455 break;
3457 return false;
3460 // 64b Opcodes that can be combined with a MUL
3461 static bool isCombineInstrCandidate64(unsigned Opc) {
3462 switch (Opc) {
3463 case AArch64::ADDXrr:
3464 case AArch64::ADDXri:
3465 case AArch64::SUBXrr:
3466 case AArch64::ADDSXrr:
3467 case AArch64::ADDSXri:
3468 case AArch64::SUBSXrr:
3469 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3470 case AArch64::SUBXri:
3471 case AArch64::SUBSXri:
3472 return true;
3473 default:
3474 break;
3476 return false;
3479 // FP Opcodes that can be combined with a FMUL
3480 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
3481 switch (Inst.getOpcode()) {
3482 default:
3483 break;
3484 case AArch64::FADDSrr:
3485 case AArch64::FADDDrr:
3486 case AArch64::FADDv2f32:
3487 case AArch64::FADDv2f64:
3488 case AArch64::FADDv4f32:
3489 case AArch64::FSUBSrr:
3490 case AArch64::FSUBDrr:
3491 case AArch64::FSUBv2f32:
3492 case AArch64::FSUBv2f64:
3493 case AArch64::FSUBv4f32:
3494 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
3495 return (Options.UnsafeFPMath ||
3496 Options.AllowFPOpFusion == FPOpFusion::Fast);
3498 return false;
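// The guard above reflects that folding an FMUL into an FMADD/FMSUB performs
// a single rounding instead of two, so the value-changing contraction is only
// allowed under unsafe FP math or fast FP-op fusion.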
3501 // Opcodes that can be combined with a MUL
3502 static bool isCombineInstrCandidate(unsigned Opc) {
3503 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
3507 // Utility routine that checks if \param MO is defined by an
3508 // \param CombineOpc instruction in the basic block \param MBB
3509 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
3510 unsigned CombineOpc, unsigned ZeroReg = 0,
3511 bool CheckZeroReg = false) {
3512 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3513 MachineInstr *MI = nullptr;
3515 if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
3516 MI = MRI.getUniqueVRegDef(MO.getReg());
3517 // And it needs to be in the trace (otherwise, it won't have a depth).
3518 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
3519 return false;
3520 // It must only be used by the user we combine with.
3521 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
3522 return false;
3524 if (CheckZeroReg) {
3525 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
3526 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
3527 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
3528 // The third input reg must be zero.
3529 if (MI->getOperand(3).getReg() != ZeroReg)
3530 return false;
3533 return true;
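// Note on CheckZeroReg: an integer MUL is represented as MADD with the zero
// register as the addend (e.g. MADDWrrr with WZR in operand 3), so requiring
// operand 3 to equal ZeroReg ensures only plain multiplies are folded into a
// new multiply-accumulate.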
3537 // Is \param MO defined by an integer multiply and can be combined?
3538 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3539 unsigned MulOpc, unsigned ZeroReg) {
3540 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
3544 // Is \param MO defined by a floating-point multiply and can be combined?
3545 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3546 unsigned MulOpc) {
3547 return canCombine(MBB, MO, MulOpc);
3550 // TODO: There are many more machine instruction opcodes to match:
3551 // 1. Other data types (integer, vectors)
3552 // 2. Other math / logic operations (xor, or)
3553 // 3. Other forms of the same operation (intrinsics and other variants)
3554 bool AArch64InstrInfo::isAssociativeAndCommutative(
3555 const MachineInstr &Inst) const {
3556 switch (Inst.getOpcode()) {
3557 case AArch64::FADDDrr:
3558 case AArch64::FADDSrr:
3559 case AArch64::FADDv2f32:
3560 case AArch64::FADDv2f64:
3561 case AArch64::FADDv4f32:
3562 case AArch64::FMULDrr:
3563 case AArch64::FMULSrr:
3564 case AArch64::FMULX32:
3565 case AArch64::FMULX64:
3566 case AArch64::FMULXv2f32:
3567 case AArch64::FMULXv2f64:
3568 case AArch64::FMULXv4f32:
3569 case AArch64::FMULv2f32:
3570 case AArch64::FMULv2f64:
3571 case AArch64::FMULv4f32:
3572 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
3573 default:
3574 return false;
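// When this returns true the generic MachineCombiner may reassociate a chain
// to shorten its critical path, e.g. (illustrative)
//   ((a fadd b) fadd c) fadd d  ==>  (a fadd b) fadd (c fadd d)
// For the FP opcodes above this is only legal under UnsafeFPMath, hence the
// TargetOptions check.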
3578 /// Find instructions that can be turned into madd.
3579 static bool getMaddPatterns(MachineInstr &Root,
3580 SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3581 unsigned Opc = Root.getOpcode();
3582 MachineBasicBlock &MBB = *Root.getParent();
3583 bool Found = false;
3585 if (!isCombineInstrCandidate(Opc))
3586 return false;
3587 if (isCombineInstrSettingFlag(Opc)) {
3588 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
3589 // When NZCV is live bail out.
3590 if (Cmp_NZCV == -1)
3591 return false;
3592 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
3593 // When opcode can't change bail out.
3594 // CHECKME: do we miss any cases for opcode conversion?
3595 if (NewOpc == Opc)
3596 return false;
3597 Opc = NewOpc;
3600 switch (Opc) {
3601 default:
3602 break;
3603 case AArch64::ADDWrr:
3604 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3605 "ADDWrr does not have register operands");
3606 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3607 AArch64::WZR)) {
3608 Patterns.push_back(MachineCombinerPattern::MULADDW_OP1);
3609 Found = true;
3611 if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
3612 AArch64::WZR)) {
3613 Patterns.push_back(MachineCombinerPattern::MULADDW_OP2);
3614 Found = true;
3616 break;
3617 case AArch64::ADDXrr:
3618 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3619 AArch64::XZR)) {
3620 Patterns.push_back(MachineCombinerPattern::MULADDX_OP1);
3621 Found = true;
3623 if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
3624 AArch64::XZR)) {
3625 Patterns.push_back(MachineCombinerPattern::MULADDX_OP2);
3626 Found = true;
3628 break;
3629 case AArch64::SUBWrr:
3630 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3631 AArch64::WZR)) {
3632 Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1);
3633 Found = true;
3635 if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
3636 AArch64::WZR)) {
3637 Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2);
3638 Found = true;
3640 break;
3641 case AArch64::SUBXrr:
3642 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3643 AArch64::XZR)) {
3644 Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1);
3645 Found = true;
3647 if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
3648 AArch64::XZR)) {
3649 Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2);
3650 Found = true;
3652 break;
3653 case AArch64::ADDWri:
3654 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3655 AArch64::WZR)) {
3656 Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1);
3657 Found = true;
3659 break;
3660 case AArch64::ADDXri:
3661 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3662 AArch64::XZR)) {
3663 Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1);
3664 Found = true;
3666 break;
3667 case AArch64::SUBWri:
3668 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3669 AArch64::WZR)) {
3670 Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1);
3671 Found = true;
3673 break;
3674 case AArch64::SUBXri:
3675 if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3676 AArch64::XZR)) {
3677 Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1);
3678 Found = true;
3680 break;
3682 return Found;
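// The _OP1/_OP2 suffixes record which add/sub operand the multiply feeds;
// e.g. (illustrative) "%r = ADDWrr %mul, %c" yields MULADDW_OP1 while
// "%r = ADDWrr %c, %mul" yields MULADDW_OP2. genAlternativeCodeSequence()
// below uses the suffix to pick which operand becomes the MADD accumulator.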
3684 /// Floating-Point Support
3686 /// Find instructions that can be turned into madd.
3687 static bool getFMAPatterns(MachineInstr &Root,
3688 SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3690 if (!isCombineInstrCandidateFP(Root))
3691 return false;
3693 MachineBasicBlock &MBB = *Root.getParent();
3694 bool Found = false;
3696 switch (Root.getOpcode()) {
3697 default:
3698 assert(false && "Unsupported FP instruction in combiner\n");
3699 break;
3700 case AArch64::FADDSrr:
3701 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3702 "FADDWrr does not have register operands");
3703 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3704 Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
3705 Found = true;
3706 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3707 AArch64::FMULv1i32_indexed)) {
3708 Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
3709 Found = true;
3711 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3712 Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
3713 Found = true;
3714 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3715 AArch64::FMULv1i32_indexed)) {
3716 Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
3717 Found = true;
3719 break;
3720 case AArch64::FADDDrr:
3721 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3722 Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
3723 Found = true;
3724 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3725 AArch64::FMULv1i64_indexed)) {
3726 Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
3727 Found = true;
3729 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3730 Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
3731 Found = true;
3732 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3733 AArch64::FMULv1i64_indexed)) {
3734 Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
3735 Found = true;
3737 break;
3738 case AArch64::FADDv2f32:
3739 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3740 AArch64::FMULv2i32_indexed)) {
3741 Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
3742 Found = true;
3743 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3744 AArch64::FMULv2f32)) {
3745 Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
3746 Found = true;
3748 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3749 AArch64::FMULv2i32_indexed)) {
3750 Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
3751 Found = true;
3752 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3753 AArch64::FMULv2f32)) {
3754 Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
3755 Found = true;
3757 break;
3758 case AArch64::FADDv2f64:
3759 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3760 AArch64::FMULv2i64_indexed)) {
3761 Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
3762 Found = true;
3763 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3764 AArch64::FMULv2f64)) {
3765 Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
3766 Found = true;
3768 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3769 AArch64::FMULv2i64_indexed)) {
3770 Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
3771 Found = true;
3772 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3773 AArch64::FMULv2f64)) {
3774 Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
3775 Found = true;
3777 break;
3778 case AArch64::FADDv4f32:
3779 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3780 AArch64::FMULv4i32_indexed)) {
3781 Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
3782 Found = true;
3783 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3784 AArch64::FMULv4f32)) {
3785 Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
3786 Found = true;
3788 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3789 AArch64::FMULv4i32_indexed)) {
3790 Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
3791 Found = true;
3792 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3793 AArch64::FMULv4f32)) {
3794 Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
3795 Found = true;
3797 break;
3799 case AArch64::FSUBSrr:
3800 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3801 Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
3802 Found = true;
3804 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3805 Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
3806 Found = true;
3807 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3808 AArch64::FMULv1i32_indexed)) {
3809 Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
3810 Found = true;
3812 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
3813 Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1);
3814 Found = true;
3816 break;
3817 case AArch64::FSUBDrr:
3818 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3819 Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
3820 Found = true;
3822 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3823 Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
3824 Found = true;
3825 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3826 AArch64::FMULv1i64_indexed)) {
3827 Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
3828 Found = true;
3830 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
3831 Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);
3832 Found = true;
3834 break;
3835 case AArch64::FSUBv2f32:
3836 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3837 AArch64::FMULv2i32_indexed)) {
3838 Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
3839 Found = true;
3840 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3841 AArch64::FMULv2f32)) {
3842 Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
3843 Found = true;
3845 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3846 AArch64::FMULv2i32_indexed)) {
3847 Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1);
3848 Found = true;
3849 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3850 AArch64::FMULv2f32)) {
3851 Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1);
3852 Found = true;
3854 break;
3855 case AArch64::FSUBv2f64:
3856 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3857 AArch64::FMULv2i64_indexed)) {
3858 Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
3859 Found = true;
3860 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3861 AArch64::FMULv2f64)) {
3862 Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
3863 Found = true;
3865 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3866 AArch64::FMULv2i64_indexed)) {
3867 Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP1);
3868 Found = true;
3869 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3870 AArch64::FMULv2f64)) {
3871 Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP1);
3872 Found = true;
3874 break;
3875 case AArch64::FSUBv4f32:
3876 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3877 AArch64::FMULv4i32_indexed)) {
3878 Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
3879 Found = true;
3880 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3881 AArch64::FMULv4f32)) {
3882 Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
3883 Found = true;
3885 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3886 AArch64::FMULv4i32_indexed)) {
3887 Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP1);
3888 Found = true;
3889 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3890 AArch64::FMULv4f32)) {
3891 Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP1);
3892 Found = true;
3894 break;
3896 return Found;
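// Worked example (illustrative registers): for a v2f32 sequence
//   %m = FMULv2f32 %a, %b
//   %r = FADDv2f32 %c, killed %m
// the FADDv2f32 case records FMLAv2f32_OP2, which is later rewritten by
// genAlternativeCodeSequence() into the single accumulator-form instruction
//   %r = FMLAv2f32 %c, %a, %b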
3899 /// Return true when a code sequence can improve throughput. It
3900 /// should be called only for instructions in loops.
3901 /// \param Pattern - combiner pattern
3902 bool AArch64InstrInfo::isThroughputPattern(
3903 MachineCombinerPattern Pattern) const {
3904 switch (Pattern) {
3905 default:
3906 break;
3907 case MachineCombinerPattern::FMULADDS_OP1:
3908 case MachineCombinerPattern::FMULADDS_OP2:
3909 case MachineCombinerPattern::FMULSUBS_OP1:
3910 case MachineCombinerPattern::FMULSUBS_OP2:
3911 case MachineCombinerPattern::FMULADDD_OP1:
3912 case MachineCombinerPattern::FMULADDD_OP2:
3913 case MachineCombinerPattern::FMULSUBD_OP1:
3914 case MachineCombinerPattern::FMULSUBD_OP2:
3915 case MachineCombinerPattern::FNMULSUBS_OP1:
3916 case MachineCombinerPattern::FNMULSUBD_OP1:
3917 case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
3918 case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
3919 case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
3920 case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
3921 case MachineCombinerPattern::FMLAv2f32_OP2:
3922 case MachineCombinerPattern::FMLAv2f32_OP1:
3923 case MachineCombinerPattern::FMLAv2f64_OP1:
3924 case MachineCombinerPattern::FMLAv2f64_OP2:
3925 case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
3926 case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
3927 case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
3928 case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
3929 case MachineCombinerPattern::FMLAv4f32_OP1:
3930 case MachineCombinerPattern::FMLAv4f32_OP2:
3931 case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
3932 case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
3933 case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
3934 case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
3935 case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
3936 case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
3937 case MachineCombinerPattern::FMLSv2f32_OP2:
3938 case MachineCombinerPattern::FMLSv2f64_OP2:
3939 case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
3940 case MachineCombinerPattern::FMLSv4f32_OP2:
3941 return true;
3942 } // end switch (Pattern)
3943 return false;
3945 /// Return true when there is potentially a faster code sequence for an
3946 /// instruction chain ending in \p Root. All potential patterns are listed in
3947 /// the \p Patterns vector. Patterns should be sorted in priority order since the
3948 /// pattern evaluator stops checking as soon as it finds a faster sequence.
3950 bool AArch64InstrInfo::getMachineCombinerPatterns(
3951 MachineInstr &Root,
3952 SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
3953 // Integer patterns
3954 if (getMaddPatterns(Root, Patterns))
3955 return true;
3956 // Floating point patterns
3957 if (getFMAPatterns(Root, Patterns))
3958 return true;
3960 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
3963 enum class FMAInstKind { Default, Indexed, Accumulator };
3964 /// genFusedMultiply - Generate fused multiply instructions.
3965 /// This function supports both integer and floating point instructions.
3966 /// A typical example:
3967 /// F|MUL I=A,B,0
3968 /// F|ADD R,I,C
3969 /// ==> F|MADD R,A,B,C
3970 /// \param MF Containing MachineFunction
3971 /// \param MRI Register information
3972 /// \param TII Target information
3973 /// \param Root is the F|ADD instruction
3974 /// \param [out] InsInstrs is a vector of machine instructions and will
3975 /// contain the generated madd instruction
3976 /// \param IdxMulOpd is index of operand in Root that is the result of
3977 /// the F|MUL. In the example above IdxMulOpd is 1.
3978 /// \param MaddOpc the opcode of the f|madd instruction
3979 /// \param RC Register class of operands
3980 /// \param kind The kind of FMA instruction (addressing mode) to be generated
3981 /// \param ReplacedAddend is the result register from the instruction
3982 /// replacing the non-combined operand, if any.
3983 static MachineInstr *
3984 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
3985 const TargetInstrInfo *TII, MachineInstr &Root,
3986 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
3987 unsigned MaddOpc, const TargetRegisterClass *RC,
3988 FMAInstKind kind = FMAInstKind::Default,
3989 const unsigned *ReplacedAddend = nullptr) {
3990 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
3992 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
3993 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
3994 unsigned ResultReg = Root.getOperand(0).getReg();
3995 unsigned SrcReg0 = MUL->getOperand(1).getReg();
3996 bool Src0IsKill = MUL->getOperand(1).isKill();
3997 unsigned SrcReg1 = MUL->getOperand(2).getReg();
3998 bool Src1IsKill = MUL->getOperand(2).isKill();
4000 unsigned SrcReg2;
4001 bool Src2IsKill;
4002 if (ReplacedAddend) {
4003 // If we just generated a new addend, we must be its only use.
4004 SrcReg2 = *ReplacedAddend;
4005 Src2IsKill = true;
4006 } else {
4007 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
4008 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
4011 if (TargetRegisterInfo::isVirtualRegister(ResultReg))
4012 MRI.constrainRegClass(ResultReg, RC);
4013 if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
4014 MRI.constrainRegClass(SrcReg0, RC);
4015 if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
4016 MRI.constrainRegClass(SrcReg1, RC);
4017 if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
4018 MRI.constrainRegClass(SrcReg2, RC);
4020 MachineInstrBuilder MIB;
4021 if (kind == FMAInstKind::Default)
4022 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4023 .addReg(SrcReg0, getKillRegState(Src0IsKill))
4024 .addReg(SrcReg1, getKillRegState(Src1IsKill))
4025 .addReg(SrcReg2, getKillRegState(Src2IsKill));
4026 else if (kind == FMAInstKind::Indexed)
4027 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4028 .addReg(SrcReg2, getKillRegState(Src2IsKill))
4029 .addReg(SrcReg0, getKillRegState(Src0IsKill))
4030 .addReg(SrcReg1, getKillRegState(Src1IsKill))
4031 .addImm(MUL->getOperand(3).getImm());
4032 else if (kind == FMAInstKind::Accumulator)
4033 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4034 .addReg(SrcReg2, getKillRegState(Src2IsKill))
4035 .addReg(SrcReg0, getKillRegState(Src0IsKill))
4036 .addReg(SrcReg1, getKillRegState(Src1IsKill));
4037 else
4038 assert(false && "Invalid FMA instruction kind \n");
4039 // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS).
4040 InsInstrs.push_back(MIB);
4041 return MUL;
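// The three kinds only differ in operand order and in whether the lane
// immediate is carried over from the multiply; illustratively:
//   Default:     MADDWrrr          dst, mulsrc0, mulsrc1, addend
//   Indexed:     FMLAv2i32_indexed dst, addend, mulsrc0, mulsrc1, lane
//   Accumulator: FMLAv2f32         dst, addend, mulsrc0, mulsrc1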
4044 /// genMaddR - Generate madd instruction and combine mul and add using
4045 /// an extra virtual register
4046 /// Example - the ADD's immediate operand must first be materialized in a register:
4047 /// MUL I=A,B,0
4048 /// ADD R,I,Imm
4049 /// ==> ORR V, ZR, Imm
4050 /// ==> MADD R,A,B,V
4051 /// \param MF Containing MachineFunction
4052 /// \param MRI Register information
4053 /// \param TII Target information
4054 /// \param Root is the ADD instruction
4055 /// \param [out] InsInstrs is a vector of machine instructions and will
4056 /// contain the generated madd instruction
4057 /// \param IdxMulOpd is index of operand in Root that is the result of
4058 /// the MUL. In the example above IdxMulOpd is 1.
4059 /// \param MaddOpc the opcode of the madd instruction
4060 /// \param VR is a virtual register that holds the value of an ADD operand
4061 /// (V in the example above).
4062 /// \param RC Register class of operands
4063 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
4064 const TargetInstrInfo *TII, MachineInstr &Root,
4065 SmallVectorImpl<MachineInstr *> &InsInstrs,
4066 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
4067 const TargetRegisterClass *RC) {
4068 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4070 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4071 unsigned ResultReg = Root.getOperand(0).getReg();
4072 unsigned SrcReg0 = MUL->getOperand(1).getReg();
4073 bool Src0IsKill = MUL->getOperand(1).isKill();
4074 unsigned SrcReg1 = MUL->getOperand(2).getReg();
4075 bool Src1IsKill = MUL->getOperand(2).isKill();
4077 if (TargetRegisterInfo::isVirtualRegister(ResultReg))
4078 MRI.constrainRegClass(ResultReg, RC);
4079 if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
4080 MRI.constrainRegClass(SrcReg0, RC);
4081 if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
4082 MRI.constrainRegClass(SrcReg1, RC);
4083 if (TargetRegisterInfo::isVirtualRegister(VR))
4084 MRI.constrainRegClass(VR, RC);
4086 MachineInstrBuilder MIB =
4087 BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4088 .addReg(SrcReg0, getKillRegState(Src0IsKill))
4089 .addReg(SrcReg1, getKillRegState(Src1IsKill))
4090 .addReg(VR);
4091 // Insert the MADD
4092 InsInstrs.push_back(MIB);
4093 return MUL;
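// For example (illustrative registers), with %t = %a * %b a root of the form
//   %r = ADDWri %t, 255, 0
// is handled by the MULADDWI_OP1 code below as
//   %v = ORRWri $wzr, <logical-immediate encoding of 0xff>
//   %r = MADDWrrr %a, %b, %v
// This only happens when the (possibly shifted) immediate is representable
// as an AArch64 logical immediate, as checked by processLogicalImmediate().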
4096 /// When getMachineCombinerPatterns() finds potential patterns,
4097 /// this function generates the instructions that could replace the
4098 /// original code sequence
4099 void AArch64InstrInfo::genAlternativeCodeSequence(
4100 MachineInstr &Root, MachineCombinerPattern Pattern,
4101 SmallVectorImpl<MachineInstr *> &InsInstrs,
4102 SmallVectorImpl<MachineInstr *> &DelInstrs,
4103 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
4104 MachineBasicBlock &MBB = *Root.getParent();
4105 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4106 MachineFunction &MF = *MBB.getParent();
4107 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
4109 MachineInstr *MUL;
4110 const TargetRegisterClass *RC;
4111 unsigned Opc;
4112 switch (Pattern) {
4113 default:
4114 // Reassociate instructions.
4115 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
4116 DelInstrs, InstrIdxForVirtReg);
4117 return;
4118 case MachineCombinerPattern::MULADDW_OP1:
4119 case MachineCombinerPattern::MULADDX_OP1:
4120 // MUL I=A,B,0
4121 // ADD R,I,C
4122 // ==> MADD R,A,B,C
4123 // --- Create(MADD);
4124 if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
4125 Opc = AArch64::MADDWrrr;
4126 RC = &AArch64::GPR32RegClass;
4127 } else {
4128 Opc = AArch64::MADDXrrr;
4129 RC = &AArch64::GPR64RegClass;
4131 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4132 break;
4133 case MachineCombinerPattern::MULADDW_OP2:
4134 case MachineCombinerPattern::MULADDX_OP2:
4135 // MUL I=A,B,0
4136 // ADD R,C,I
4137 // ==> MADD R,A,B,C
4138 // --- Create(MADD);
4139 if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
4140 Opc = AArch64::MADDWrrr;
4141 RC = &AArch64::GPR32RegClass;
4142 } else {
4143 Opc = AArch64::MADDXrrr;
4144 RC = &AArch64::GPR64RegClass;
4146 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4147 break;
4148 case MachineCombinerPattern::MULADDWI_OP1:
4149 case MachineCombinerPattern::MULADDXI_OP1: {
4150 // MUL I=A,B,0
4151 // ADD R,I,Imm
4152 // ==> ORR V, ZR, Imm
4153 // ==> MADD R,A,B,V
4154 // --- Create(MADD);
4155 const TargetRegisterClass *OrrRC;
4156 unsigned BitSize, OrrOpc, ZeroReg;
4157 if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
4158 OrrOpc = AArch64::ORRWri;
4159 OrrRC = &AArch64::GPR32spRegClass;
4160 BitSize = 32;
4161 ZeroReg = AArch64::WZR;
4162 Opc = AArch64::MADDWrrr;
4163 RC = &AArch64::GPR32RegClass;
4164 } else {
4165 OrrOpc = AArch64::ORRXri;
4166 OrrRC = &AArch64::GPR64spRegClass;
4167 BitSize = 64;
4168 ZeroReg = AArch64::XZR;
4169 Opc = AArch64::MADDXrrr;
4170 RC = &AArch64::GPR64RegClass;
4172 unsigned NewVR = MRI.createVirtualRegister(OrrRC);
4173 uint64_t Imm = Root.getOperand(2).getImm();
4175 if (Root.getOperand(3).isImm()) {
4176 unsigned Val = Root.getOperand(3).getImm();
4177 Imm = Imm << Val;
4179 uint64_t UImm = SignExtend64(Imm, BitSize);
4180 uint64_t Encoding;
4181 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4182 MachineInstrBuilder MIB1 =
4183 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4184 .addReg(ZeroReg)
4185 .addImm(Encoding);
4186 InsInstrs.push_back(MIB1);
4187 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4188 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4190 break;
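// Illustrative shifted-immediate case: for "ADDWri %r, %t, 15, 12"
// (i.e. add ..., #15, lsl #12) the shift is folded first, giving
// Imm = 0xf000, which still encodes as a logical immediate, so the
// ORR + MADD rewrite applies just as in the unshifted case.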
4192 case MachineCombinerPattern::MULSUBW_OP1:
4193 case MachineCombinerPattern::MULSUBX_OP1: {
4194 // MUL I=A,B,0
4195 // SUB R,I, C
4196 // ==> SUB V, 0, C
4197 // ==> MADD R,A,B,V // = -C + A*B
4198 // --- Create(MADD);
4199 const TargetRegisterClass *SubRC;
4200 unsigned SubOpc, ZeroReg;
4201 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
4202 SubOpc = AArch64::SUBWrr;
4203 SubRC = &AArch64::GPR32spRegClass;
4204 ZeroReg = AArch64::WZR;
4205 Opc = AArch64::MADDWrrr;
4206 RC = &AArch64::GPR32RegClass;
4207 } else {
4208 SubOpc = AArch64::SUBXrr;
4209 SubRC = &AArch64::GPR64spRegClass;
4210 ZeroReg = AArch64::XZR;
4211 Opc = AArch64::MADDXrrr;
4212 RC = &AArch64::GPR64RegClass;
4214 unsigned NewVR = MRI.createVirtualRegister(SubRC);
4215 // SUB NewVR, 0, C
4216 MachineInstrBuilder MIB1 =
4217 BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
4218 .addReg(ZeroReg)
4219 .add(Root.getOperand(2));
4220 InsInstrs.push_back(MIB1);
4221 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4222 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4223 break;
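// Illustrative result: "%r = SUBWrr %t, %c" with %t = %a * %b becomes
//   %v = SUBWrr $wzr, %c          ; v = -c
//   %r = MADDWrrr %a, %b, %v      ; r = a*b - c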
4225 case MachineCombinerPattern::MULSUBW_OP2:
4226 case MachineCombinerPattern::MULSUBX_OP2:
4227 // MUL I=A,B,0
4228 // SUB R,C,I
4229 // ==> MSUB R,A,B,C (computes C - A*B)
4230 // --- Create(MSUB);
4231 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
4232 Opc = AArch64::MSUBWrrr;
4233 RC = &AArch64::GPR32RegClass;
4234 } else {
4235 Opc = AArch64::MSUBXrrr;
4236 RC = &AArch64::GPR64RegClass;
4238 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4239 break;
4240 case MachineCombinerPattern::MULSUBWI_OP1:
4241 case MachineCombinerPattern::MULSUBXI_OP1: {
4242 // MUL I=A,B,0
4243 // SUB R,I, Imm
4244 // ==> ORR V, ZR, -Imm
4245 // ==> MADD R,A,B,V // = -Imm + A*B
4246 // --- Create(MADD);
4247 const TargetRegisterClass *OrrRC;
4248 unsigned BitSize, OrrOpc, ZeroReg;
4249 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
4250 OrrOpc = AArch64::ORRWri;
4251 OrrRC = &AArch64::GPR32spRegClass;
4252 BitSize = 32;
4253 ZeroReg = AArch64::WZR;
4254 Opc = AArch64::MADDWrrr;
4255 RC = &AArch64::GPR32RegClass;
4256 } else {
4257 OrrOpc = AArch64::ORRXri;
4258 OrrRC = &AArch64::GPR64spRegClass;
4259 BitSize = 64;
4260 ZeroReg = AArch64::XZR;
4261 Opc = AArch64::MADDXrrr;
4262 RC = &AArch64::GPR64RegClass;
4264 unsigned NewVR = MRI.createVirtualRegister(OrrRC);
4265 uint64_t Imm = Root.getOperand(2).getImm();
4266 if (Root.getOperand(3).isImm()) {
4267 unsigned Val = Root.getOperand(3).getImm();
4268 Imm = Imm << Val;
4270 uint64_t UImm = SignExtend64(-Imm, BitSize);
4271 uint64_t Encoding;
4272 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4273 MachineInstrBuilder MIB1 =
4274 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4275 .addReg(ZeroReg)
4276 .addImm(Encoding);
4277 InsInstrs.push_back(MIB1);
4278 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4279 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4281 break;
4283 // Floating Point Support
4284 case MachineCombinerPattern::FMULADDS_OP1:
4285 case MachineCombinerPattern::FMULADDD_OP1:
4286 // MUL I=A,B,0
4287 // ADD R,I,C
4288 // ==> MADD R,A,B,C
4289 // --- Create(MADD);
4290 if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
4291 Opc = AArch64::FMADDSrrr;
4292 RC = &AArch64::FPR32RegClass;
4293 } else {
4294 Opc = AArch64::FMADDDrrr;
4295 RC = &AArch64::FPR64RegClass;
4297 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4298 break;
4299 case MachineCombinerPattern::FMULADDS_OP2:
4300 case MachineCombinerPattern::FMULADDD_OP2:
4301 // FMUL I=A,B,0
4302 // FADD R,C,I
4303 // ==> FMADD R,A,B,C
4304 // --- Create(FMADD);
4305 if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
4306 Opc = AArch64::FMADDSrrr;
4307 RC = &AArch64::FPR32RegClass;
4308 } else {
4309 Opc = AArch64::FMADDDrrr;
4310 RC = &AArch64::FPR64RegClass;
4312 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4313 break;
4315 case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
4316 Opc = AArch64::FMLAv1i32_indexed;
4317 RC = &AArch64::FPR32RegClass;
4318 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4319 FMAInstKind::Indexed);
4320 break;
4321 case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
4322 Opc = AArch64::FMLAv1i32_indexed;
4323 RC = &AArch64::FPR32RegClass;
4324 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4325 FMAInstKind::Indexed);
4326 break;
4328 case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
4329 Opc = AArch64::FMLAv1i64_indexed;
4330 RC = &AArch64::FPR64RegClass;
4331 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4332 FMAInstKind::Indexed);
4333 break;
4334 case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
4335 Opc = AArch64::FMLAv1i64_indexed;
4336 RC = &AArch64::FPR64RegClass;
4337 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4338 FMAInstKind::Indexed);
4339 break;
4341 case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
4342 case MachineCombinerPattern::FMLAv2f32_OP1:
4343 RC = &AArch64::FPR64RegClass;
4344 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
4345 Opc = AArch64::FMLAv2i32_indexed;
4346 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4347 FMAInstKind::Indexed);
4348 } else {
4349 Opc = AArch64::FMLAv2f32;
4350 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4351 FMAInstKind::Accumulator);
4353 break;
4354 case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
4355 case MachineCombinerPattern::FMLAv2f32_OP2:
4356 RC = &AArch64::FPR64RegClass;
4357 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
4358 Opc = AArch64::FMLAv2i32_indexed;
4359 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4360 FMAInstKind::Indexed);
4361 } else {
4362 Opc = AArch64::FMLAv2f32;
4363 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4364 FMAInstKind::Accumulator);
4366 break;
4368 case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
4369 case MachineCombinerPattern::FMLAv2f64_OP1:
4370 RC = &AArch64::FPR128RegClass;
4371 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
4372 Opc = AArch64::FMLAv2i64_indexed;
4373 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4374 FMAInstKind::Indexed);
4375 } else {
4376 Opc = AArch64::FMLAv2f64;
4377 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4378 FMAInstKind::Accumulator);
4380 break;
4381 case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
4382 case MachineCombinerPattern::FMLAv2f64_OP2:
4383 RC = &AArch64::FPR128RegClass;
4384 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
4385 Opc = AArch64::FMLAv2i64_indexed;
4386 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4387 FMAInstKind::Indexed);
4388 } else {
4389 Opc = AArch64::FMLAv2f64;
4390 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4391 FMAInstKind::Accumulator);
4393 break;
4395 case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
4396 case MachineCombinerPattern::FMLAv4f32_OP1:
4397 RC = &AArch64::FPR128RegClass;
4398 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
4399 Opc = AArch64::FMLAv4i32_indexed;
4400 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4401 FMAInstKind::Indexed);
4402 } else {
4403 Opc = AArch64::FMLAv4f32;
4404 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4405 FMAInstKind::Accumulator);
4407 break;
4409 case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
4410 case MachineCombinerPattern::FMLAv4f32_OP2:
4411 RC = &AArch64::FPR128RegClass;
4412 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
4413 Opc = AArch64::FMLAv4i32_indexed;
4414 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4415 FMAInstKind::Indexed);
4416 } else {
4417 Opc = AArch64::FMLAv4f32;
4418 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4419 FMAInstKind::Accumulator);
4421 break;
4423 case MachineCombinerPattern::FMULSUBS_OP1:
4424 case MachineCombinerPattern::FMULSUBD_OP1: {
4425 // FMUL I=A,B,0
4426 // FSUB R,I,C
4427 // ==> FNMSUB R,A,B,C // = -C + A*B
4428 // --- Create(FNMSUB);
4429 if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
4430 Opc = AArch64::FNMSUBSrrr;
4431 RC = &AArch64::FPR32RegClass;
4432 } else {
4433 Opc = AArch64::FNMSUBDrrr;
4434 RC = &AArch64::FPR64RegClass;
4436 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4437 break;
4440 case MachineCombinerPattern::FNMULSUBS_OP1:
4441 case MachineCombinerPattern::FNMULSUBD_OP1: {
4442 // FNMUL I=A,B,0
4443 // FSUB R,I,C
4444 // ==> FNMADD R,A,B,C // = -A*B - C
4445 // --- Create(FNMADD);
4446 if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
4447 Opc = AArch64::FNMADDSrrr;
4448 RC = &AArch64::FPR32RegClass;
4449 } else {
4450 Opc = AArch64::FNMADDDrrr;
4451 RC = &AArch64::FPR64RegClass;
4453 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4454 break;
4457 case MachineCombinerPattern::FMULSUBS_OP2:
4458 case MachineCombinerPattern::FMULSUBD_OP2: {
4459 // FMUL I=A,B,0
4460 // FSUB R,C,I
4461 // ==> FMSUB R,A,B,C (computes C - A*B)
4462 // --- Create(FMSUB);
4463 if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
4464 Opc = AArch64::FMSUBSrrr;
4465 RC = &AArch64::FPR32RegClass;
4466 } else {
4467 Opc = AArch64::FMSUBDrrr;
4468 RC = &AArch64::FPR64RegClass;
4470 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4471 break;
4474 case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
4475 Opc = AArch64::FMLSv1i32_indexed;
4476 RC = &AArch64::FPR32RegClass;
4477 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4478 FMAInstKind::Indexed);
4479 break;
4481 case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
4482 Opc = AArch64::FMLSv1i64_indexed;
4483 RC = &AArch64::FPR64RegClass;
4484 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4485 FMAInstKind::Indexed);
4486 break;
4488 case MachineCombinerPattern::FMLSv2f32_OP2:
4489 case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
4490 RC = &AArch64::FPR64RegClass;
4491 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
4492 Opc = AArch64::FMLSv2i32_indexed;
4493 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4494 FMAInstKind::Indexed);
4495 } else {
4496 Opc = AArch64::FMLSv2f32;
4497 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4498 FMAInstKind::Accumulator);
4500 break;
4502 case MachineCombinerPattern::FMLSv2f64_OP2:
4503 case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
4504 RC = &AArch64::FPR128RegClass;
4505 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
4506 Opc = AArch64::FMLSv2i64_indexed;
4507 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4508 FMAInstKind::Indexed);
4509 } else {
4510 Opc = AArch64::FMLSv2f64;
4511 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4512 FMAInstKind::Accumulator);
4514 break;
4516 case MachineCombinerPattern::FMLSv4f32_OP2:
4517 case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
4518 RC = &AArch64::FPR128RegClass;
4519 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
4520 Opc = AArch64::FMLSv4i32_indexed;
4521 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4522 FMAInstKind::Indexed);
4523 } else {
4524 Opc = AArch64::FMLSv4f32;
4525 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4526 FMAInstKind::Accumulator);
4528 break;
4529 case MachineCombinerPattern::FMLSv2f32_OP1:
4530 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
4531 RC = &AArch64::FPR64RegClass;
4532 unsigned NewVR = MRI.createVirtualRegister(RC);
4533 MachineInstrBuilder MIB1 =
4534 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
4535 .add(Root.getOperand(2));
4536 InsInstrs.push_back(MIB1);
4537 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4538 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
4539 Opc = AArch64::FMLAv2i32_indexed;
4540 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4541 FMAInstKind::Indexed, &NewVR);
4542 } else {
4543 Opc = AArch64::FMLAv2f32;
4544 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4545 FMAInstKind::Accumulator, &NewVR);
4547 break;
4549 case MachineCombinerPattern::FMLSv4f32_OP1:
4550 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
4551 RC = &AArch64::FPR128RegClass;
4552 unsigned NewVR = MRI.createVirtualRegister(RC);
4553 MachineInstrBuilder MIB1 =
4554 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
4555 .add(Root.getOperand(2));
4556 InsInstrs.push_back(MIB1);
4557 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4558 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
4559 Opc = AArch64::FMLAv4i32_indexed;
4560 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4561 FMAInstKind::Indexed, &NewVR);
4562 } else {
4563 Opc = AArch64::FMLAv4f32;
4564 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4565 FMAInstKind::Accumulator, &NewVR);
4567 break;
4569 case MachineCombinerPattern::FMLSv2f64_OP1:
4570 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
4571 RC = &AArch64::FPR128RegClass;
4572 unsigned NewVR = MRI.createVirtualRegister(RC);
4573 MachineInstrBuilder MIB1 =
4574 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
4575 .add(Root.getOperand(2));
4576 InsInstrs.push_back(MIB1);
4577 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4578 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
4579 Opc = AArch64::FMLAv2i64_indexed;
4580 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4581 FMAInstKind::Indexed, &NewVR);
4582 } else {
4583 Opc = AArch64::FMLAv2f64;
4584 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4585 FMAInstKind::Accumulator, &NewVR);
4587 break;
4589 } // end switch (Pattern)
4590 // Record MUL and ADD/SUB for deletion
4591 DelInstrs.push_back(MUL);
4592 DelInstrs.push_back(&Root);
4595 /// Replace csincr-branch sequence by simple conditional branch
4597 /// Examples:
4598 /// 1. \code
4599 /// csinc w9, wzr, wzr, <condition code>
4600 /// tbnz w9, #0, 0x44
4601 /// \endcode
4602 /// to
4603 /// \code
4604 /// b.<inverted condition code>
4605 /// \endcode
4607 /// 2. \code
4608 /// csinc w9, wzr, wzr, <condition code>
4609 /// tbz w9, #0, 0x44
4610 /// \endcode
4611 /// to
4612 /// \code
4613 /// b.<condition code>
4614 /// \endcode
4616 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
4617 /// compare's constant operand is power of 2.
4619 /// Examples:
4620 /// \code
4621 /// and w8, w8, #0x400
4622 /// cbnz w8, L1
4623 /// \endcode
4624 /// to
4625 /// \code
4626 /// tbnz w8, #10, L1
4627 /// \endcode
4629 /// \param MI Conditional Branch
4630 /// \return True when the simple conditional branch is generated
4632 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
4633 bool IsNegativeBranch = false;
4634 bool IsTestAndBranch = false;
4635 unsigned TargetBBInMI = 0;
4636 switch (MI.getOpcode()) {
4637 default:
4638 llvm_unreachable("Unknown branch instruction?");
4639 case AArch64::Bcc:
4640 return false;
4641 case AArch64::CBZW:
4642 case AArch64::CBZX:
4643 TargetBBInMI = 1;
4644 break;
4645 case AArch64::CBNZW:
4646 case AArch64::CBNZX:
4647 TargetBBInMI = 1;
4648 IsNegativeBranch = true;
4649 break;
4650 case AArch64::TBZW:
4651 case AArch64::TBZX:
4652 TargetBBInMI = 2;
4653 IsTestAndBranch = true;
4654 break;
4655 case AArch64::TBNZW:
4656 case AArch64::TBNZX:
4657 TargetBBInMI = 2;
4658 IsNegativeBranch = true;
4659 IsTestAndBranch = true;
4660 break;
4662 // So we increment a zero register and test for bits other
4663 // than bit 0? Conservatively bail out in case the verifier
4664 // missed this case.
4665 if (IsTestAndBranch && MI.getOperand(1).getImm())
4666 return false;
4668 // Find Definition.
4669 assert(MI.getParent() && "Incomplete machine instruction\n");
4670 MachineBasicBlock *MBB = MI.getParent();
4671 MachineFunction *MF = MBB->getParent();
4672 MachineRegisterInfo *MRI = &MF->getRegInfo();
4673 unsigned VReg = MI.getOperand(0).getReg();
4674 if (!TargetRegisterInfo::isVirtualRegister(VReg))
4675 return false;
4677 MachineInstr *DefMI = MRI->getVRegDef(VReg);
4679 // Look through COPY instructions to find definition.
4680 while (DefMI->isCopy()) {
4681 unsigned CopyVReg = DefMI->getOperand(1).getReg();
4682 if (!MRI->hasOneNonDBGUse(CopyVReg))
4683 return false;
4684 if (!MRI->hasOneDef(CopyVReg))
4685 return false;
4686 DefMI = MRI->getVRegDef(CopyVReg);
4689 switch (DefMI->getOpcode()) {
4690 default:
4691 return false;
4692 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
4693 case AArch64::ANDWri:
4694 case AArch64::ANDXri: {
4695 if (IsTestAndBranch)
4696 return false;
4697 if (DefMI->getParent() != MBB)
4698 return false;
4699 if (!MRI->hasOneNonDBGUse(VReg))
4700 return false;
4702 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
4703 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
4704 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
4705 if (!isPowerOf2_64(Mask))
4706 return false;
4708 MachineOperand &MO = DefMI->getOperand(1);
4709 unsigned NewReg = MO.getReg();
4710 if (!TargetRegisterInfo::isVirtualRegister(NewReg))
4711 return false;
4713 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
4715 MachineBasicBlock &RefToMBB = *MBB;
4716 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
4717 DebugLoc DL = MI.getDebugLoc();
4718 unsigned Imm = Log2_64(Mask);
4719 unsigned Opc = (Imm < 32)
4720 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
4721 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
4722 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
4723 .addReg(NewReg)
4724 .addImm(Imm)
4725 .addMBB(TBB);
4726 // The register lives on into the new TBZ/TBNZ now, so clear the kill flag.
4727 MO.setIsKill(false);
4729 // For bit positions smaller than 32 we must use the 32-bit (W)
4730 // variant in all cases, since the 64-bit variant cannot encode
4731 // them. Therefore, if the input register is 64-bit, take its
4732 // 32-bit sub-register.
4734 if (!Is32Bit && Imm < 32)
4735 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
4736 MI.eraseFromParent();
4737 return true;
4739 // Look for CSINC
4740 case AArch64::CSINCWr:
4741 case AArch64::CSINCXr: {
4742 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
4743 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
4744 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
4745 DefMI->getOperand(2).getReg() == AArch64::XZR))
4746 return false;
4748 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
4749 return false;
4751 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
4752 // Convert only when the condition code is not modified between
4753 // the CSINC and the branch. The CC may be used by other
4754 // instructions in between.
4755 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
4756 return false;
4757 MachineBasicBlock &RefToMBB = *MBB;
4758 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
4759 DebugLoc DL = MI.getDebugLoc();
4760 if (IsNegativeBranch)
4761 CC = AArch64CC::getInvertedCondCode(CC);
4762 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
4763 MI.eraseFromParent();
4764 return true;
4769 std::pair<unsigned, unsigned>
4770 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
4771 const unsigned Mask = AArch64II::MO_FRAGMENT;
4772 return std::make_pair(TF & Mask, TF & ~Mask);
4775 ArrayRef<std::pair<unsigned, const char *>>
4776 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
4777 using namespace AArch64II;
4779 static const std::pair<unsigned, const char *> TargetFlags[] = {
4780 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
4781 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
4782 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
4783 {MO_HI12, "aarch64-hi12"}};
4784 return makeArrayRef(TargetFlags);
4787 ArrayRef<std::pair<unsigned, const char *>>
4788 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
4789 using namespace AArch64II;
4791 static const std::pair<unsigned, const char *> TargetFlags[] = {
4792 {MO_COFFSTUB, "aarch64-coffstub"},
4793 {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"},
4794 {MO_S, "aarch64-s"}, {MO_TLS, "aarch64-tls"},
4795 {MO_DLLIMPORT, "aarch64-dllimport"}};
4796 return makeArrayRef(TargetFlags);
4799 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
4800 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
4801 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
4802 {{MOSuppressPair, "aarch64-suppress-pair"},
4803 {MOStridedAccess, "aarch64-strided-access"}};
4804 return makeArrayRef(TargetFlags);
4807 /// Constants defining how certain sequences should be outlined.
4808 /// This encompasses how an outlined function should be called, and what kind of
4809 /// frame should be emitted for that outlined function.
4811 /// \p MachineOutlinerDefault implies that the function should be called with
4812 /// a save and restore of LR to the stack.
4814 /// That is,
4816 /// I1 Save LR OUTLINED_FUNCTION:
4817 /// I2 --> BL OUTLINED_FUNCTION I1
4818 /// I3 Restore LR I2
4819 /// I3
4820 /// RET
4822 /// * Call construction overhead: 3 (save + BL + restore)
4823 /// * Frame construction overhead: 1 (ret)
4824 /// * Requires stack fixups? Yes
4826 /// \p MachineOutlinerTailCall implies that the function is being created from
4827 /// a sequence of instructions ending in a return.
4829 /// That is,
4831 /// I1 OUTLINED_FUNCTION:
4832 /// I2 --> B OUTLINED_FUNCTION I1
4833 /// RET I2
4834 /// RET
4836 /// * Call construction overhead: 1 (B)
4837 /// * Frame construction overhead: 0 (Return included in sequence)
4838 /// * Requires stack fixups? No
4840 /// \p MachineOutlinerNoLRSave implies that the function should be called using
4841 /// a BL instruction, but doesn't require LR to be saved and restored. This
4842 /// happens when LR is known to be dead.
4844 /// That is,
4846 /// I1 OUTLINED_FUNCTION:
4847 /// I2 --> BL OUTLINED_FUNCTION I1
4848 /// I3 I2
4849 /// I3
4850 /// RET
4852 /// * Call construction overhead: 1 (BL)
4853 /// * Frame construction overhead: 1 (RET)
4854 /// * Requires stack fixups? No
4856 /// \p MachineOutlinerThunk implies that the function is being created from
4857 /// a sequence of instructions ending in a call. The outlined function is
4858 /// called with a BL instruction, and the outlined function tail-calls the
4859 /// original call destination.
4861 /// That is,
4863 /// I1 OUTLINED_FUNCTION:
4864 /// I2 --> BL OUTLINED_FUNCTION I1
4865 /// BL f I2
4866 /// B f
4867 /// * Call construction overhead: 1 (BL)
4868 /// * Frame construction overhead: 0
4869 /// * Requires stack fixups? No
4871 /// \p MachineOutlinerRegSave implies that the function should be called with a
4872 /// save and restore of LR to an available register. This allows us to avoid
4873 /// stack fixups. Note that this outlining variant is compatible with the
4874 /// NoLRSave case.
4876 /// That is,
4878 /// I1 Save LR OUTLINED_FUNCTION:
4879 /// I2 --> BL OUTLINED_FUNCTION I1
4880 /// I3 Restore LR I2
4881 /// I3
4882 /// RET
4884 /// * Call construction overhead: 3 (save + BL + restore)
4885 /// * Frame construction overhead: 1 (ret)
4886 /// * Requires stack fixups? No
4887 enum MachineOutlinerClass {
4888 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
4889 MachineOutlinerTailCall, /// Only emit a branch.
4890 MachineOutlinerNoLRSave, /// Emit a call and return.
4891 MachineOutlinerThunk, /// Emit a call and tail-call.
4892 MachineOutlinerRegSave /// Same as default, but save to a register.
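// Note: the "overhead" counts in the comments above are in instructions.
// Since every AArch64 instruction is 4 bytes, they correspond to the 4- and
// 12-byte costs passed to setCallInfo() in getOutliningCandidateInfo()
// below (e.g. save + BL + restore = 3 instructions = 12 bytes).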
4895 enum MachineOutlinerMBBFlags {
4896 LRUnavailableSomewhere = 0x2,
4897 HasCalls = 0x4,
4898 UnsafeRegsDead = 0x8
4901 unsigned
4902 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
4903 assert(C.LRUWasSet && "LRU wasn't set?");
4904 MachineFunction *MF = C.getMF();
4905 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
4906 MF->getSubtarget().getRegisterInfo());
4908 // Check if there is an available register across the sequence that we can
4909 // use.
4910 for (unsigned Reg : AArch64::GPR64RegClass) {
4911 if (!ARI->isReservedReg(*MF, Reg) &&
4912 Reg != AArch64::LR && // LR is not reserved, but don't use it.
4913 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
4914 Reg != AArch64::X17 && // Ditto for X17.
4915 C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
4916 return Reg;
4919 // No suitable register. Return 0.
4920 return 0u;
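// When a register is found, the MachineOutlinerRegSave variant saves LR with
// a plain register move around the call instead of touching the stack,
// roughly (illustrative register choice):
//   mov x20, x30
//   bl  OUTLINED_FUNCTION
//   mov x30, x20
// which is why that variant needs no stack fixups.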
4923 outliner::OutlinedFunction
4924 AArch64InstrInfo::getOutliningCandidateInfo(
4925 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
4926 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
4927 unsigned SequenceSize =
4928 std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
4929 [this](unsigned Sum, const MachineInstr &MI) {
4930 return Sum + getInstSizeInBytes(MI);
4933 // Properties about candidate MBBs that hold for all of them.
4934 unsigned FlagsSetInAll = 0xF;
4936 // Compute liveness information for each candidate, and set FlagsSetInAll.
4937 const TargetRegisterInfo &TRI = getRegisterInfo();
4938 std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
4939 [&FlagsSetInAll](outliner::Candidate &C) {
4940 FlagsSetInAll &= C.Flags;
4943 // According to the AArch64 Procedure Call Standard, the following are
4944 // undefined on entry/exit from a function call:
4946 // * Registers x16, x17, (and thus w16, w17)
4947 // * Condition codes (and thus the NZCV register)
4949 // Because of this, we can't outline any sequence of instructions where
4950 // one of these registers is live into/across it. Thus, we need to
4951 // delete those candidates.
4954 auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
4955 // If the unsafe registers in this block are all dead, then we don't need
4956 // to compute liveness here.
4957 if (C.Flags & UnsafeRegsDead)
4958 return false;
4959 C.initLRU(TRI);
4960 LiveRegUnits LRU = C.LRU;
4961 return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
4962 !LRU.available(AArch64::NZCV));
4965 // Are there any candidates where those registers are live?
4966 if (!(FlagsSetInAll & UnsafeRegsDead)) {
4967 // Erase every candidate that violates the restrictions above. (It could be
4968 // true that we have viable candidates, so it's not worth bailing out in
4969 // the case that, say, 1 out of 20 candidates violates the restrictions.)
4970 RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
4971 RepeatedSequenceLocs.end(),
4972 CantGuaranteeValueAcrossCall),
4973 RepeatedSequenceLocs.end());
4975 // If the sequence doesn't have enough candidates left, then we're done.
4976 if (RepeatedSequenceLocs.size() < 2)
4977 return outliner::OutlinedFunction();
4980 // At this point, we have only "safe" candidates to outline. Figure out
4981 // frame + call instruction information.
4983 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
4985 // Helper lambda which sets call information for every candidate.
4986 auto SetCandidateCallInfo =
4987 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
4988 for (outliner::Candidate &C : RepeatedSequenceLocs)
4989 C.setCallInfo(CallID, NumBytesForCall);
4992 unsigned FrameID = MachineOutlinerDefault;
4993 unsigned NumBytesToCreateFrame = 4;
4995 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
4996 return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
4999 // Returns true if an instruction is safe to fix up, false otherwise.
5000 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
5001 if (MI.isCall())
5002 return true;
5004 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
5005 !MI.readsRegister(AArch64::SP, &TRI))
5006 return true;
5008 // Any modification of SP will break our code to save/restore LR.
5009 // FIXME: We could handle some instructions which add a constant
5010 // offset to SP, with a bit more work.
5011 if (MI.modifiesRegister(AArch64::SP, &TRI))
5012 return false;
5014 // At this point, we have a stack instruction that we might need to
5015 // fix up. We'll handle it if it's a load or store.
5016 if (MI.mayLoadOrStore()) {
5017 MachineOperand *Base; // Filled with the base operand of MI.
5018 int64_t Offset; // Filled with the offset of MI.
5020 // Does it allow us to offset the base operand and is the base the
5021 // register SP?
5022 if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() ||
5023 Base->getReg() != AArch64::SP)
5024 return false;
5026 // Find the minimum/maximum offset for this instruction and check
5027 // if fixing it up would be in range.
5028 int64_t MinOffset,
5029 MaxOffset; // Unscaled offsets for the instruction.
5030 unsigned Scale; // The scale to multiply the offsets by.
5031 unsigned DummyWidth;
5032 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
5034 Offset += 16; // Update the offset to what it would be if we outlined.
5035 if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
5036 return false;
5038 // It's in range, so we can outline it.
5039 return true;
5042 // FIXME: Add handling for instructions like "add x0, sp, #8".
5044 // We can't fix it up, so don't outline it.
5045 return false;
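// The "+ 16" above models the MachineOutlinerDefault frame, which spills LR
// with a 16-byte SP adjustment; an SP-relative access such as (illustrative)
//   ldr w0, [sp, #8]
// would effectively become [sp, #24] inside the outlined body, so it must
// still be encodable after that adjustment.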
5048 // True if it's possible to fix up each stack instruction in this sequence.
5049 // Important for frames/call variants that modify the stack.
5050 bool AllStackInstrsSafe = std::all_of(
5051 FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
5053 // If the last instruction in any candidate is a terminator, then we should
5054 // tail call all of the candidates.
5055 if (RepeatedSequenceLocs[0].back()->isTerminator()) {
5056 FrameID = MachineOutlinerTailCall;
5057 NumBytesToCreateFrame = 0;
5058 SetCandidateCallInfo(MachineOutlinerTailCall, 4);
5061 else if (LastInstrOpcode == AArch64::BL ||
5062 (LastInstrOpcode == AArch64::BLR && !HasBTI)) {
5063 // FIXME: Do we need to check if the code after this uses the value of LR?
5064 FrameID = MachineOutlinerThunk;
5065 NumBytesToCreateFrame = 0;
5066 SetCandidateCallInfo(MachineOutlinerThunk, 4);
5069 else {
5070 // We need to decide how to emit calls + frames. We can always emit the same
5071 // frame if we don't need to save to the stack. If we have to save to the
5072 // stack, then we need a different frame.
5073 unsigned NumBytesNoStackCalls = 0;
5074 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
5076 for (outliner::Candidate &C : RepeatedSequenceLocs) {
5077 C.initLRU(TRI);
5079 // Is LR available? If so, we don't need a save.
5080 if (C.LRU.available(AArch64::LR)) {
5081 NumBytesNoStackCalls += 4;
5082 C.setCallInfo(MachineOutlinerNoLRSave, 4);
5083 CandidatesWithoutStackFixups.push_back(C);
5086 // Is an unused register available? If so, we won't modify the stack, so
5087 // we can outline with the same frame type as those that don't save LR.
5088 else if (findRegisterToSaveLRTo(C)) {
5089 NumBytesNoStackCalls += 12;
5090 C.setCallInfo(MachineOutlinerRegSave, 12);
5091 CandidatesWithoutStackFixups.push_back(C);
5094 // Is SP used in the sequence at all? If not, we don't have to modify
5095 // the stack, so we are guaranteed to get the same frame.
5096 else if (C.UsedInSequence.available(AArch64::SP)) {
5097 NumBytesNoStackCalls += 12;
5098 C.setCallInfo(MachineOutlinerDefault, 12);
5099 CandidatesWithoutStackFixups.push_back(C);
5102 // If we outline this, we need to modify the stack. Pretend we don't
5103 // outline this by saving all of its bytes.
5104 else {
5105 NumBytesNoStackCalls += SequenceSize;
5109 // If there are no places where we have to save LR, then note that we
5110 // don't have to update the stack. Otherwise, give every candidate the
5111 // default call type, as long as it's safe to do so.
5112 if (!AllStackInstrsSafe ||
5113 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
5114 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
5115 FrameID = MachineOutlinerNoLRSave;
5116 } else {
5117 SetCandidateCallInfo(MachineOutlinerDefault, 12);
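// Worked example (illustrative): with 4 candidates, giving every call site
// the default 12-byte call would cost 4 * 12 = 48 bytes, so the no-stack
// variants (4 bytes where LR is free, 12 where a spare register or an
// SP-free sequence allows it, SequenceSize where neither does) are kept
// whenever their total is no worse than that, or whenever stack fixups are
// not safe at all.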
5120 // If we dropped all of the candidates, bail out here.
5121 if (RepeatedSequenceLocs.size() < 2) {
5122 RepeatedSequenceLocs.clear();
5123 return outliner::OutlinedFunction();
5127 // Does every candidate's MBB contain a call? If so, then we might have a call
5128 // in the range.
5129 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
5130 // Check if the range contains a call. These require a save + restore of the
5131 // link register.
5132 bool ModStackToSaveLR = false;
5133 if (std::any_of(FirstCand.front(), FirstCand.back(),
5134 [](const MachineInstr &MI) { return MI.isCall(); }))
5135 ModStackToSaveLR = true;
5137 // Handle the last instruction separately. If this is a tail call, then the
5138 // last instruction is a call. We don't want to save + restore in this case.
5139 // However, it is possible that the last instruction is a call without
5140 // it being valid to tail call this sequence. We should consider this as
5141 // well.
5142 else if (FrameID != MachineOutlinerThunk &&
5143 FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
5144 ModStackToSaveLR = true;
5146 if (ModStackToSaveLR) {
5147 // We can't fix up the stack. Bail out.
5148 if (!AllStackInstrsSafe) {
5149 RepeatedSequenceLocs.clear();
5150 return outliner::OutlinedFunction();
5151 }
5153 // Save + restore LR.
5154 NumBytesToCreateFrame += 8;
5155 }
5156 }
5158 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
5159 NumBytesToCreateFrame, FrameID);
5160 }
5162 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
5163 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
5164 const Function &F = MF.getFunction();
5166 // Can F be deduplicated by the linker? If it can, don't outline from it.
5167 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
5168 return false;
5170 // Don't outline from functions with section markings; the program could
5171 // expect that all the code is in the named section.
5172 // FIXME: Allow outlining from multiple functions with the same section
5173 // marking.
5174 if (F.hasSection())
5175 return false;
5177 // Outlining from functions with redzones is unsafe since the outliner may
5178 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
5179 // outline from it.
5180 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
5181 if (!AFI || AFI->hasRedZone().getValueOr(true))
5182 return false;
5184 // It's safe to outline from MF.
5185 return true;
5186 }
5188 bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
5189 unsigned &Flags) const {
5190 // Check if LR is available through all of the MBB. If it's not, then set
5191 // a flag.
5192 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
5193 "Suitable Machine Function for outlining must track liveness");
5194 LiveRegUnits LRU(getRegisterInfo());
5196 std::for_each(MBB.rbegin(), MBB.rend(),
5197 [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
5199 // Check if each of the unsafe registers is available...
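// (Per the AAPCS, X16/X17 and the NZCV condition flags are undefined on entry
// to and exit from a call, so they must not be live across the outlined call.)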
5200 bool W16AvailableInBlock = LRU.available(AArch64::W16);
5201 bool W17AvailableInBlock = LRU.available(AArch64::W17);
5202 bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);
5204 // If all of these are dead (and not live out), we know we don't have to check
5205 // them later.
5206 if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
5207 Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
5209 // Now, add the live outs to the set.
5210 LRU.addLiveOuts(MBB);
5212 // If any of these registers is available in the MBB, but also a live out of
5213 // the block, then we know outlining is unsafe.
5214 if (W16AvailableInBlock && !LRU.available(AArch64::W16))
5215 return false;
5216 if (W17AvailableInBlock && !LRU.available(AArch64::W17))
5217 return false;
5218 if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
5219 return false;
5221 // Check if there's a call inside this MachineBasicBlock. If there is, then
5222 // set a flag.
5223 if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
5224 Flags |= MachineOutlinerMBBFlags::HasCalls;
5226 MachineFunction *MF = MBB.getParent();
5228 // In the event that we outline, we may have to save LR. If there is an
5229 // available register in the MBB, then we'll always save LR there. Check if
5230 // this is true.
5231 bool CanSaveLR = false;
5232 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
5233 MF->getSubtarget().getRegisterInfo());
5235 // Check if there is an available register across the sequence that we can
5236 // use.
5237 for (unsigned Reg : AArch64::GPR64RegClass) {
5238 if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
5239 Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
5240 CanSaveLR = true;
5241 break;
5242 }
5243 }
5245 // Check if we have a register we can save LR to, and if LR was used
5246 // somewhere. If both of those things are true, then we need to evaluate the
5247 // safety of outlining stack instructions later.
5248 if (!CanSaveLR && !LRU.available(AArch64::LR))
5249 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
5251 return true;
5252 }
5254 outliner::InstrType
5255 AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
5256 unsigned Flags) const {
5257 MachineInstr &MI = *MIT;
5258 MachineBasicBlock *MBB = MI.getParent();
5259 MachineFunction *MF = MBB->getParent();
5260 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
5262 // Don't outline LOHs.
5263 if (FuncInfo->getLOHRelated().count(&MI))
5264 return outliner::InstrType::Illegal;
5266 // Don't allow debug values to impact outlining type.
5267 if (MI.isDebugInstr() || MI.isIndirectDebugValue())
5268 return outliner::InstrType::Invisible;
5270 // At this point, KILL instructions don't really tell us much so we can go
5271 // ahead and skip over them.
5272 if (MI.isKill())
5273 return outliner::InstrType::Invisible;
5275 // Is this a terminator for a basic block?
5276 if (MI.isTerminator()) {
5278 // Is this the end of a function?
5279 if (MI.getParent()->succ_empty())
5280 return outliner::InstrType::Legal;
5282 // It's not, so don't outline it.
5283 return outliner::InstrType::Illegal;
5284 }
5286 // Make sure none of the operands are un-outlinable.
5287 for (const MachineOperand &MOP : MI.operands()) {
5288 if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
5289 MOP.isTargetIndex())
5290 return outliner::InstrType::Illegal;
5292 // If it uses LR or W30 explicitly, then don't touch it.
5293 if (MOP.isReg() && !MOP.isImplicit() &&
5294 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
5295 return outliner::InstrType::Illegal;
5296 }
5298 // Special cases for instructions that can always be outlined, but will fail
5299 // the later tests. For example, ADRP is PC-relative, but can always be
5300 // outlined because it doesn't require a *specific* value to be in LR.
5301 if (MI.getOpcode() == AArch64::ADRP)
5302 return outliner::InstrType::Legal;
5304 // If MI is a call we might be able to outline it. We don't want to outline
5305 // any calls that rely on the position of items on the stack. When we outline
5306 // something containing a call, we have to emit a save and restore of LR in
5307 // the outlined function. Currently, this always happens by saving LR to the
5308 // stack. Thus, if we outline, say, half the parameters for a function call
5309 // plus the call, then we'll break the callee's expectations for the layout
5310 // of the stack.
5312 // FIXME: Allow calls to functions which construct a stack frame, as long
5313 // as they don't access arguments on the stack.
5314 // FIXME: Figure out some way to analyze functions defined in other modules.
5315 // We should be able to compute the memory usage based on the IR calling
5316 // convention, even if we can't see the definition.
5317 if (MI.isCall()) {
5318 // Get the function associated with the call. Look at each operand and find
5319 // the one that represents the callee and get its name.
5320 const Function *Callee = nullptr;
5321 for (const MachineOperand &MOP : MI.operands()) {
5322 if (MOP.isGlobal()) {
5323 Callee = dyn_cast<Function>(MOP.getGlobal());
5324 break;
5325 }
5326 }
5328 // Never outline calls to mcount. There isn't any rule that would require
5329 // this, but the Linux kernel's "ftrace" feature depends on it.
5330 if (Callee && Callee->getName() == "\01_mcount")
5331 return outliner::InstrType::Illegal;
5333 // If we don't know anything about the callee, assume it depends on the
5334 // stack layout of the caller. In that case, it's only legal to outline
5335 // as a tail-call. Whitelist the call instructions we know about so we
5336 // don't get unexpected results with call pseudo-instructions.
5337 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
5338 if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
5339 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
5341 if (!Callee)
5342 return UnknownCallOutlineType;
5344 // We have a function we have information about. Check if it's something we
5345 // can safely outline.
5346 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
5348 // We don't know what's going on with the callee at all. Don't touch it.
5349 if (!CalleeMF)
5350 return UnknownCallOutlineType;
5352 // Check if we know anything about the callee saves on the function. If we
5353 // don't, then don't touch it, since that implies that we haven't
5354 // computed anything about its stack frame yet.
5355 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
5356 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
5357 MFI.getNumObjects() > 0)
5358 return UnknownCallOutlineType;
5360 // At this point, we can say that CalleeMF ought not to pass anything on the
5361 // stack. Therefore, we can outline it.
5362 return outliner::InstrType::Legal;
5363 }
5365 // Don't outline positions.
5366 if (MI.isPosition())
5367 return outliner::InstrType::Illegal;
5369 // Don't touch the link register or W30.
5370 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
5371 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
5372 return outliner::InstrType::Illegal;
5374 // Don't outline BTI instructions, because that will prevent the outlining
5375 // site from being indirectly callable.
5376 if (MI.getOpcode() == AArch64::HINT) {
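// HINT #32, #34, #36 and #38 encode BTI, BTI c, BTI j and BTI jc respectively.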
5377 int64_t Imm = MI.getOperand(0).getImm();
5378 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5379 return outliner::InstrType::Illegal;
5380 }
5382 return outliner::InstrType::Legal;
5383 }
5385 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
5386 for (MachineInstr &MI : MBB) {
5387 MachineOperand *Base;
5388 unsigned Width;
5389 int64_t Offset;
5391 // Is this a load or store with an immediate offset with SP as the base?
5392 if (!MI.mayLoadOrStore() ||
5393 !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) ||
5394 (Base->isReg() && Base->getReg() != AArch64::SP))
5395 continue;
5397 // It is, so we have to fix it up.
5398 unsigned Scale;
5399 int64_t Dummy1, Dummy2;
5401 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
5402 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
5403 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
5404 assert(Scale != 0 && "Unexpected opcode!");
5406 // We've pushed the return address to the stack, so add 16 to the offset.
5407 // This is safe, since we already checked if it would overflow when we
5408 // checked if this instruction was legal to outline.
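// e.g. 'ldr x0, [sp, #8]' (Scale == 8, Offset == 8) becomes 'ldr x0, [sp, #24]',
// i.e. an encoded immediate of (8 + 16) / 8 == 3.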
5409 int64_t NewImm = (Offset + 16) / Scale;
5410 StackOffsetOperand.setImm(NewImm);
5411 }
5412 }
5414 void AArch64InstrInfo::buildOutlinedFrame(
5415 MachineBasicBlock &MBB, MachineFunction &MF,
5416 const outliner::OutlinedFunction &OF) const {
5417 // For thunk outlining, rewrite the last instruction from a call to a
5418 // tail-call.
5419 if (OF.FrameConstructionID == MachineOutlinerThunk) {
5420 MachineInstr *Call = &*--MBB.instr_end();
5421 unsigned TailOpcode;
5422 if (Call->getOpcode() == AArch64::BL) {
5423 TailOpcode = AArch64::TCRETURNdi;
5424 } else {
5425 assert(Call->getOpcode() == AArch64::BLR);
5426 TailOpcode = AArch64::TCRETURNriALL;
5427 }
5428 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
5429 .add(Call->getOperand(0))
5430 .addImm(0);
5431 MBB.insert(MBB.end(), TC);
5432 Call->eraseFromParent();
5433 }
5435 // Is there a call in the outlined range?
5436 auto IsNonTailCall = [](MachineInstr &MI) {
5437 return MI.isCall() && !MI.isReturn();
5438 };
5439 if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
5440 // Fix up the instructions in the range, since we're going to modify the
5441 // stack.
5442 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
5443 "Can only fix up stack references once");
5444 fixupPostOutline(MBB);
5446 // LR has to be a live in so that we can save it.
5447 MBB.addLiveIn(AArch64::LR);
5449 MachineBasicBlock::iterator It = MBB.begin();
5450 MachineBasicBlock::iterator Et = MBB.end();
5452 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
5453 OF.FrameConstructionID == MachineOutlinerThunk)
5454 Et = std::prev(MBB.end());
5456 // Insert a save before the outlined region
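// (str x30, [sp, #-16]!: a pre-indexed store that also allocates the save slot.)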
5457 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
5458 .addReg(AArch64::SP, RegState::Define)
5459 .addReg(AArch64::LR)
5460 .addReg(AArch64::SP)
5461 .addImm(-16);
5462 It = MBB.insert(It, STRXpre);
5464 const TargetSubtargetInfo &STI = MF.getSubtarget();
5465 const MCRegisterInfo *MRI = STI.getRegisterInfo();
5466 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
5468 // Add a CFI saying the stack was moved 16 B down.
5469 int64_t StackPosEntry =
5470 MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16));
5471 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
5472 .addCFIIndex(StackPosEntry)
5473 .setMIFlags(MachineInstr::FrameSetup);
5475 // Add a CFI saying that the LR that we want to find is now 16 B higher than
5476 // before.
5477 int64_t LRPosEntry =
5478 MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16));
5479 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
5480 .addCFIIndex(LRPosEntry)
5481 .setMIFlags(MachineInstr::FrameSetup);
5483 // Insert a restore before the terminator for the function.
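// (ldr x30, [sp], #16: a post-indexed load that also deallocates the save slot.)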
5484 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
5485 .addReg(AArch64::SP, RegState::Define)
5486 .addReg(AArch64::LR, RegState::Define)
5487 .addReg(AArch64::SP)
5488 .addImm(16);
5489 Et = MBB.insert(Et, LDRXpost);
5490 }
5492 // If this is a tail call outlined function, then there's already a return.
5493 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
5494 OF.FrameConstructionID == MachineOutlinerThunk)
5495 return;
5497 // It's not a tail call, so we have to insert the return ourselves.
5498 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
5499 .addReg(AArch64::LR, RegState::Undef);
5500 MBB.insert(MBB.end(), ret);
5502 // Did we have to modify the stack by saving the link register?
5503 if (OF.FrameConstructionID != MachineOutlinerDefault)
5504 return;
5506 // We modified the stack.
5507 // Walk over the basic block and fix up all the stack accesses.
5508 fixupPostOutline(MBB);
5509 }
5511 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
5512 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
5513 MachineFunction &MF, const outliner::Candidate &C) const {
5515 // Are we tail calling?
5516 if (C.CallConstructionID == MachineOutlinerTailCall) {
5517 // If yes, then we can just branch to the label.
5518 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
5519 .addGlobalAddress(M.getNamedValue(MF.getName()))
5520 .addImm(0));
5521 return It;
5522 }
5524 // Are we saving the link register?
5525 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
5526 C.CallConstructionID == MachineOutlinerThunk) {
5527 // No, so just insert the call.
5528 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
5529 .addGlobalAddress(M.getNamedValue(MF.getName())));
5530 return It;
5531 }
5533 // We want to return the spot where we inserted the call.
5534 MachineBasicBlock::iterator CallPt;
5536 // Instructions for saving and restoring LR around the call instruction we're
5537 // going to insert.
5538 MachineInstr *Save;
5539 MachineInstr *Restore;
5540 // Can we save to a register?
5541 if (C.CallConstructionID == MachineOutlinerRegSave) {
5542 // FIXME: This logic should be sunk into a target-specific interface so that
5543 // we don't have to recompute the register.
5544 unsigned Reg = findRegisterToSaveLRTo(C);
5545 assert(Reg != 0 && "No callee-saved register available?");
5547 // Save and restore LR from that register.
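// ORRXrs with XZR and shift 0 is the canonical 'mov Xd, Xm', so these expand to
// 'mov Reg, lr' before the call and 'mov lr, Reg' after it.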
5548 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
5549 .addReg(AArch64::XZR)
5550 .addReg(AArch64::LR)
5551 .addImm(0);
5552 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
5553 .addReg(AArch64::XZR)
5554 .addReg(Reg)
5555 .addImm(0);
5556 } else {
5557 // We have the default case. Save and restore from SP.
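// i.e. 'str x30, [sp, #-16]!' before the call and 'ldr x30, [sp], #16' after it.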
5558 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
5559 .addReg(AArch64::SP, RegState::Define)
5560 .addReg(AArch64::LR)
5561 .addReg(AArch64::SP)
5562 .addImm(-16);
5563 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
5564 .addReg(AArch64::SP, RegState::Define)
5565 .addReg(AArch64::LR, RegState::Define)
5566 .addReg(AArch64::SP)
5567 .addImm(16);
5568 }
5570 It = MBB.insert(It, Save);
5571 It++;
5573 // Insert the call.
5574 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
5575 .addGlobalAddress(M.getNamedValue(MF.getName())));
5576 CallPt = It;
5577 It++;
5579 It = MBB.insert(It, Restore);
5580 return CallPt;
5581 }
5583 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
5584 MachineFunction &MF) const {
5585 return MF.getFunction().optForMinSize();
5586 }
5588 #define GET_INSTRINFO_HELPERS
5589 #include "AArch64GenInstrInfo.inc"