1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
11 //===----------------------------------------------------------------------===//
13 #include "AArch64InstrInfo.h"
14 #include "AArch64ExpandImm.h"
15 #include "AArch64MachineFunctionInfo.h"
16 #include "AArch64PointerAuth.h"
17 #include "AArch64Subtarget.h"
18 #include "MCTargetDesc/AArch64AddressingModes.h"
19 #include "MCTargetDesc/AArch64MCTargetDesc.h"
20 #include "Utils/AArch64BaseInfo.h"
21 #include "llvm/ADT/ArrayRef.h"
22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/CodeGen/LivePhysRegs.h"
25 #include "llvm/CodeGen/MachineBasicBlock.h"
26 #include "llvm/CodeGen/MachineCombinerPattern.h"
27 #include "llvm/CodeGen/MachineFrameInfo.h"
28 #include "llvm/CodeGen/MachineFunction.h"
29 #include "llvm/CodeGen/MachineInstr.h"
30 #include "llvm/CodeGen/MachineInstrBuilder.h"
31 #include "llvm/CodeGen/MachineMemOperand.h"
32 #include "llvm/CodeGen/MachineModuleInfo.h"
33 #include "llvm/CodeGen/MachineOperand.h"
34 #include "llvm/CodeGen/MachineRegisterInfo.h"
35 #include "llvm/CodeGen/RegisterScavenging.h"
36 #include "llvm/CodeGen/StackMaps.h"
37 #include "llvm/CodeGen/TargetRegisterInfo.h"
38 #include "llvm/CodeGen/TargetSubtargetInfo.h"
39 #include "llvm/IR/DebugInfoMetadata.h"
40 #include "llvm/IR/DebugLoc.h"
41 #include "llvm/IR/GlobalValue.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/MC/MCAsmInfo.h"
44 #include "llvm/MC/MCInst.h"
45 #include "llvm/MC/MCInstBuilder.h"
46 #include "llvm/MC/MCInstrDesc.h"
47 #include "llvm/Support/Casting.h"
48 #include "llvm/Support/CodeGen.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Support/LEB128.h"
52 #include "llvm/Support/MathExtras.h"
53 #include "llvm/Target/TargetMachine.h"
54 #include "llvm/Target/TargetOptions.h"
55 #include <cassert>
56 #include <cstdint>
57 #include <iterator>
58 #include <utility>
60 using namespace llvm;
62 #define GET_INSTRINFO_CTOR_DTOR
63 #include "AArch64GenInstrInfo.inc"
65 static cl::opt<unsigned> TBZDisplacementBits(
66 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
67 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
69 static cl::opt<unsigned> CBZDisplacementBits(
70 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
71 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
73 static cl::opt<unsigned>
74 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
75 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
77 static cl::opt<unsigned>
78 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
79 cl::desc("Restrict range of B instructions (DEBUG)"));
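// These debug options default to the architectural immediate widths: TB(N)Z
// uses a 14-bit, CB(N)Z and B.cc a 19-bit, and B a 26-bit signed offset, all
// scaled by the 4-byte instruction size. For illustration, that gives
// approximate ranges of +/-32KiB, +/-1MiB and +/-128MiB respectively (see
// isBranchOffsetInRange below, which divides the byte offset by 4).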
81 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
82 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
83 AArch64::CATCHRET),
84 RI(STI.getTargetTriple()), Subtarget(STI) {}
86 /// GetInstSize - Return the number of bytes of code the specified
87 /// instruction may occupy. This returns the maximum number of bytes.
88 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
89 const MachineBasicBlock &MBB = *MI.getParent();
90 const MachineFunction *MF = MBB.getParent();
91 const Function &F = MF->getFunction();
92 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
95 auto Op = MI.getOpcode();
96 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
97 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
100 // Meta-instructions emit no code.
101 if (MI.isMetaInstruction())
102 return 0;
104 // FIXME: We currently only handle pseudoinstructions that don't get expanded
105 // before the assembly printer.
106 unsigned NumBytes = 0;
107 const MCInstrDesc &Desc = MI.getDesc();
109 if (!MI.isBundle() && isTailCallReturnInst(MI)) {
110 NumBytes = Desc.getSize() ? Desc.getSize() : 4;
112 const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
113 if (!MFI->shouldSignReturnAddress(MF))
114 return NumBytes;
116 const auto &STI = MF->getSubtarget<AArch64Subtarget>();
117 auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
118 NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
119 return NumBytes;
122 // The size should preferably be set in
123 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
124 // The specific cases below handle instructions of variable size.
125 switch (Desc.getOpcode()) {
126 default:
127 if (Desc.getSize())
128 return Desc.getSize();
130 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
131 // with fixed constant size but not specified in .td file) is a normal
132 // 4-byte insn.
133 NumBytes = 4;
134 break;
135 case TargetOpcode::STACKMAP:
136 // The upper bound for a stackmap intrinsic is the full length of its shadow
137 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
138 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
139 break;
140 case TargetOpcode::PATCHPOINT:
141 // The size of the patchpoint intrinsic is the number of bytes requested
142 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
143 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
144 break;
145 case TargetOpcode::STATEPOINT:
146 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
147 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
148 // No patch bytes means a normal call inst is emitted
149 if (NumBytes == 0)
150 NumBytes = 4;
151 break;
152 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
153 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
154 // instructions are expanded to the specified number of NOPs. Otherwise,
155 // they are expanded to 36-byte XRay sleds.
156 NumBytes =
157 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
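// Note: with the default of 9 instructions this is 36 bytes, matching the
// XRay sled size handled below.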
158 break;
159 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
160 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
161 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
162 NumBytes = 36;
163 break;
164 case TargetOpcode::PATCHABLE_EVENT_CALL:
165 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
166 NumBytes = 24;
167 break;
169 case AArch64::SPACE:
170 NumBytes = MI.getOperand(1).getImm();
171 break;
172 case TargetOpcode::BUNDLE:
173 NumBytes = getInstBundleLength(MI);
174 break;
177 return NumBytes;
180 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
181 unsigned Size = 0;
182 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
183 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
184 while (++I != E && I->isInsideBundle()) {
185 assert(!I->isBundle() && "No nested bundle!");
186 Size += getInstSizeInBytes(*I);
188 return Size;
191 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
192 SmallVectorImpl<MachineOperand> &Cond) {
193 // Block ends with fall-through condbranch.
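// The Cond vector encodes the branch as follows (mirrored by
// instantiateCondBranch and insertSelect below):
//   Bcc:     Cond = { condition code }
//   CB(N)Z:  Cond = { -1, opcode, register }
//   TB(N)Z:  Cond = { -1, opcode, register, bit number }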
194 switch (LastInst->getOpcode()) {
195 default:
196 llvm_unreachable("Unknown branch instruction?");
197 case AArch64::Bcc:
198 Target = LastInst->getOperand(1).getMBB();
199 Cond.push_back(LastInst->getOperand(0));
200 break;
201 case AArch64::CBZW:
202 case AArch64::CBZX:
203 case AArch64::CBNZW:
204 case AArch64::CBNZX:
205 Target = LastInst->getOperand(1).getMBB();
206 Cond.push_back(MachineOperand::CreateImm(-1));
207 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
208 Cond.push_back(LastInst->getOperand(0));
209 break;
210 case AArch64::TBZW:
211 case AArch64::TBZX:
212 case AArch64::TBNZW:
213 case AArch64::TBNZX:
214 Target = LastInst->getOperand(2).getMBB();
215 Cond.push_back(MachineOperand::CreateImm(-1));
216 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
217 Cond.push_back(LastInst->getOperand(0));
218 Cond.push_back(LastInst->getOperand(1));
222 static unsigned getBranchDisplacementBits(unsigned Opc) {
223 switch (Opc) {
224 default:
225 llvm_unreachable("unexpected opcode!");
226 case AArch64::B:
227 return BDisplacementBits;
228 case AArch64::TBNZW:
229 case AArch64::TBZW:
230 case AArch64::TBNZX:
231 case AArch64::TBZX:
232 return TBZDisplacementBits;
233 case AArch64::CBNZW:
234 case AArch64::CBZW:
235 case AArch64::CBNZX:
236 case AArch64::CBZX:
237 return CBZDisplacementBits;
238 case AArch64::Bcc:
239 return BCCDisplacementBits;
243 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
244 int64_t BrOffset) const {
245 unsigned Bits = getBranchDisplacementBits(BranchOp);
246 assert(Bits >= 3 && "max branch displacement must be enough to jump"
247 "over conditional branch expansion");
248 return isIntN(Bits, BrOffset / 4);
251 MachineBasicBlock *
252 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
253 switch (MI.getOpcode()) {
254 default:
255 llvm_unreachable("unexpected opcode!");
256 case AArch64::B:
257 return MI.getOperand(0).getMBB();
258 case AArch64::TBZW:
259 case AArch64::TBNZW:
260 case AArch64::TBZX:
261 case AArch64::TBNZX:
262 return MI.getOperand(2).getMBB();
263 case AArch64::CBZW:
264 case AArch64::CBNZW:
265 case AArch64::CBZX:
266 case AArch64::CBNZX:
267 case AArch64::Bcc:
268 return MI.getOperand(1).getMBB();
272 void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
273 MachineBasicBlock &NewDestBB,
274 MachineBasicBlock &RestoreBB,
275 const DebugLoc &DL,
276 int64_t BrOffset,
277 RegScavenger *RS) const {
278 assert(RS && "RegScavenger required for long branching");
279 assert(MBB.empty() &&
280 "new block should be inserted for expanding unconditional branch");
281 assert(MBB.pred_size() == 1);
282 assert(RestoreBB.empty() &&
283 "restore block should be inserted for restoring clobbered registers");
285 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
286 // Offsets outside of the signed 33-bit range are not supported for ADRP +
287 // ADD.
288 if (!isInt<33>(BrOffset))
289 report_fatal_error(
290 "Branch offsets outside of the signed 33-bit range not supported");
292 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
293 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
294 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
295 .addReg(Reg)
296 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
297 .addImm(0);
298 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
301 RS->enterBasicBlockEnd(MBB);
302 // If X16 is unused, we can rely on the linker to insert a range extension
303 // thunk if NewDestBB is out of range of a single B instruction.
304 constexpr Register Reg = AArch64::X16;
305 if (!RS->isRegUsed(Reg)) {
306 insertUnconditionalBranch(MBB, &NewDestBB, DL);
307 RS->setRegUsed(Reg);
308 return;
311 // If there's a free register and it's worth inflating the code size,
312 // manually insert the indirect branch.
313 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
314 if (Scavenged != AArch64::NoRegister &&
315 MBB.getSectionID() == MBBSectionID::ColdSectionID) {
316 buildIndirectBranch(Scavenged, NewDestBB);
317 RS->setRegUsed(Scavenged);
318 return;
321 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
322 // with red zones.
323 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
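// hasRedZone() is a std::optional; if the red-zone status has not been
// computed yet, value_or(true) conservatively assumes a red zone is present.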
324 if (!AFI || AFI->hasRedZone().value_or(true))
325 report_fatal_error(
326 "Unable to insert indirect branch inside function that has red zone");
328 // Otherwise, spill X16 and defer range extension to the linker.
329 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
330 .addReg(AArch64::SP, RegState::Define)
331 .addReg(Reg)
332 .addReg(AArch64::SP)
333 .addImm(-16);
335 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
337 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
338 .addReg(AArch64::SP, RegState::Define)
339 .addReg(Reg, RegState::Define)
340 .addReg(AArch64::SP)
341 .addImm(16);
344 // Branch analysis.
345 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
346 MachineBasicBlock *&TBB,
347 MachineBasicBlock *&FBB,
348 SmallVectorImpl<MachineOperand> &Cond,
349 bool AllowModify) const {
350 // If the block has no terminators, it just falls into the block after it.
351 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
352 if (I == MBB.end())
353 return false;
355 // Skip over SpeculationBarrierEndBB terminators
356 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
357 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
358 --I;
361 if (!isUnpredicatedTerminator(*I))
362 return false;
364 // Get the last instruction in the block.
365 MachineInstr *LastInst = &*I;
367 // If there is only one terminator instruction, process it.
368 unsigned LastOpc = LastInst->getOpcode();
369 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
370 if (isUncondBranchOpcode(LastOpc)) {
371 TBB = LastInst->getOperand(0).getMBB();
372 return false;
374 if (isCondBranchOpcode(LastOpc)) {
375 // Block ends with fall-through condbranch.
376 parseCondBranch(LastInst, TBB, Cond);
377 return false;
379 return true; // Can't handle indirect branch.
382 // Get the instruction before it if it is a terminator.
383 MachineInstr *SecondLastInst = &*I;
384 unsigned SecondLastOpc = SecondLastInst->getOpcode();
386 // If AllowModify is true and the block ends with two or more unconditional
387 // branches, delete all but the first unconditional branch.
388 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
389 while (isUncondBranchOpcode(SecondLastOpc)) {
390 LastInst->eraseFromParent();
391 LastInst = SecondLastInst;
392 LastOpc = LastInst->getOpcode();
393 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
394 // Return now; the only terminator is an unconditional branch.
395 TBB = LastInst->getOperand(0).getMBB();
396 return false;
398 SecondLastInst = &*I;
399 SecondLastOpc = SecondLastInst->getOpcode();
403 // If we're allowed to modify and the block ends in an unconditional branch
404 // which could simply fall through, remove the branch. (Note: This case only
405 // matters when we can't understand the whole sequence, otherwise it's also
406 // handled by BranchFolding.cpp.)
407 if (AllowModify && isUncondBranchOpcode(LastOpc) &&
408 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
409 LastInst->eraseFromParent();
410 LastInst = SecondLastInst;
411 LastOpc = LastInst->getOpcode();
412 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
413 assert(!isUncondBranchOpcode(LastOpc) &&
414 "unreachable unconditional branches removed above");
416 if (isCondBranchOpcode(LastOpc)) {
417 // Block ends with fall-through condbranch.
418 parseCondBranch(LastInst, TBB, Cond);
419 return false;
421 return true; // Can't handle indirect branch.
423 SecondLastInst = &*I;
424 SecondLastOpc = SecondLastInst->getOpcode();
427 // If there are three terminators, we don't know what sort of block this is.
428 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
429 return true;
431 // If the block ends with a B and a Bcc, handle it.
432 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
433 parseCondBranch(SecondLastInst, TBB, Cond);
434 FBB = LastInst->getOperand(0).getMBB();
435 return false;
438 // If the block ends with two unconditional branches, handle it. The second
439 // one is not executed, so remove it.
440 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
441 TBB = SecondLastInst->getOperand(0).getMBB();
442 I = LastInst;
443 if (AllowModify)
444 I->eraseFromParent();
445 return false;
448 // ...likewise if it ends with an indirect branch followed by an unconditional
449 // branch.
450 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
451 I = LastInst;
452 if (AllowModify)
453 I->eraseFromParent();
454 return true;
457 // Otherwise, can't handle this.
458 return true;
461 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
462 MachineBranchPredicate &MBP,
463 bool AllowModify) const {
464 // For the moment, handle only a block which ends with a cb(n)zx followed by
465 // a fallthrough. Why this? Because it is a common form.
466 // TODO: Should we handle b.cc?
468 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
469 if (I == MBB.end())
470 return true;
472 // Skip over SpeculationBarrierEndBB terminators
473 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
474 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
475 --I;
478 if (!isUnpredicatedTerminator(*I))
479 return true;
481 // Get the last instruction in the block.
482 MachineInstr *LastInst = &*I;
483 unsigned LastOpc = LastInst->getOpcode();
484 if (!isCondBranchOpcode(LastOpc))
485 return true;
487 switch (LastOpc) {
488 default:
489 return true;
490 case AArch64::CBZW:
491 case AArch64::CBZX:
492 case AArch64::CBNZW:
493 case AArch64::CBNZX:
494 break;
497 MBP.TrueDest = LastInst->getOperand(1).getMBB();
498 assert(MBP.TrueDest && "expected!");
499 MBP.FalseDest = MBB.getNextNode();
501 MBP.ConditionDef = nullptr;
502 MBP.SingleUseCondition = false;
504 MBP.LHS = LastInst->getOperand(0);
505 MBP.RHS = MachineOperand::CreateImm(0);
506 MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
507 : MachineBranchPredicate::PRED_EQ;
508 return false;
511 bool AArch64InstrInfo::reverseBranchCondition(
512 SmallVectorImpl<MachineOperand> &Cond) const {
513 if (Cond[0].getImm() != -1) {
514 // Regular Bcc
515 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
516 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
517 } else {
518 // Folded compare-and-branch
519 switch (Cond[1].getImm()) {
520 default:
521 llvm_unreachable("Unknown conditional branch!");
522 case AArch64::CBZW:
523 Cond[1].setImm(AArch64::CBNZW);
524 break;
525 case AArch64::CBNZW:
526 Cond[1].setImm(AArch64::CBZW);
527 break;
528 case AArch64::CBZX:
529 Cond[1].setImm(AArch64::CBNZX);
530 break;
531 case AArch64::CBNZX:
532 Cond[1].setImm(AArch64::CBZX);
533 break;
534 case AArch64::TBZW:
535 Cond[1].setImm(AArch64::TBNZW);
536 break;
537 case AArch64::TBNZW:
538 Cond[1].setImm(AArch64::TBZW);
539 break;
540 case AArch64::TBZX:
541 Cond[1].setImm(AArch64::TBNZX);
542 break;
543 case AArch64::TBNZX:
544 Cond[1].setImm(AArch64::TBZX);
545 break;
549 return false;
552 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
553 int *BytesRemoved) const {
554 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
555 if (I == MBB.end())
556 return 0;
558 if (!isUncondBranchOpcode(I->getOpcode()) &&
559 !isCondBranchOpcode(I->getOpcode()))
560 return 0;
562 // Remove the branch.
563 I->eraseFromParent();
565 I = MBB.end();
567 if (I == MBB.begin()) {
568 if (BytesRemoved)
569 *BytesRemoved = 4;
570 return 1;
572 --I;
573 if (!isCondBranchOpcode(I->getOpcode())) {
574 if (BytesRemoved)
575 *BytesRemoved = 4;
576 return 1;
579 // Remove the branch.
580 I->eraseFromParent();
581 if (BytesRemoved)
582 *BytesRemoved = 8;
584 return 2;
587 void AArch64InstrInfo::instantiateCondBranch(
588 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
589 ArrayRef<MachineOperand> Cond) const {
590 if (Cond[0].getImm() != -1) {
591 // Regular Bcc
592 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
593 } else {
594 // Folded compare-and-branch
595 // Note that we use .add() instead of .addReg() to preserve the operand flags.
596 const MachineInstrBuilder MIB =
597 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
598 if (Cond.size() > 3)
599 MIB.addImm(Cond[3].getImm());
600 MIB.addMBB(TBB);
604 unsigned AArch64InstrInfo::insertBranch(
605 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
606 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
607 // Shouldn't be a fall through.
608 assert(TBB && "insertBranch must not be told to insert a fallthrough");
610 if (!FBB) {
611 if (Cond.empty()) // Unconditional branch?
612 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
613 else
614 instantiateCondBranch(MBB, DL, TBB, Cond);
616 if (BytesAdded)
617 *BytesAdded = 4;
619 return 1;
622 // Two-way conditional branch.
623 instantiateCondBranch(MBB, DL, TBB, Cond);
624 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
626 if (BytesAdded)
627 *BytesAdded = 8;
629 return 2;
632 // Find the original register that VReg is copied from.
633 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
634 while (Register::isVirtualRegister(VReg)) {
635 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
636 if (!DefMI->isFullCopy())
637 return VReg;
638 VReg = DefMI->getOperand(1).getReg();
640 return VReg;
643 // Determine if VReg is defined by an instruction that can be folded into a
644 // csel instruction. If so, return the folded opcode, and the replacement
645 // register.
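// e.g. if VReg is defined by '%x = ADDWri %y, 1, 0' (x = y + 1), the caller
// can use CSINCWr with %y as the replacement register instead of selecting %x.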
646 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
647 unsigned *NewVReg = nullptr) {
648 VReg = removeCopies(MRI, VReg);
649 if (!Register::isVirtualRegister(VReg))
650 return 0;
652 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
653 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
654 unsigned Opc = 0;
655 unsigned SrcOpNum = 0;
656 switch (DefMI->getOpcode()) {
657 case AArch64::ADDSXri:
658 case AArch64::ADDSWri:
659 // if NZCV is used, do not fold.
660 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
661 true) == -1)
662 return 0;
663 // fall-through to ADDXri and ADDWri.
664 [[fallthrough]];
665 case AArch64::ADDXri:
666 case AArch64::ADDWri:
667 // add x, 1 -> csinc.
668 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
669 DefMI->getOperand(3).getImm() != 0)
670 return 0;
671 SrcOpNum = 1;
672 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
673 break;
675 case AArch64::ORNXrr:
676 case AArch64::ORNWrr: {
677 // not x -> csinv, represented as orn dst, xzr, src.
678 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
679 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
680 return 0;
681 SrcOpNum = 2;
682 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
683 break;
686 case AArch64::SUBSXrr:
687 case AArch64::SUBSWrr:
688 // if NZCV is used, do not fold.
689 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
690 true) == -1)
691 return 0;
692 // fall-through to SUBXrr and SUBWrr.
693 [[fallthrough]];
694 case AArch64::SUBXrr:
695 case AArch64::SUBWrr: {
696 // neg x -> csneg, represented as sub dst, xzr, src.
697 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
698 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
699 return 0;
700 SrcOpNum = 2;
701 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
702 break;
704 default:
705 return 0;
707 assert(Opc && SrcOpNum && "Missing parameters");
709 if (NewVReg)
710 *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
711 return Opc;
714 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
715 ArrayRef<MachineOperand> Cond,
716 Register DstReg, Register TrueReg,
717 Register FalseReg, int &CondCycles,
718 int &TrueCycles,
719 int &FalseCycles) const {
720 // Check register classes.
721 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
722 const TargetRegisterClass *RC =
723 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
724 if (!RC)
725 return false;
727 // Also need to check the dest regclass, in case we're trying to optimize
728 // something like:
729 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
730 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
731 return false;
733 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
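// (A Cond vector of size 1 is a plain Bcc whose flags are already set; the
// cbz/tbz forms require materializing a SUBS/ANDS first, see insertSelect.)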
734 unsigned ExtraCondLat = Cond.size() != 1;
736 // GPRs are handled by csel.
737 // FIXME: Fold in x+1, -x, and ~x when applicable.
738 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
739 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
740 // Single-cycle csel, csinc, csinv, and csneg.
741 CondCycles = 1 + ExtraCondLat;
742 TrueCycles = FalseCycles = 1;
743 if (canFoldIntoCSel(MRI, TrueReg))
744 TrueCycles = 0;
745 else if (canFoldIntoCSel(MRI, FalseReg))
746 FalseCycles = 0;
747 return true;
750 // Scalar floating point is handled by fcsel.
751 // FIXME: Form fabs, fmin, and fmax when applicable.
752 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
753 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
754 CondCycles = 5 + ExtraCondLat;
755 TrueCycles = FalseCycles = 2;
756 return true;
759 // Can't do vectors.
760 return false;
763 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
764 MachineBasicBlock::iterator I,
765 const DebugLoc &DL, Register DstReg,
766 ArrayRef<MachineOperand> Cond,
767 Register TrueReg, Register FalseReg) const {
768 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
770 // Parse the condition code, see parseCondBranch() above.
771 AArch64CC::CondCode CC;
772 switch (Cond.size()) {
773 default:
774 llvm_unreachable("Unknown condition opcode in Cond");
775 case 1: // b.cc
776 CC = AArch64CC::CondCode(Cond[0].getImm());
777 break;
778 case 3: { // cbz/cbnz
779 // We must insert a compare against 0.
780 bool Is64Bit;
781 switch (Cond[1].getImm()) {
782 default:
783 llvm_unreachable("Unknown branch opcode in Cond");
784 case AArch64::CBZW:
785 Is64Bit = false;
786 CC = AArch64CC::EQ;
787 break;
788 case AArch64::CBZX:
789 Is64Bit = true;
790 CC = AArch64CC::EQ;
791 break;
792 case AArch64::CBNZW:
793 Is64Bit = false;
794 CC = AArch64CC::NE;
795 break;
796 case AArch64::CBNZX:
797 Is64Bit = true;
798 CC = AArch64CC::NE;
799 break;
801 Register SrcReg = Cond[2].getReg();
802 if (Is64Bit) {
803 // cmp reg, #0 is actually subs xzr, reg, #0.
804 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
805 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
806 .addReg(SrcReg)
807 .addImm(0)
808 .addImm(0);
809 } else {
810 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
811 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
812 .addReg(SrcReg)
813 .addImm(0)
814 .addImm(0);
816 break;
818 case 4: { // tbz/tbnz
819 // We must insert a tst instruction.
820 switch (Cond[1].getImm()) {
821 default:
822 llvm_unreachable("Unknown branch opcode in Cond");
823 case AArch64::TBZW:
824 case AArch64::TBZX:
825 CC = AArch64CC::EQ;
826 break;
827 case AArch64::TBNZW:
828 case AArch64::TBNZX:
829 CC = AArch64CC::NE;
830 break;
832 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
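// e.g. testing bit 3 of w0 becomes 'ands wzr, w0, #0x8', with the mask given
// as an encoded logical immediate.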
833 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
834 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
835 .addReg(Cond[2].getReg())
836 .addImm(
837 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
838 else
839 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
840 .addReg(Cond[2].getReg())
841 .addImm(
842 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
843 break;
847 unsigned Opc = 0;
848 const TargetRegisterClass *RC = nullptr;
849 bool TryFold = false;
850 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
851 RC = &AArch64::GPR64RegClass;
852 Opc = AArch64::CSELXr;
853 TryFold = true;
854 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
855 RC = &AArch64::GPR32RegClass;
856 Opc = AArch64::CSELWr;
857 TryFold = true;
858 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
859 RC = &AArch64::FPR64RegClass;
860 Opc = AArch64::FCSELDrrr;
861 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
862 RC = &AArch64::FPR32RegClass;
863 Opc = AArch64::FCSELSrrr;
865 assert(RC && "Unsupported regclass");
867 // Try folding simple instructions into the csel.
868 if (TryFold) {
869 unsigned NewVReg = 0;
870 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
871 if (FoldedOpc) {
872 // The folded opcodes csinc, csinv and csneg apply the operation to
873 // FalseReg, so we need to invert the condition.
874 CC = AArch64CC::getInvertedCondCode(CC);
875 TrueReg = FalseReg;
876 } else
877 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
879 // Fold the operation. Leave any dead instructions for DCE to clean up.
880 if (FoldedOpc) {
881 FalseReg = NewVReg;
882 Opc = FoldedOpc;
883 // This extends the live range of NewVReg.
884 MRI.clearKillFlags(NewVReg);
888 // Pull all virtual registers into the appropriate class.
889 MRI.constrainRegClass(TrueReg, RC);
890 MRI.constrainRegClass(FalseReg, RC);
892 // Insert the csel.
893 BuildMI(MBB, I, DL, get(Opc), DstReg)
894 .addReg(TrueReg)
895 .addReg(FalseReg)
896 .addImm(CC);
899 // Return true if Imm can be loaded into a register by a "cheap" sequence of
900 // instructions. For now, "cheap" means at most two instructions.
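// For illustration: 0xfffffffffffffffe expands to a single MOVN and is cheap,
// whereas an arbitrary 64-bit constant may need a MOVZ plus three MOVKs and
// would not be.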
901 static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
902 if (BitSize == 32)
903 return true;
905 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
906 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
907 SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
908 AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
910 return Is.size() <= 2;
913 // FIXME: this implementation should be micro-architecture dependent, so a
914 // micro-architecture target hook should be introduced here in the future.
915 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
916 if (Subtarget.hasExynosCheapAsMoveHandling()) {
917 if (isExynosCheapAsMove(MI))
918 return true;
919 return MI.isAsCheapAsAMove();
922 switch (MI.getOpcode()) {
923 default:
924 return MI.isAsCheapAsAMove();
926 case AArch64::ADDWrs:
927 case AArch64::ADDXrs:
928 case AArch64::SUBWrs:
929 case AArch64::SUBXrs:
930 return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
932 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
933 // ORRXri, it is as cheap as MOV.
934 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
935 case AArch64::MOVi32imm:
936 return isCheapImmediate(MI, 32);
937 case AArch64::MOVi64imm:
938 return isCheapImmediate(MI, 64);
942 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
943 switch (MI.getOpcode()) {
944 default:
945 return false;
947 case AArch64::ADDWrs:
948 case AArch64::ADDXrs:
949 case AArch64::ADDSWrs:
950 case AArch64::ADDSXrs: {
951 unsigned Imm = MI.getOperand(3).getImm();
952 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
953 if (ShiftVal == 0)
954 return true;
955 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
958 case AArch64::ADDWrx:
959 case AArch64::ADDXrx:
960 case AArch64::ADDXrx64:
961 case AArch64::ADDSWrx:
962 case AArch64::ADDSXrx:
963 case AArch64::ADDSXrx64: {
964 unsigned Imm = MI.getOperand(3).getImm();
965 switch (AArch64_AM::getArithExtendType(Imm)) {
966 default:
967 return false;
968 case AArch64_AM::UXTB:
969 case AArch64_AM::UXTH:
970 case AArch64_AM::UXTW:
971 case AArch64_AM::UXTX:
972 return AArch64_AM::getArithShiftValue(Imm) <= 4;
976 case AArch64::SUBWrs:
977 case AArch64::SUBSWrs: {
978 unsigned Imm = MI.getOperand(3).getImm();
979 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
980 return ShiftVal == 0 ||
981 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
984 case AArch64::SUBXrs:
985 case AArch64::SUBSXrs: {
986 unsigned Imm = MI.getOperand(3).getImm();
987 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
988 return ShiftVal == 0 ||
989 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
992 case AArch64::SUBWrx:
993 case AArch64::SUBXrx:
994 case AArch64::SUBXrx64:
995 case AArch64::SUBSWrx:
996 case AArch64::SUBSXrx:
997 case AArch64::SUBSXrx64: {
998 unsigned Imm = MI.getOperand(3).getImm();
999 switch (AArch64_AM::getArithExtendType(Imm)) {
1000 default:
1001 return false;
1002 case AArch64_AM::UXTB:
1003 case AArch64_AM::UXTH:
1004 case AArch64_AM::UXTW:
1005 case AArch64_AM::UXTX:
1006 return AArch64_AM::getArithShiftValue(Imm) == 0;
1010 case AArch64::LDRBBroW:
1011 case AArch64::LDRBBroX:
1012 case AArch64::LDRBroW:
1013 case AArch64::LDRBroX:
1014 case AArch64::LDRDroW:
1015 case AArch64::LDRDroX:
1016 case AArch64::LDRHHroW:
1017 case AArch64::LDRHHroX:
1018 case AArch64::LDRHroW:
1019 case AArch64::LDRHroX:
1020 case AArch64::LDRQroW:
1021 case AArch64::LDRQroX:
1022 case AArch64::LDRSBWroW:
1023 case AArch64::LDRSBWroX:
1024 case AArch64::LDRSBXroW:
1025 case AArch64::LDRSBXroX:
1026 case AArch64::LDRSHWroW:
1027 case AArch64::LDRSHWroX:
1028 case AArch64::LDRSHXroW:
1029 case AArch64::LDRSHXroX:
1030 case AArch64::LDRSWroW:
1031 case AArch64::LDRSWroX:
1032 case AArch64::LDRSroW:
1033 case AArch64::LDRSroX:
1034 case AArch64::LDRWroW:
1035 case AArch64::LDRWroX:
1036 case AArch64::LDRXroW:
1037 case AArch64::LDRXroX:
1038 case AArch64::PRFMroW:
1039 case AArch64::PRFMroX:
1040 case AArch64::STRBBroW:
1041 case AArch64::STRBBroX:
1042 case AArch64::STRBroW:
1043 case AArch64::STRBroX:
1044 case AArch64::STRDroW:
1045 case AArch64::STRDroX:
1046 case AArch64::STRHHroW:
1047 case AArch64::STRHHroX:
1048 case AArch64::STRHroW:
1049 case AArch64::STRHroX:
1050 case AArch64::STRQroW:
1051 case AArch64::STRQroX:
1052 case AArch64::STRSroW:
1053 case AArch64::STRSroX:
1054 case AArch64::STRWroW:
1055 case AArch64::STRWroX:
1056 case AArch64::STRXroW:
1057 case AArch64::STRXroX: {
1058 unsigned IsSigned = MI.getOperand(3).getImm();
1059 return !IsSigned;
1064 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1065 unsigned Opc = MI.getOpcode();
1066 switch (Opc) {
1067 default:
1068 return false;
1069 case AArch64::SEH_StackAlloc:
1070 case AArch64::SEH_SaveFPLR:
1071 case AArch64::SEH_SaveFPLR_X:
1072 case AArch64::SEH_SaveReg:
1073 case AArch64::SEH_SaveReg_X:
1074 case AArch64::SEH_SaveRegP:
1075 case AArch64::SEH_SaveRegP_X:
1076 case AArch64::SEH_SaveFReg:
1077 case AArch64::SEH_SaveFReg_X:
1078 case AArch64::SEH_SaveFRegP:
1079 case AArch64::SEH_SaveFRegP_X:
1080 case AArch64::SEH_SetFP:
1081 case AArch64::SEH_AddFP:
1082 case AArch64::SEH_Nop:
1083 case AArch64::SEH_PrologEnd:
1084 case AArch64::SEH_EpilogStart:
1085 case AArch64::SEH_EpilogEnd:
1086 case AArch64::SEH_PACSignLR:
1087 case AArch64::SEH_SaveAnyRegQP:
1088 case AArch64::SEH_SaveAnyRegQPX:
1089 return true;
1093 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1094 Register &SrcReg, Register &DstReg,
1095 unsigned &SubIdx) const {
1096 switch (MI.getOpcode()) {
1097 default:
1098 return false;
1099 case AArch64::SBFMXri: // aka sxtw
1100 case AArch64::UBFMXri: // aka uxtw
1101 // Check for the 32 -> 64 bit extension case, these instructions can do
1102 // much more.
1103 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1104 return false;
1105 // This is a signed or unsigned 32 -> 64 bit extension.
1106 SrcReg = MI.getOperand(1).getReg();
1107 DstReg = MI.getOperand(0).getReg();
1108 SubIdx = AArch64::sub_32;
1109 return true;
1113 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1114 const MachineInstr &MIa, const MachineInstr &MIb) const {
1115 const TargetRegisterInfo *TRI = &getRegisterInfo();
1116 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1117 int64_t OffsetA = 0, OffsetB = 0;
1118 TypeSize WidthA(0, false), WidthB(0, false);
1119 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1121 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1122 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1124 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1125 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1126 return false;
1128 // Retrieve the base, the offset from the base, and the width. The width
1129 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1130 // the bases are identical, and the offset of the lower memory access plus
1131 // its width does not extend beyond the offset of the higher memory access,
1132 // then the two accesses do not overlap.
1133 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1134 // are assumed to have the same scale (vscale).
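// For example, a store to [x0, #0] of width 8 and a load from [x0, #8] of
// width 4 are disjoint, since 0 + 8 <= 8.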
1135 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1136 WidthA, TRI) &&
1137 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1138 WidthB, TRI)) {
1139 if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1140 OffsetAIsScalable == OffsetBIsScalable) {
1141 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1142 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1143 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1144 if (LowWidth.isScalable() == OffsetAIsScalable &&
1145 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1146 return true;
1149 return false;
1152 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1153 const MachineBasicBlock *MBB,
1154 const MachineFunction &MF) const {
1155 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1156 return true;
1158 // Do not move an instruction that can be recognized as a branch target.
1159 if (hasBTISemantics(MI))
1160 return true;
1162 switch (MI.getOpcode()) {
1163 case AArch64::HINT:
1164 // CSDB hints are scheduling barriers.
1165 if (MI.getOperand(0).getImm() == 0x14)
1166 return true;
1167 break;
1168 case AArch64::DSB:
1169 case AArch64::ISB:
1170 // DSB and ISB also are scheduling barriers.
1171 return true;
1172 case AArch64::MSRpstatesvcrImm1:
1173 // SMSTART and SMSTOP are also scheduling barriers.
1174 return true;
1175 default:;
1177 if (isSEHInstruction(MI))
1178 return true;
1179 auto Next = std::next(MI.getIterator());
1180 return Next != MBB->end() && Next->isCFIInstruction();
1183 /// analyzeCompare - For a comparison instruction, return the source registers
1184 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1185 /// Return true if the comparison instruction can be analyzed.
1186 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1187 Register &SrcReg2, int64_t &CmpMask,
1188 int64_t &CmpValue) const {
1189 // The first operand can be a frame index where we'd normally expect a
1190 // register.
1191 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1192 if (!MI.getOperand(1).isReg())
1193 return false;
1195 switch (MI.getOpcode()) {
1196 default:
1197 break;
1198 case AArch64::PTEST_PP:
1199 case AArch64::PTEST_PP_ANY:
1200 SrcReg = MI.getOperand(0).getReg();
1201 SrcReg2 = MI.getOperand(1).getReg();
1202 // Not sure about the mask and value for now...
1203 CmpMask = ~0;
1204 CmpValue = 0;
1205 return true;
1206 case AArch64::SUBSWrr:
1207 case AArch64::SUBSWrs:
1208 case AArch64::SUBSWrx:
1209 case AArch64::SUBSXrr:
1210 case AArch64::SUBSXrs:
1211 case AArch64::SUBSXrx:
1212 case AArch64::ADDSWrr:
1213 case AArch64::ADDSWrs:
1214 case AArch64::ADDSWrx:
1215 case AArch64::ADDSXrr:
1216 case AArch64::ADDSXrs:
1217 case AArch64::ADDSXrx:
1218 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1219 SrcReg = MI.getOperand(1).getReg();
1220 SrcReg2 = MI.getOperand(2).getReg();
1221 CmpMask = ~0;
1222 CmpValue = 0;
1223 return true;
1224 case AArch64::SUBSWri:
1225 case AArch64::ADDSWri:
1226 case AArch64::SUBSXri:
1227 case AArch64::ADDSXri:
1228 SrcReg = MI.getOperand(1).getReg();
1229 SrcReg2 = 0;
1230 CmpMask = ~0;
1231 CmpValue = MI.getOperand(2).getImm();
1232 return true;
1233 case AArch64::ANDSWri:
1234 case AArch64::ANDSXri:
1235 // ANDS does not use the same encoding scheme as the other xxxS
1236 // instructions.
1237 SrcReg = MI.getOperand(1).getReg();
1238 SrcReg2 = 0;
1239 CmpMask = ~0;
1240 CmpValue = AArch64_AM::decodeLogicalImmediate(
1241 MI.getOperand(2).getImm(),
1242 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1243 return true;
1246 return false;
1249 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1250 MachineBasicBlock *MBB = Instr.getParent();
1251 assert(MBB && "Can't get MachineBasicBlock here");
1252 MachineFunction *MF = MBB->getParent();
1253 assert(MF && "Can't get MachineFunction here");
1254 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1255 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1256 MachineRegisterInfo *MRI = &MF->getRegInfo();
1258 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1259 ++OpIdx) {
1260 MachineOperand &MO = Instr.getOperand(OpIdx);
1261 const TargetRegisterClass *OpRegCstraints =
1262 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1264 // If there's no constraint, there's nothing to do.
1265 if (!OpRegCstraints)
1266 continue;
1267 // If the operand is a frame index, there's nothing to do here.
1268 // A frame index operand will resolve correctly during PEI.
1269 if (MO.isFI())
1270 continue;
1272 assert(MO.isReg() &&
1273 "Operand has register constraints without being a register!");
1275 Register Reg = MO.getReg();
1276 if (Reg.isPhysical()) {
1277 if (!OpRegCstraints->contains(Reg))
1278 return false;
1279 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1280 !MRI->constrainRegClass(Reg, OpRegCstraints))
1281 return false;
1284 return true;
1287 /// Return the opcode that does not set flags when possible - otherwise
1288 /// return the original opcode. The caller is responsible for doing the actual
1289 /// substitution and legality checking.
1290 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1291 // Don't convert all compare instructions, because for some the zero register
1292 // encoding becomes the sp register.
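// e.g. 'ADDS wzr, w1, #1' must stay ADDSWri: in ADDWri the destination
// encoding 31 would mean wsp rather than wzr.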
1293 bool MIDefinesZeroReg = false;
1294 if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1295 MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1296 MIDefinesZeroReg = true;
1298 switch (MI.getOpcode()) {
1299 default:
1300 return MI.getOpcode();
1301 case AArch64::ADDSWrr:
1302 return AArch64::ADDWrr;
1303 case AArch64::ADDSWri:
1304 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1305 case AArch64::ADDSWrs:
1306 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1307 case AArch64::ADDSWrx:
1308 return AArch64::ADDWrx;
1309 case AArch64::ADDSXrr:
1310 return AArch64::ADDXrr;
1311 case AArch64::ADDSXri:
1312 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1313 case AArch64::ADDSXrs:
1314 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1315 case AArch64::ADDSXrx:
1316 return AArch64::ADDXrx;
1317 case AArch64::SUBSWrr:
1318 return AArch64::SUBWrr;
1319 case AArch64::SUBSWri:
1320 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1321 case AArch64::SUBSWrs:
1322 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1323 case AArch64::SUBSWrx:
1324 return AArch64::SUBWrx;
1325 case AArch64::SUBSXrr:
1326 return AArch64::SUBXrr;
1327 case AArch64::SUBSXri:
1328 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1329 case AArch64::SUBSXrs:
1330 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1331 case AArch64::SUBSXrx:
1332 return AArch64::SUBXrx;
1336 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1338 /// True when condition flags are accessed (either by writing or reading)
1339 /// on the instruction trace starting at From and ending at To.
1341 /// Note: If From and To are from different blocks, it is assumed the condition
1342 /// flags are accessed on the path.
1343 static bool areCFlagsAccessedBetweenInstrs(
1344 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1345 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1346 // Early exit if To is at the beginning of the BB.
1347 if (To == To->getParent()->begin())
1348 return true;
1350 // Check whether the instructions are in the same basic block
1351 // If not, assume the condition flags might get modified somewhere.
1352 if (To->getParent() != From->getParent())
1353 return true;
1355 // From must be above To.
1356 assert(std::any_of(
1357 ++To.getReverse(), To->getParent()->rend(),
1358 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1360 // We iterate backward starting at \p To until we hit \p From.
1361 for (const MachineInstr &Instr :
1362 instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1363 if (((AccessToCheck & AK_Write) &&
1364 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1365 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1366 return true;
1368 return false;
1371 std::optional<unsigned>
1372 AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1373 MachineInstr *Pred,
1374 const MachineRegisterInfo *MRI) const {
1375 unsigned MaskOpcode = Mask->getOpcode();
1376 unsigned PredOpcode = Pred->getOpcode();
1377 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1378 bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1380 if (PredIsWhileLike) {
1381 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1382 // instruction and the condition is "any" since WHILEcc does an implicit
1383 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1384 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1385 return PredOpcode;
1387 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1388 // redundant since WHILE performs an implicit PTEST with an all active
1389 // mask.
1390 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1391 getElementSizeForOpcode(MaskOpcode) ==
1392 getElementSizeForOpcode(PredOpcode))
1393 return PredOpcode;
1395 return {};
1398 if (PredIsPTestLike) {
1399 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1400 // instruction that sets the flags as PTEST would and the condition is
1401 // "any" since PG is always a subset of the governing predicate of the
1402 // ptest-like instruction.
1403 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1404 return PredOpcode;
1406 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1407 // element size matches and either the PTEST_LIKE instruction uses
1408 // the same all active mask or the condition is "any".
1409 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1410 getElementSizeForOpcode(MaskOpcode) ==
1411 getElementSizeForOpcode(PredOpcode)) {
1412 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1413 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1414 return PredOpcode;
1417 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1418 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1419 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1420 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1421 // performed by the compare could consider fewer lanes for these element
1422 // sizes.
1424 // For example, consider
1426 // ptrue p0.b ; P0=1111-1111-1111-1111
1427 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1428 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1429 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1430 // ; ^ last active
1431 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1432 // ; ^ last active
1434 // where the compare generates a canonical all active 32-bit predicate
1435 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1436 // active flag, whereas the PTEST instruction with the same mask doesn't.
1437 // For PTEST_ANY this doesn't apply as the flags in this case would be
1438 // identical regardless of element size.
1439 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1440 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1441 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1442 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1443 return PredOpcode;
1445 return {};
1448 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1449 // opcode so the PTEST becomes redundant.
1450 switch (PredOpcode) {
1451 case AArch64::AND_PPzPP:
1452 case AArch64::BIC_PPzPP:
1453 case AArch64::EOR_PPzPP:
1454 case AArch64::NAND_PPzPP:
1455 case AArch64::NOR_PPzPP:
1456 case AArch64::ORN_PPzPP:
1457 case AArch64::ORR_PPzPP:
1458 case AArch64::BRKA_PPzP:
1459 case AArch64::BRKPA_PPzPP:
1460 case AArch64::BRKB_PPzP:
1461 case AArch64::BRKPB_PPzPP:
1462 case AArch64::RDFFR_PPz: {
1463 // Check to see if our mask is the same. If not, the resulting flag bits
1464 // may be different and we can't remove the ptest.
1465 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1466 if (Mask != PredMask)
1467 return {};
1468 break;
1470 case AArch64::BRKN_PPzP: {
1471 // BRKN uses an all active implicit mask to set flags unlike the other
1472 // flag-setting instructions.
1473 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1474 if ((MaskOpcode != AArch64::PTRUE_B) ||
1475 (Mask->getOperand(1).getImm() != 31))
1476 return {};
1477 break;
1479 case AArch64::PTRUE_B:
1480 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1481 break;
1482 default:
1483 // Bail out if we don't recognize the input
1484 return {};
1487 return convertToFlagSettingOpc(PredOpcode);
1490 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1491 /// operation that could set the flags in an identical manner.
1492 bool AArch64InstrInfo::optimizePTestInstr(
1493 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1494 const MachineRegisterInfo *MRI) const {
1495 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1496 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1497 unsigned PredOpcode = Pred->getOpcode();
1498 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1499 if (!NewOp)
1500 return false;
1502 const TargetRegisterInfo *TRI = &getRegisterInfo();
1504 // If another instruction between Pred and PTest accesses flags, don't remove
1505 // the ptest or update the earlier instruction to modify them.
1506 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1507 return false;
1509 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1510 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1511 // operand to be replaced with an equivalent instruction that also sets the
1512 // flags.
1513 PTest->eraseFromParent();
1514 if (*NewOp != PredOpcode) {
1515 Pred->setDesc(get(*NewOp));
1516 bool succeeded = UpdateOperandRegClass(*Pred);
1517 (void)succeeded;
1518 assert(succeeded && "Operands have incompatible register classes!");
1519 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1522 // Ensure that the flags def is live.
1523 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1524 unsigned i = 0, e = Pred->getNumOperands();
1525 for (; i != e; ++i) {
1526 MachineOperand &MO = Pred->getOperand(i);
1527 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1528 MO.setIsDead(false);
1529 break;
1533 return true;
1536 /// Try to optimize a compare instruction. A compare instruction is an
1537 /// instruction which produces AArch64::NZCV. It is a true compare
1538 /// instruction only when there are no uses of its destination register.
1541 /// The following steps are tried in order:
1542 /// 1. Convert CmpInstr into an unconditional version.
1543 /// 2. Remove CmpInstr if there is an instruction above it that produces a
1544 ///    needed condition code, or an instruction which can be converted into
1545 ///    such an instruction.
1546 /// Only comparison with zero is supported.
1547 bool AArch64InstrInfo::optimizeCompareInstr(
1548 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1549 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1550 assert(CmpInstr.getParent());
1551 assert(MRI);
1553 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1554 int DeadNZCVIdx =
1555 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1556 if (DeadNZCVIdx != -1) {
1557 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1558 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1559 CmpInstr.eraseFromParent();
1560 return true;
1562 unsigned Opc = CmpInstr.getOpcode();
1563 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1564 if (NewOpc == Opc)
1565 return false;
1566 const MCInstrDesc &MCID = get(NewOpc);
1567 CmpInstr.setDesc(MCID);
1568 CmpInstr.removeOperand(DeadNZCVIdx);
1569 bool succeeded = UpdateOperandRegClass(CmpInstr);
1570 (void)succeeded;
1571 assert(succeeded && "Some operands reg class are incompatible!");
1572 return true;
1575 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1576 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
1577 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1579 if (SrcReg2 != 0)
1580 return false;
1582 // CmpInstr is a Compare instruction if destination register is not used.
1583 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1584 return false;
1586 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1587 return true;
1588 return (CmpValue == 0 || CmpValue == 1) &&
1589 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1592 /// Get the opcode of the S (flag-setting) version of Instr.
1593 /// If Instr is already the S version, its opcode is returned.
1594 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
1595 /// version or we are not interested in it.
1596 static unsigned sForm(MachineInstr &Instr) {
1597 switch (Instr.getOpcode()) {
1598 default:
1599 return AArch64::INSTRUCTION_LIST_END;
1601 case AArch64::ADDSWrr:
1602 case AArch64::ADDSWri:
1603 case AArch64::ADDSXrr:
1604 case AArch64::ADDSXri:
1605 case AArch64::SUBSWrr:
1606 case AArch64::SUBSWri:
1607 case AArch64::SUBSXrr:
1608 case AArch64::SUBSXri:
1609 return Instr.getOpcode();
1611 case AArch64::ADDWrr:
1612 return AArch64::ADDSWrr;
1613 case AArch64::ADDWri:
1614 return AArch64::ADDSWri;
1615 case AArch64::ADDXrr:
1616 return AArch64::ADDSXrr;
1617 case AArch64::ADDXri:
1618 return AArch64::ADDSXri;
1619 case AArch64::ADCWr:
1620 return AArch64::ADCSWr;
1621 case AArch64::ADCXr:
1622 return AArch64::ADCSXr;
1623 case AArch64::SUBWrr:
1624 return AArch64::SUBSWrr;
1625 case AArch64::SUBWri:
1626 return AArch64::SUBSWri;
1627 case AArch64::SUBXrr:
1628 return AArch64::SUBSXrr;
1629 case AArch64::SUBXri:
1630 return AArch64::SUBSXri;
1631 case AArch64::SBCWr:
1632 return AArch64::SBCSWr;
1633 case AArch64::SBCXr:
1634 return AArch64::SBCSXr;
1635 case AArch64::ANDWri:
1636 return AArch64::ANDSWri;
1637 case AArch64::ANDXri:
1638 return AArch64::ANDSXri;
1642 /// Check if AArch64::NZCV should be alive in successors of MBB.
1643 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1644 for (auto *BB : MBB->successors())
1645 if (BB->isLiveIn(AArch64::NZCV))
1646 return true;
1647 return false;
1650 /// \returns The condition code operand index for \p Instr if it is a branch
1651 /// or select and -1 otherwise.
1652 static int
1653 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1654 switch (Instr.getOpcode()) {
1655 default:
1656 return -1;
1658 case AArch64::Bcc: {
1659 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1660 assert(Idx >= 2);
1661 return Idx - 2;
1664 case AArch64::CSINVWr:
1665 case AArch64::CSINVXr:
1666 case AArch64::CSINCWr:
1667 case AArch64::CSINCXr:
1668 case AArch64::CSELWr:
1669 case AArch64::CSELXr:
1670 case AArch64::CSNEGWr:
1671 case AArch64::CSNEGXr:
1672 case AArch64::FCSELSrrr:
1673 case AArch64::FCSELDrrr: {
1674 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1675 assert(Idx >= 1);
1676 return Idx - 1;
1681 /// Find a condition code used by the instruction.
1682 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1683 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1684 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1685 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1686 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1687 Instr.getOperand(CCIdx).getImm())
1688 : AArch64CC::Invalid;
1691 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1692 assert(CC != AArch64CC::Invalid);
1693 UsedNZCV UsedFlags;
1694 switch (CC) {
1695 default:
1696 break;
1698 case AArch64CC::EQ: // Z set
1699 case AArch64CC::NE: // Z clear
1700 UsedFlags.Z = true;
1701 break;
1703 case AArch64CC::HI: // Z clear and C set
1704 case AArch64CC::LS: // Z set or C clear
1705 UsedFlags.Z = true;
1706 [[fallthrough]];
1707 case AArch64CC::HS: // C set
1708 case AArch64CC::LO: // C clear
1709 UsedFlags.C = true;
1710 break;
1712 case AArch64CC::MI: // N set
1713 case AArch64CC::PL: // N clear
1714 UsedFlags.N = true;
1715 break;
1717 case AArch64CC::VS: // V set
1718 case AArch64CC::VC: // V clear
1719 UsedFlags.V = true;
1720 break;
1722 case AArch64CC::GT: // Z clear, N and V the same
1723 case AArch64CC::LE: // Z set, N and V differ
1724 UsedFlags.Z = true;
1725 [[fallthrough]];
1726 case AArch64CC::GE: // N and V the same
1727 case AArch64CC::LT: // N and V differ
1728 UsedFlags.N = true;
1729 UsedFlags.V = true;
1730 break;
1732 return UsedFlags;
1735 /// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
1736 /// flags are not alive in successors of the block containing \p CmpInstr and \p MI.
1737 /// \returns std::nullopt otherwise.
1739 /// Collects instructions using those flags in \p CCUseInstrs if provided.
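/// For example (illustrative), if CmpInstr is followed in its block only by
/// 'b.eq' (reads Z) and 'csel ..., lt' (reads N and V), the returned UsedNZCV
/// has Z, N and V set; an unsupported NZCV reader (or NZCV being live into a
/// successor) yields std::nullopt.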
1740 std::optional<UsedNZCV>
1741 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1742 const TargetRegisterInfo &TRI,
1743 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1744 MachineBasicBlock *CmpParent = CmpInstr.getParent();
1745 if (MI.getParent() != CmpParent)
1746 return std::nullopt;
1748 if (areCFlagsAliveInSuccessors(CmpParent))
1749 return std::nullopt;
1751 UsedNZCV NZCVUsedAfterCmp;
1752 for (MachineInstr &Instr : instructionsWithoutDebug(
1753 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1754 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1755 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1756 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1757 return std::nullopt;
1758 NZCVUsedAfterCmp |= getUsedNZCV(CC);
1759 if (CCUseInstrs)
1760 CCUseInstrs->push_back(&Instr);
1762 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1763 break;
1765 return NZCVUsedAfterCmp;
1768 static bool isADDSRegImm(unsigned Opcode) {
1769 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1772 static bool isSUBSRegImm(unsigned Opcode) {
1773 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1776 /// Check if CmpInstr can be substituted by MI.
1778 /// CmpInstr can be substituted if:
1779 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0',
1780 /// - and, MI and CmpInstr are in the same MachineBB,
1781 /// - and, condition flags are not alive in successors of the CmpInstr parent,
1782 /// - and, if MI opcode is the S form there must be no defs of flags between
1783 /// MI and CmpInstr,
1784 /// or if MI opcode is not the S form there must be neither defs nor uses of
1785 /// flags between MI and CmpInstr,
1786 /// - and, the C flag is not used after CmpInstr, and either the V flag is not
1787 /// used after CmpInstr or MI has the no-signed-wrap flag (it produces a
1788 /// poison value on signed overflow, so the V flag need not match).
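/// For instance (illustrative), 'add w8, w9, w10; cmp w8, #0; b.hs' cannot be
/// rewritten as 'adds w8, w9, w10; b.hs': ADDS sets C to the carry out of the
/// addition, while the compare with zero always produces C set.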
1789 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1790 const TargetRegisterInfo &TRI) {
1791 // NOTE: this assertion guarantees that MI.getOpcode() is an add or a
1792 // subtract that may or may not set flags.
1793 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1795 const unsigned CmpOpcode = CmpInstr.getOpcode();
1796 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1797 return false;
1799 assert((CmpInstr.getOperand(2).isImm() &&
1800 CmpInstr.getOperand(2).getImm() == 0) &&
1801 "Caller guarantees that CmpInstr compares with constant 0");
1803 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1804 if (!NZVCUsed || NZVCUsed->C)
1805 return false;
1807 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1808 // '%vreg = add ...' or '%vreg = sub ...'.
1809 // Condition flag V is used to indicate signed overflow.
1810 // 1) MI and CmpInstr set N and V to the same value.
1811 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
1812 // signed overflow occurs, so CmpInstr could still be simplified away.
1813 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
1814 return false;
1816 AccessKind AccessToCheck = AK_Write;
1817 if (sForm(MI) != MI.getOpcode())
1818 AccessToCheck = AK_All;
1819 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1822 /// Substitute an instruction comparing to zero with another instruction
1823 /// which produces the needed condition flags.
1825 /// Return true on success.
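/// A typical transformation (illustrative):
/// \code
/// sub w8, w9, w10
/// cmp w8, #0
/// b.eq
/// \endcode
/// to
/// \code
/// subs w8, w9, w10
/// b.eq
/// \endcode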
1826 bool AArch64InstrInfo::substituteCmpToZero(
1827 MachineInstr &CmpInstr, unsigned SrcReg,
1828 const MachineRegisterInfo &MRI) const {
1829 // Get the unique definition of SrcReg.
1830 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1831 if (!MI)
1832 return false;
1834 const TargetRegisterInfo &TRI = getRegisterInfo();
1836 unsigned NewOpc = sForm(*MI);
1837 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1838 return false;
1840 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1841 return false;
1843 // Update the instruction to set NZCV.
1844 MI->setDesc(get(NewOpc));
1845 CmpInstr.eraseFromParent();
1846 bool succeeded = UpdateOperandRegClass(*MI);
1847 (void)succeeded;
1848 assert(succeeded && "Some operands reg class are incompatible!");
1849 MI->addRegisterDefined(AArch64::NZCV, &TRI);
1850 return true;
1853 /// \returns True if \p CmpInstr can be removed.
1855 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1856 /// codes used in \p CCUseInstrs must be inverted.
1857 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1858 int CmpValue, const TargetRegisterInfo &TRI,
1859 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1860 bool &IsInvertCC) {
1861 assert((CmpValue == 0 || CmpValue == 1) &&
1862 "Only comparisons to 0 or 1 considered for removal!");
1864 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1865 unsigned MIOpc = MI.getOpcode();
1866 if (MIOpc == AArch64::CSINCWr) {
1867 if (MI.getOperand(1).getReg() != AArch64::WZR ||
1868 MI.getOperand(2).getReg() != AArch64::WZR)
1869 return false;
1870 } else if (MIOpc == AArch64::CSINCXr) {
1871 if (MI.getOperand(1).getReg() != AArch64::XZR ||
1872 MI.getOperand(2).getReg() != AArch64::XZR)
1873 return false;
1874 } else {
1875 return false;
1877 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1878 if (MICC == AArch64CC::Invalid)
1879 return false;
1881 // NZCV needs to be defined
1882 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
1883 return false;
1885 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1886 const unsigned CmpOpcode = CmpInstr.getOpcode();
1887 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1888 if (CmpValue && !IsSubsRegImm)
1889 return false;
1890 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1891 return false;
1893 // MI conditions allowed: eq, ne, mi, pl
1894 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1895 if (MIUsedNZCV.C || MIUsedNZCV.V)
1896 return false;
1898 std::optional<UsedNZCV> NZCVUsedAfterCmp =
1899 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
1900 // Condition flags are not used in CmpInstr's basic block successors, and only
1901 // the Z or N flags are allowed to be used after CmpInstr within its basic block.
1902 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
1903 return false;
1904 // Z or N flag used after CmpInstr must correspond to the flag used in MI
1905 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1906 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1907 return false;
1908 // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne.
1909 if (MIUsedNZCV.N && !CmpValue)
1910 return false;
1912 // There must be no defs of flags between MI and CmpInstr
1913 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1914 return false;
1916 // Condition code is inverted in the following cases:
1917 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1918 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1919 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1920 (!CmpValue && MICC == AArch64CC::NE);
1921 return true;
1924 /// Remove comparison in csinc-cmp sequence
1926 /// Examples:
1927 /// 1. \code
1928 /// csinc w9, wzr, wzr, ne
1929 /// cmp w9, #0
1930 /// b.eq
1931 /// \endcode
1932 /// to
1933 /// \code
1934 /// csinc w9, wzr, wzr, ne
1935 /// b.ne
1936 /// \endcode
1938 /// 2. \code
1939 /// csinc x2, xzr, xzr, mi
1940 /// cmp x2, #1
1941 /// b.pl
1942 /// \endcode
1943 /// to
1944 /// \code
1945 /// csinc x2, xzr, xzr, mi
1946 /// b.pl
1947 /// \endcode
1949 /// \param CmpInstr comparison instruction
1950 /// \return True when comparison removed
1951 bool AArch64InstrInfo::removeCmpToZeroOrOne(
1952 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1953 const MachineRegisterInfo &MRI) const {
1954 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1955 if (!MI)
1956 return false;
1957 const TargetRegisterInfo &TRI = getRegisterInfo();
1958 SmallVector<MachineInstr *, 4> CCUseInstrs;
1959 bool IsInvertCC = false;
1960 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1961 IsInvertCC))
1962 return false;
1963 // Make transformation
1964 CmpInstr.eraseFromParent();
1965 if (IsInvertCC) {
1966 // Invert condition codes in CmpInstr CC users
1967 for (MachineInstr *CCUseInstr : CCUseInstrs) {
1968 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
1969 assert(Idx >= 0 && "Unexpected instruction using CC.");
1970 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
1971 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1972 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1973 CCOperand.setImm(CCUse);
1976 return true;
1979 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1980 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1981 MI.getOpcode() != AArch64::CATCHRET)
1982 return false;
1984 MachineBasicBlock &MBB = *MI.getParent();
1985 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1986 auto TRI = Subtarget.getRegisterInfo();
1987 DebugLoc DL = MI.getDebugLoc();
1989 if (MI.getOpcode() == AArch64::CATCHRET) {
1990 // Skip to the first instruction before the epilog.
1991 const TargetInstrInfo *TII =
1992 MBB.getParent()->getSubtarget().getInstrInfo();
1993 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1994 auto MBBI = MachineBasicBlock::iterator(MI);
1995 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1996 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1997 FirstEpilogSEH != MBB.begin())
1998 FirstEpilogSEH = std::prev(FirstEpilogSEH);
1999 if (FirstEpilogSEH != MBB.begin())
2000 FirstEpilogSEH = std::next(FirstEpilogSEH);
2001 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2002 .addReg(AArch64::X0, RegState::Define)
2003 .addMBB(TargetMBB);
2004 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2005 .addReg(AArch64::X0, RegState::Define)
2006 .addReg(AArch64::X0)
2007 .addMBB(TargetMBB)
2008 .addImm(0);
2009 TargetMBB->setMachineBlockAddressTaken();
2010 return true;
2013 Register Reg = MI.getOperand(0).getReg();
2014 Module &M = *MBB.getParent()->getFunction().getParent();
2015 if (M.getStackProtectorGuard() == "sysreg") {
2016 const AArch64SysReg::SysReg *SrcReg =
2017 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2018 if (!SrcReg)
2019 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2021 // mrs xN, sysreg
2022 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2023 .addDef(Reg, RegState::Renamable)
2024 .addImm(SrcReg->Encoding);
2025 int Offset = M.getStackProtectorGuardOffset();
2026 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2027 // ldr xN, [xN, #offset]
2028 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2029 .addDef(Reg)
2030 .addUse(Reg, RegState::Kill)
2031 .addImm(Offset / 8);
2032 } else if (Offset >= -256 && Offset <= 255) {
2033 // ldur xN, [xN, #offset]
2034 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2035 .addDef(Reg)
2036 .addUse(Reg, RegState::Kill)
2037 .addImm(Offset);
2038 } else if (Offset >= -4095 && Offset <= 4095) {
2039 if (Offset > 0) {
2040 // add xN, xN, #offset
2041 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2042 .addDef(Reg)
2043 .addUse(Reg, RegState::Kill)
2044 .addImm(Offset)
2045 .addImm(0);
2046 } else {
2047 // sub xN, xN, #offset
2048 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2049 .addDef(Reg)
2050 .addUse(Reg, RegState::Kill)
2051 .addImm(-Offset)
2052 .addImm(0);
2054 // ldr xN, [xN]
2055 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2056 .addDef(Reg)
2057 .addUse(Reg, RegState::Kill)
2058 .addImm(0);
2059 } else {
2060 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2061 // than 32760.
2062 // It might be nice to use AArch64::MOVi32imm here, which would get
2063 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2064 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2065 // AArch64FrameLowering might help us find such a scratch register
2066 // though. If we failed to find a scratch register, we could emit a
2067 // stream of add instructions to build up the immediate. Or, we could try
2068 // to insert a AArch64::MOVi32imm before register allocation so that we
2069 // didn't need to scavenge for a scratch register.
2070 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2072 MBB.erase(MI);
2073 return true;
2076 const GlobalValue *GV =
2077 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2078 const TargetMachine &TM = MBB.getParent()->getTarget();
2079 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2080 const unsigned char MO_NC = AArch64II::MO_NC;
2082 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2083 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2084 .addGlobalAddress(GV, 0, OpFlags);
2085 if (Subtarget.isTargetILP32()) {
2086 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2087 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2088 .addDef(Reg32, RegState::Dead)
2089 .addUse(Reg, RegState::Kill)
2090 .addImm(0)
2091 .addMemOperand(*MI.memoperands_begin())
2092 .addDef(Reg, RegState::Implicit);
2093 } else {
2094 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2095 .addReg(Reg, RegState::Kill)
2096 .addImm(0)
2097 .addMemOperand(*MI.memoperands_begin());
2099 } else if (TM.getCodeModel() == CodeModel::Large) {
2100 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2101 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2102 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2103 .addImm(0);
2104 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2105 .addReg(Reg, RegState::Kill)
2106 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2107 .addImm(16);
2108 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2109 .addReg(Reg, RegState::Kill)
2110 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2111 .addImm(32);
2112 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2113 .addReg(Reg, RegState::Kill)
2114 .addGlobalAddress(GV, 0, AArch64II::MO_G3)
2115 .addImm(48);
2116 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2117 .addReg(Reg, RegState::Kill)
2118 .addImm(0)
2119 .addMemOperand(*MI.memoperands_begin());
2120 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2121 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2122 .addGlobalAddress(GV, 0, OpFlags);
2123 } else {
2124 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2125 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2126 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2127 if (Subtarget.isTargetILP32()) {
2128 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2129 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2130 .addDef(Reg32, RegState::Dead)
2131 .addUse(Reg, RegState::Kill)
2132 .addGlobalAddress(GV, 0, LoFlags)
2133 .addMemOperand(*MI.memoperands_begin())
2134 .addDef(Reg, RegState::Implicit);
2135 } else {
2136 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2137 .addReg(Reg, RegState::Kill)
2138 .addGlobalAddress(GV, 0, LoFlags)
2139 .addMemOperand(*MI.memoperands_begin());
2143 MBB.erase(MI);
2145 return true;
2148 // Return true if this instruction simply sets its single destination register
2149 // to zero. This is equivalent to a register rename of the zero-register.
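// E.g. (illustrative) 'movz w0, #0' and 'and w0, wzr, #3' both leave the
// destination equal to zero, so they can be treated as renames of wzr.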
2150 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2151 switch (MI.getOpcode()) {
2152 default:
2153 break;
2154 case AArch64::MOVZWi:
2155 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2156 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2157 assert(MI.getDesc().getNumOperands() == 3 &&
2158 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2159 return true;
2161 break;
2162 case AArch64::ANDWri: // and Rd, Rzr, #imm
2163 return MI.getOperand(1).getReg() == AArch64::WZR;
2164 case AArch64::ANDXri:
2165 return MI.getOperand(1).getReg() == AArch64::XZR;
2166 case TargetOpcode::COPY:
2167 return MI.getOperand(1).getReg() == AArch64::WZR;
2169 return false;
2172 // Return true if this instruction simply renames a general register without
2173 // modifying bits.
2174 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2175 switch (MI.getOpcode()) {
2176 default:
2177 break;
2178 case TargetOpcode::COPY: {
2179 // GPR32 copies will be lowered to ORRXrs
2180 Register DstReg = MI.getOperand(0).getReg();
2181 return (AArch64::GPR32RegClass.contains(DstReg) ||
2182 AArch64::GPR64RegClass.contains(DstReg));
2184 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2185 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2186 assert(MI.getDesc().getNumOperands() == 4 &&
2187 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2188 return true;
2190 break;
2191 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2192 if (MI.getOperand(2).getImm() == 0) {
2193 assert(MI.getDesc().getNumOperands() == 4 &&
2194 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2195 return true;
2197 break;
2199 return false;
2202 // Return true if this instruction simply renames a floating-point/vector
2203 // register without modifying bits.
2204 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2205 switch (MI.getOpcode()) {
2206 default:
2207 break;
2208 case TargetOpcode::COPY: {
2209 Register DstReg = MI.getOperand(0).getReg();
2210 return AArch64::FPR128RegClass.contains(DstReg);
2212 case AArch64::ORRv16i8:
2213 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2214 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2215 "invalid ORRv16i8 operands");
2216 return true;
2218 break;
2220 return false;
2223 Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2224 int &FrameIndex) const {
2225 switch (MI.getOpcode()) {
2226 default:
2227 break;
2228 case AArch64::LDRWui:
2229 case AArch64::LDRXui:
2230 case AArch64::LDRBui:
2231 case AArch64::LDRHui:
2232 case AArch64::LDRSui:
2233 case AArch64::LDRDui:
2234 case AArch64::LDRQui:
2235 case AArch64::LDR_PXI:
2236 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2237 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2238 FrameIndex = MI.getOperand(1).getIndex();
2239 return MI.getOperand(0).getReg();
2241 break;
2244 return 0;
2247 Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2248 int &FrameIndex) const {
2249 switch (MI.getOpcode()) {
2250 default:
2251 break;
2252 case AArch64::STRWui:
2253 case AArch64::STRXui:
2254 case AArch64::STRBui:
2255 case AArch64::STRHui:
2256 case AArch64::STRSui:
2257 case AArch64::STRDui:
2258 case AArch64::STRQui:
2259 case AArch64::STR_PXI:
2260 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2261 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2262 FrameIndex = MI.getOperand(1).getIndex();
2263 return MI.getOperand(0).getReg();
2265 break;
2267 return 0;
2270 /// Check all MachineMemOperands for a hint to suppress pairing.
2271 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2272 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2273 return MMO->getFlags() & MOSuppressPair;
2277 /// Set a flag on the first MachineMemOperand to suppress pairing.
2278 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2279 if (MI.memoperands_empty())
2280 return;
2281 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2284 /// Check all MachineMemOperands for a hint that the load/store is strided.
2285 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2286 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2287 return MMO->getFlags() & MOStridedAccess;
2291 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2292 switch (Opc) {
2293 default:
2294 return false;
2295 case AArch64::STURSi:
2296 case AArch64::STRSpre:
2297 case AArch64::STURDi:
2298 case AArch64::STRDpre:
2299 case AArch64::STURQi:
2300 case AArch64::STRQpre:
2301 case AArch64::STURBBi:
2302 case AArch64::STURHHi:
2303 case AArch64::STURWi:
2304 case AArch64::STRWpre:
2305 case AArch64::STURXi:
2306 case AArch64::STRXpre:
2307 case AArch64::LDURSi:
2308 case AArch64::LDRSpre:
2309 case AArch64::LDURDi:
2310 case AArch64::LDRDpre:
2311 case AArch64::LDURQi:
2312 case AArch64::LDRQpre:
2313 case AArch64::LDURWi:
2314 case AArch64::LDRWpre:
2315 case AArch64::LDURXi:
2316 case AArch64::LDRXpre:
2317 case AArch64::LDRSWpre:
2318 case AArch64::LDURSWi:
2319 case AArch64::LDURHHi:
2320 case AArch64::LDURBBi:
2321 case AArch64::LDURSBWi:
2322 case AArch64::LDURSHWi:
2323 return true;
2327 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2328 switch (Opc) {
2329 default: return {};
2330 case AArch64::PRFMui: return AArch64::PRFUMi;
2331 case AArch64::LDRXui: return AArch64::LDURXi;
2332 case AArch64::LDRWui: return AArch64::LDURWi;
2333 case AArch64::LDRBui: return AArch64::LDURBi;
2334 case AArch64::LDRHui: return AArch64::LDURHi;
2335 case AArch64::LDRSui: return AArch64::LDURSi;
2336 case AArch64::LDRDui: return AArch64::LDURDi;
2337 case AArch64::LDRQui: return AArch64::LDURQi;
2338 case AArch64::LDRBBui: return AArch64::LDURBBi;
2339 case AArch64::LDRHHui: return AArch64::LDURHHi;
2340 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2341 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2342 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2343 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2344 case AArch64::LDRSWui: return AArch64::LDURSWi;
2345 case AArch64::STRXui: return AArch64::STURXi;
2346 case AArch64::STRWui: return AArch64::STURWi;
2347 case AArch64::STRBui: return AArch64::STURBi;
2348 case AArch64::STRHui: return AArch64::STURHi;
2349 case AArch64::STRSui: return AArch64::STURSi;
2350 case AArch64::STRDui: return AArch64::STURDi;
2351 case AArch64::STRQui: return AArch64::STURQi;
2352 case AArch64::STRBBui: return AArch64::STURBBi;
2353 case AArch64::STRHHui: return AArch64::STURHHi;
2357 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2358 switch (Opc) {
2359 default:
2360 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2361 case AArch64::ADDG:
2362 case AArch64::LDAPURBi:
2363 case AArch64::LDAPURHi:
2364 case AArch64::LDAPURi:
2365 case AArch64::LDAPURSBWi:
2366 case AArch64::LDAPURSBXi:
2367 case AArch64::LDAPURSHWi:
2368 case AArch64::LDAPURSHXi:
2369 case AArch64::LDAPURSWi:
2370 case AArch64::LDAPURXi:
2371 case AArch64::LDR_PPXI:
2372 case AArch64::LDR_PXI:
2373 case AArch64::LDR_ZXI:
2374 case AArch64::LDR_ZZXI:
2375 case AArch64::LDR_ZZZXI:
2376 case AArch64::LDR_ZZZZXI:
2377 case AArch64::LDRBBui:
2378 case AArch64::LDRBui:
2379 case AArch64::LDRDui:
2380 case AArch64::LDRHHui:
2381 case AArch64::LDRHui:
2382 case AArch64::LDRQui:
2383 case AArch64::LDRSBWui:
2384 case AArch64::LDRSBXui:
2385 case AArch64::LDRSHWui:
2386 case AArch64::LDRSHXui:
2387 case AArch64::LDRSui:
2388 case AArch64::LDRSWui:
2389 case AArch64::LDRWui:
2390 case AArch64::LDRXui:
2391 case AArch64::LDURBBi:
2392 case AArch64::LDURBi:
2393 case AArch64::LDURDi:
2394 case AArch64::LDURHHi:
2395 case AArch64::LDURHi:
2396 case AArch64::LDURQi:
2397 case AArch64::LDURSBWi:
2398 case AArch64::LDURSBXi:
2399 case AArch64::LDURSHWi:
2400 case AArch64::LDURSHXi:
2401 case AArch64::LDURSi:
2402 case AArch64::LDURSWi:
2403 case AArch64::LDURWi:
2404 case AArch64::LDURXi:
2405 case AArch64::PRFMui:
2406 case AArch64::PRFUMi:
2407 case AArch64::ST2Gi:
2408 case AArch64::STGi:
2409 case AArch64::STLURBi:
2410 case AArch64::STLURHi:
2411 case AArch64::STLURWi:
2412 case AArch64::STLURXi:
2413 case AArch64::StoreSwiftAsyncContext:
2414 case AArch64::STR_PPXI:
2415 case AArch64::STR_PXI:
2416 case AArch64::STR_ZXI:
2417 case AArch64::STR_ZZXI:
2418 case AArch64::STR_ZZZXI:
2419 case AArch64::STR_ZZZZXI:
2420 case AArch64::STRBBui:
2421 case AArch64::STRBui:
2422 case AArch64::STRDui:
2423 case AArch64::STRHHui:
2424 case AArch64::STRHui:
2425 case AArch64::STRQui:
2426 case AArch64::STRSui:
2427 case AArch64::STRWui:
2428 case AArch64::STRXui:
2429 case AArch64::STURBBi:
2430 case AArch64::STURBi:
2431 case AArch64::STURDi:
2432 case AArch64::STURHHi:
2433 case AArch64::STURHi:
2434 case AArch64::STURQi:
2435 case AArch64::STURSi:
2436 case AArch64::STURWi:
2437 case AArch64::STURXi:
2438 case AArch64::STZ2Gi:
2439 case AArch64::STZGi:
2440 case AArch64::TAGPstack:
2441 return 2;
2442 case AArch64::LD1B_D_IMM:
2443 case AArch64::LD1B_H_IMM:
2444 case AArch64::LD1B_IMM:
2445 case AArch64::LD1B_S_IMM:
2446 case AArch64::LD1D_IMM:
2447 case AArch64::LD1H_D_IMM:
2448 case AArch64::LD1H_IMM:
2449 case AArch64::LD1H_S_IMM:
2450 case AArch64::LD1RB_D_IMM:
2451 case AArch64::LD1RB_H_IMM:
2452 case AArch64::LD1RB_IMM:
2453 case AArch64::LD1RB_S_IMM:
2454 case AArch64::LD1RD_IMM:
2455 case AArch64::LD1RH_D_IMM:
2456 case AArch64::LD1RH_IMM:
2457 case AArch64::LD1RH_S_IMM:
2458 case AArch64::LD1RSB_D_IMM:
2459 case AArch64::LD1RSB_H_IMM:
2460 case AArch64::LD1RSB_S_IMM:
2461 case AArch64::LD1RSH_D_IMM:
2462 case AArch64::LD1RSH_S_IMM:
2463 case AArch64::LD1RSW_IMM:
2464 case AArch64::LD1RW_D_IMM:
2465 case AArch64::LD1RW_IMM:
2466 case AArch64::LD1SB_D_IMM:
2467 case AArch64::LD1SB_H_IMM:
2468 case AArch64::LD1SB_S_IMM:
2469 case AArch64::LD1SH_D_IMM:
2470 case AArch64::LD1SH_S_IMM:
2471 case AArch64::LD1SW_D_IMM:
2472 case AArch64::LD1W_D_IMM:
2473 case AArch64::LD1W_IMM:
2474 case AArch64::LD2B_IMM:
2475 case AArch64::LD2D_IMM:
2476 case AArch64::LD2H_IMM:
2477 case AArch64::LD2W_IMM:
2478 case AArch64::LD3B_IMM:
2479 case AArch64::LD3D_IMM:
2480 case AArch64::LD3H_IMM:
2481 case AArch64::LD3W_IMM:
2482 case AArch64::LD4B_IMM:
2483 case AArch64::LD4D_IMM:
2484 case AArch64::LD4H_IMM:
2485 case AArch64::LD4W_IMM:
2486 case AArch64::LDG:
2487 case AArch64::LDNF1B_D_IMM:
2488 case AArch64::LDNF1B_H_IMM:
2489 case AArch64::LDNF1B_IMM:
2490 case AArch64::LDNF1B_S_IMM:
2491 case AArch64::LDNF1D_IMM:
2492 case AArch64::LDNF1H_D_IMM:
2493 case AArch64::LDNF1H_IMM:
2494 case AArch64::LDNF1H_S_IMM:
2495 case AArch64::LDNF1SB_D_IMM:
2496 case AArch64::LDNF1SB_H_IMM:
2497 case AArch64::LDNF1SB_S_IMM:
2498 case AArch64::LDNF1SH_D_IMM:
2499 case AArch64::LDNF1SH_S_IMM:
2500 case AArch64::LDNF1SW_D_IMM:
2501 case AArch64::LDNF1W_D_IMM:
2502 case AArch64::LDNF1W_IMM:
2503 case AArch64::LDNPDi:
2504 case AArch64::LDNPQi:
2505 case AArch64::LDNPSi:
2506 case AArch64::LDNPWi:
2507 case AArch64::LDNPXi:
2508 case AArch64::LDNT1B_ZRI:
2509 case AArch64::LDNT1D_ZRI:
2510 case AArch64::LDNT1H_ZRI:
2511 case AArch64::LDNT1W_ZRI:
2512 case AArch64::LDPDi:
2513 case AArch64::LDPQi:
2514 case AArch64::LDPSi:
2515 case AArch64::LDPWi:
2516 case AArch64::LDPXi:
2517 case AArch64::LDRBBpost:
2518 case AArch64::LDRBBpre:
2519 case AArch64::LDRBpost:
2520 case AArch64::LDRBpre:
2521 case AArch64::LDRDpost:
2522 case AArch64::LDRDpre:
2523 case AArch64::LDRHHpost:
2524 case AArch64::LDRHHpre:
2525 case AArch64::LDRHpost:
2526 case AArch64::LDRHpre:
2527 case AArch64::LDRQpost:
2528 case AArch64::LDRQpre:
2529 case AArch64::LDRSpost:
2530 case AArch64::LDRSpre:
2531 case AArch64::LDRWpost:
2532 case AArch64::LDRWpre:
2533 case AArch64::LDRXpost:
2534 case AArch64::LDRXpre:
2535 case AArch64::ST1B_D_IMM:
2536 case AArch64::ST1B_H_IMM:
2537 case AArch64::ST1B_IMM:
2538 case AArch64::ST1B_S_IMM:
2539 case AArch64::ST1D_IMM:
2540 case AArch64::ST1H_D_IMM:
2541 case AArch64::ST1H_IMM:
2542 case AArch64::ST1H_S_IMM:
2543 case AArch64::ST1W_D_IMM:
2544 case AArch64::ST1W_IMM:
2545 case AArch64::ST2B_IMM:
2546 case AArch64::ST2D_IMM:
2547 case AArch64::ST2H_IMM:
2548 case AArch64::ST2W_IMM:
2549 case AArch64::ST3B_IMM:
2550 case AArch64::ST3D_IMM:
2551 case AArch64::ST3H_IMM:
2552 case AArch64::ST3W_IMM:
2553 case AArch64::ST4B_IMM:
2554 case AArch64::ST4D_IMM:
2555 case AArch64::ST4H_IMM:
2556 case AArch64::ST4W_IMM:
2557 case AArch64::STGPi:
2558 case AArch64::STGPreIndex:
2559 case AArch64::STZGPreIndex:
2560 case AArch64::ST2GPreIndex:
2561 case AArch64::STZ2GPreIndex:
2562 case AArch64::STGPostIndex:
2563 case AArch64::STZGPostIndex:
2564 case AArch64::ST2GPostIndex:
2565 case AArch64::STZ2GPostIndex:
2566 case AArch64::STNPDi:
2567 case AArch64::STNPQi:
2568 case AArch64::STNPSi:
2569 case AArch64::STNPWi:
2570 case AArch64::STNPXi:
2571 case AArch64::STNT1B_ZRI:
2572 case AArch64::STNT1D_ZRI:
2573 case AArch64::STNT1H_ZRI:
2574 case AArch64::STNT1W_ZRI:
2575 case AArch64::STPDi:
2576 case AArch64::STPQi:
2577 case AArch64::STPSi:
2578 case AArch64::STPWi:
2579 case AArch64::STPXi:
2580 case AArch64::STRBBpost:
2581 case AArch64::STRBBpre:
2582 case AArch64::STRBpost:
2583 case AArch64::STRBpre:
2584 case AArch64::STRDpost:
2585 case AArch64::STRDpre:
2586 case AArch64::STRHHpost:
2587 case AArch64::STRHHpre:
2588 case AArch64::STRHpost:
2589 case AArch64::STRHpre:
2590 case AArch64::STRQpost:
2591 case AArch64::STRQpre:
2592 case AArch64::STRSpost:
2593 case AArch64::STRSpre:
2594 case AArch64::STRWpost:
2595 case AArch64::STRWpre:
2596 case AArch64::STRXpost:
2597 case AArch64::STRXpre:
2598 return 3;
2599 case AArch64::LDPDpost:
2600 case AArch64::LDPDpre:
2601 case AArch64::LDPQpost:
2602 case AArch64::LDPQpre:
2603 case AArch64::LDPSpost:
2604 case AArch64::LDPSpre:
2605 case AArch64::LDPWpost:
2606 case AArch64::LDPWpre:
2607 case AArch64::LDPXpost:
2608 case AArch64::LDPXpre:
2609 case AArch64::STGPpre:
2610 case AArch64::STGPpost:
2611 case AArch64::STPDpost:
2612 case AArch64::STPDpre:
2613 case AArch64::STPQpost:
2614 case AArch64::STPQpre:
2615 case AArch64::STPSpost:
2616 case AArch64::STPSpre:
2617 case AArch64::STPWpost:
2618 case AArch64::STPWpre:
2619 case AArch64::STPXpost:
2620 case AArch64::STPXpre:
2621 return 4;
2625 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2626 switch (MI.getOpcode()) {
2627 default:
2628 return false;
2629 // Scaled instructions.
2630 case AArch64::STRSui:
2631 case AArch64::STRDui:
2632 case AArch64::STRQui:
2633 case AArch64::STRXui:
2634 case AArch64::STRWui:
2635 case AArch64::LDRSui:
2636 case AArch64::LDRDui:
2637 case AArch64::LDRQui:
2638 case AArch64::LDRXui:
2639 case AArch64::LDRWui:
2640 case AArch64::LDRSWui:
2641 // Unscaled instructions.
2642 case AArch64::STURSi:
2643 case AArch64::STRSpre:
2644 case AArch64::STURDi:
2645 case AArch64::STRDpre:
2646 case AArch64::STURQi:
2647 case AArch64::STRQpre:
2648 case AArch64::STURWi:
2649 case AArch64::STRWpre:
2650 case AArch64::STURXi:
2651 case AArch64::STRXpre:
2652 case AArch64::LDURSi:
2653 case AArch64::LDRSpre:
2654 case AArch64::LDURDi:
2655 case AArch64::LDRDpre:
2656 case AArch64::LDURQi:
2657 case AArch64::LDRQpre:
2658 case AArch64::LDURWi:
2659 case AArch64::LDRWpre:
2660 case AArch64::LDURXi:
2661 case AArch64::LDRXpre:
2662 case AArch64::LDURSWi:
2663 case AArch64::LDRSWpre:
2664 return true;
2668 bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
2669 switch (MI.getOpcode()) {
2670 default:
2671 assert((!MI.isCall() || !MI.isReturn()) &&
2672 "Unexpected instruction - was a new tail call opcode introduced?");
2673 return false;
2674 case AArch64::TCRETURNdi:
2675 case AArch64::TCRETURNri:
2676 case AArch64::TCRETURNrix16x17:
2677 case AArch64::TCRETURNrix17:
2678 case AArch64::TCRETURNrinotx16:
2679 case AArch64::TCRETURNriALL:
2680 case AArch64::AUTH_TCRETURN:
2681 case AArch64::AUTH_TCRETURN_BTI:
2682 return true;
2686 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2687 switch (Opc) {
2688 default:
2689 llvm_unreachable("Opcode has no flag setting equivalent!");
2690 // 32-bit cases:
2691 case AArch64::ADDWri:
2692 return AArch64::ADDSWri;
2693 case AArch64::ADDWrr:
2694 return AArch64::ADDSWrr;
2695 case AArch64::ADDWrs:
2696 return AArch64::ADDSWrs;
2697 case AArch64::ADDWrx:
2698 return AArch64::ADDSWrx;
2699 case AArch64::ANDWri:
2700 return AArch64::ANDSWri;
2701 case AArch64::ANDWrr:
2702 return AArch64::ANDSWrr;
2703 case AArch64::ANDWrs:
2704 return AArch64::ANDSWrs;
2705 case AArch64::BICWrr:
2706 return AArch64::BICSWrr;
2707 case AArch64::BICWrs:
2708 return AArch64::BICSWrs;
2709 case AArch64::SUBWri:
2710 return AArch64::SUBSWri;
2711 case AArch64::SUBWrr:
2712 return AArch64::SUBSWrr;
2713 case AArch64::SUBWrs:
2714 return AArch64::SUBSWrs;
2715 case AArch64::SUBWrx:
2716 return AArch64::SUBSWrx;
2717 // 64-bit cases:
2718 case AArch64::ADDXri:
2719 return AArch64::ADDSXri;
2720 case AArch64::ADDXrr:
2721 return AArch64::ADDSXrr;
2722 case AArch64::ADDXrs:
2723 return AArch64::ADDSXrs;
2724 case AArch64::ADDXrx:
2725 return AArch64::ADDSXrx;
2726 case AArch64::ANDXri:
2727 return AArch64::ANDSXri;
2728 case AArch64::ANDXrr:
2729 return AArch64::ANDSXrr;
2730 case AArch64::ANDXrs:
2731 return AArch64::ANDSXrs;
2732 case AArch64::BICXrr:
2733 return AArch64::BICSXrr;
2734 case AArch64::BICXrs:
2735 return AArch64::BICSXrs;
2736 case AArch64::SUBXri:
2737 return AArch64::SUBSXri;
2738 case AArch64::SUBXrr:
2739 return AArch64::SUBSXrr;
2740 case AArch64::SUBXrs:
2741 return AArch64::SUBSXrs;
2742 case AArch64::SUBXrx:
2743 return AArch64::SUBSXrx;
2744 // SVE instructions:
2745 case AArch64::AND_PPzPP:
2746 return AArch64::ANDS_PPzPP;
2747 case AArch64::BIC_PPzPP:
2748 return AArch64::BICS_PPzPP;
2749 case AArch64::EOR_PPzPP:
2750 return AArch64::EORS_PPzPP;
2751 case AArch64::NAND_PPzPP:
2752 return AArch64::NANDS_PPzPP;
2753 case AArch64::NOR_PPzPP:
2754 return AArch64::NORS_PPzPP;
2755 case AArch64::ORN_PPzPP:
2756 return AArch64::ORNS_PPzPP;
2757 case AArch64::ORR_PPzPP:
2758 return AArch64::ORRS_PPzPP;
2759 case AArch64::BRKA_PPzP:
2760 return AArch64::BRKAS_PPzP;
2761 case AArch64::BRKPA_PPzPP:
2762 return AArch64::BRKPAS_PPzPP;
2763 case AArch64::BRKB_PPzP:
2764 return AArch64::BRKBS_PPzP;
2765 case AArch64::BRKPB_PPzPP:
2766 return AArch64::BRKPBS_PPzPP;
2767 case AArch64::BRKN_PPzP:
2768 return AArch64::BRKNS_PPzP;
2769 case AArch64::RDFFR_PPz:
2770 return AArch64::RDFFRS_PPz;
2771 case AArch64::PTRUE_B:
2772 return AArch64::PTRUES_B;
2776 // Is this a candidate for ld/st merging or pairing? For example, we don't
2777 // touch volatiles or load/stores that have a hint to avoid pair formation.
2778 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2780 bool IsPreLdSt = isPreLdSt(MI);
2782 // If this is a volatile load/store, don't mess with it.
2783 if (MI.hasOrderedMemoryRef())
2784 return false;
2786 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2787 // For Pre-inc LD/ST, the operand is shifted by one.
2788 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2789 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2790 "Expected a reg or frame index operand.");
2792 // For Pre-indexed addressing quadword instructions, the third operand is the
2793 // immediate value.
2794 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2796 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2797 return false;
2799 // Can't merge/pair if the instruction modifies the base register.
2800 // e.g., ldr x0, [x0]
2801 // This case will never occur with an FI base.
2802 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
2803 // STR<S,D,Q,W,X>pre, it can be merged.
2804 // For example:
2805 // ldr q0, [x11, #32]!
2806 // ldr q1, [x11, #16]
2807 // to
2808 // ldp q0, q1, [x11, #32]!
2809 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2810 Register BaseReg = MI.getOperand(1).getReg();
2811 const TargetRegisterInfo *TRI = &getRegisterInfo();
2812 if (MI.modifiesRegister(BaseReg, TRI))
2813 return false;
2816 // Check if this load/store has a hint to avoid pair formation.
2817 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2818 if (isLdStPairSuppressed(MI))
2819 return false;
2821 // Do not pair any callee-save store/reload instructions in the
2822 // prologue/epilogue if the CFI information encoded the operations as separate
2823 // instructions, as that will cause the size of the actual prologue to mismatch
2824 // with the prologue size recorded in the Windows CFI.
2825 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2826 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2827 MI.getMF()->getFunction().needsUnwindTableEntry();
2828 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2829 MI.getFlag(MachineInstr::FrameDestroy)))
2830 return false;
2832 // On some CPUs quad load/store pairs are slower than two single load/stores.
2833 if (Subtarget.isPaired128Slow()) {
2834 switch (MI.getOpcode()) {
2835 default:
2836 break;
2837 case AArch64::LDURQi:
2838 case AArch64::STURQi:
2839 case AArch64::LDRQui:
2840 case AArch64::STRQui:
2841 return false;
2845 return true;
2848 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2849 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2850 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
2851 const TargetRegisterInfo *TRI) const {
2852 if (!LdSt.mayLoadOrStore())
2853 return false;
2855 const MachineOperand *BaseOp;
2856 TypeSize WidthN(0, false);
2857 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2858 WidthN, TRI))
2859 return false;
2860 // The maximum vscale is 16 under AArch64, return the maximal extent for the
2861 // vector.
2862 Width = LocationSize::precise(WidthN);
2863 BaseOps.push_back(BaseOp);
2864 return true;
2867 std::optional<ExtAddrMode>
2868 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2869 const TargetRegisterInfo *TRI) const {
2870 const MachineOperand *Base; // Filled with the base operand of MI.
2871 int64_t Offset; // Filled with the offset of MI.
2872 bool OffsetIsScalable;
2873 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2874 return std::nullopt;
2876 if (!Base->isReg())
2877 return std::nullopt;
2878 ExtAddrMode AM;
2879 AM.BaseReg = Base->getReg();
2880 AM.Displacement = Offset;
2881 AM.ScaledReg = 0;
2882 AM.Scale = 0;
2883 return AM;
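// Check whether AddrI, an address-computing instruction defining Reg, can be
// folded into the addressing mode of the memory instruction MemI that uses
// Reg. A minimal sketch of the simplest case (illustrative):
// add x8, x9, #16
// ldr x0, [x8, #8]
// ->
// ldr x0, [x9, #24]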
2886 bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
2887 Register Reg,
2888 const MachineInstr &AddrI,
2889 ExtAddrMode &AM) const {
2890 // Filter out instructions into which we cannot fold.
2891 unsigned NumBytes;
2892 int64_t OffsetScale = 1;
2893 switch (MemI.getOpcode()) {
2894 default:
2895 return false;
2897 case AArch64::LDURQi:
2898 case AArch64::STURQi:
2899 NumBytes = 16;
2900 break;
2902 case AArch64::LDURDi:
2903 case AArch64::STURDi:
2904 case AArch64::LDURXi:
2905 case AArch64::STURXi:
2906 NumBytes = 8;
2907 break;
2909 case AArch64::LDURWi:
2910 case AArch64::LDURSWi:
2911 case AArch64::STURWi:
2912 NumBytes = 4;
2913 break;
2915 case AArch64::LDURHi:
2916 case AArch64::STURHi:
2917 case AArch64::LDURHHi:
2918 case AArch64::STURHHi:
2919 case AArch64::LDURSHXi:
2920 case AArch64::LDURSHWi:
2921 NumBytes = 2;
2922 break;
2924 case AArch64::LDRBroX:
2925 case AArch64::LDRBBroX:
2926 case AArch64::LDRSBXroX:
2927 case AArch64::LDRSBWroX:
2928 case AArch64::STRBroX:
2929 case AArch64::STRBBroX:
2930 case AArch64::LDURBi:
2931 case AArch64::LDURBBi:
2932 case AArch64::LDURSBXi:
2933 case AArch64::LDURSBWi:
2934 case AArch64::STURBi:
2935 case AArch64::STURBBi:
2936 case AArch64::LDRBui:
2937 case AArch64::LDRBBui:
2938 case AArch64::LDRSBXui:
2939 case AArch64::LDRSBWui:
2940 case AArch64::STRBui:
2941 case AArch64::STRBBui:
2942 NumBytes = 1;
2943 break;
2945 case AArch64::LDRQroX:
2946 case AArch64::STRQroX:
2947 case AArch64::LDRQui:
2948 case AArch64::STRQui:
2949 NumBytes = 16;
2950 OffsetScale = 16;
2951 break;
2953 case AArch64::LDRDroX:
2954 case AArch64::STRDroX:
2955 case AArch64::LDRXroX:
2956 case AArch64::STRXroX:
2957 case AArch64::LDRDui:
2958 case AArch64::STRDui:
2959 case AArch64::LDRXui:
2960 case AArch64::STRXui:
2961 NumBytes = 8;
2962 OffsetScale = 8;
2963 break;
2965 case AArch64::LDRWroX:
2966 case AArch64::LDRSWroX:
2967 case AArch64::STRWroX:
2968 case AArch64::LDRWui:
2969 case AArch64::LDRSWui:
2970 case AArch64::STRWui:
2971 NumBytes = 4;
2972 OffsetScale = 4;
2973 break;
2975 case AArch64::LDRHroX:
2976 case AArch64::STRHroX:
2977 case AArch64::LDRHHroX:
2978 case AArch64::STRHHroX:
2979 case AArch64::LDRSHXroX:
2980 case AArch64::LDRSHWroX:
2981 case AArch64::LDRHui:
2982 case AArch64::STRHui:
2983 case AArch64::LDRHHui:
2984 case AArch64::STRHHui:
2985 case AArch64::LDRSHXui:
2986 case AArch64::LDRSHWui:
2987 NumBytes = 2;
2988 OffsetScale = 2;
2989 break;
2992 // Check the fold operand is not the loaded/stored value.
2993 const MachineOperand &BaseRegOp = MemI.getOperand(0);
2994 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
2995 return false;
2997 // Handle memory instructions with a [Reg, Reg] addressing mode.
2998 if (MemI.getOperand(2).isReg()) {
2999 // Bail if the addressing mode already includes extension of the offset
3000 // register.
3001 if (MemI.getOperand(3).getImm())
3002 return false;
3004 // Check if we actually have a scaled offset.
3005 if (MemI.getOperand(4).getImm() == 0)
3006 OffsetScale = 1;
3008 // If the address instruction is folded into the base register, then the
3009 // addressing mode must not have a scale. Then we can swap the base and the
3010 // scaled registers.
3011 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3012 return false;
3014 switch (AddrI.getOpcode()) {
3015 default:
3016 return false;
3018 case AArch64::SBFMXri:
3019 // sxtw Xa, Wm
3020 // ldr Xd, [Xn, Xa, lsl #N]
3021 // ->
3022 // ldr Xd, [Xn, Wm, sxtw #N]
3023 if (AddrI.getOperand(2).getImm() != 0 ||
3024 AddrI.getOperand(3).getImm() != 31)
3025 return false;
3027 AM.BaseReg = MemI.getOperand(1).getReg();
3028 if (AM.BaseReg == Reg)
3029 AM.BaseReg = MemI.getOperand(2).getReg();
3030 AM.ScaledReg = AddrI.getOperand(1).getReg();
3031 AM.Scale = OffsetScale;
3032 AM.Displacement = 0;
3033 AM.Form = ExtAddrMode::Formula::SExtScaledReg;
3034 return true;
3036 case TargetOpcode::SUBREG_TO_REG: {
3037 // mov Wa, Wm
3038 // ldr Xd, [Xn, Xa, lsl #N]
3039 // ->
3040 // ldr Xd, [Xn, Wm, uxtw #N]
3042 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3043 if (AddrI.getOperand(1).getImm() != 0 ||
3044 AddrI.getOperand(3).getImm() != AArch64::sub_32)
3045 return false;
3047 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3048 Register OffsetReg = AddrI.getOperand(2).getReg();
3049 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3050 return false;
3052 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3053 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3054 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3055 DefMI.getOperand(3).getImm() != 0)
3056 return false;
3058 AM.BaseReg = MemI.getOperand(1).getReg();
3059 if (AM.BaseReg == Reg)
3060 AM.BaseReg = MemI.getOperand(2).getReg();
3061 AM.ScaledReg = DefMI.getOperand(2).getReg();
3062 AM.Scale = OffsetScale;
3063 AM.Displacement = 0;
3064 AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
3065 return true;
3070 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3072 // Check we are not breaking a potential conversion to an LDP.
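// E.g. (illustrative) for an 8-byte access the LDP form only encodes offsets
// in [-512, 504]; folding 'add x1, x1, #1024' into 'ldr x0, [x1]' would move a
// previously pairable offset out of that range, so the fold is rejected.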
3073 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3074 int64_t NewOffset) -> bool {
3075 int64_t MinOffset, MaxOffset;
3076 switch (NumBytes) {
3077 default:
3078 return true;
3079 case 4:
3080 MinOffset = -256;
3081 MaxOffset = 252;
3082 break;
3083 case 8:
3084 MinOffset = -512;
3085 MaxOffset = 504;
3086 break;
3087 case 16:
3088 MinOffset = -1024;
3089 MaxOffset = 1008;
3090 break;
3092 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3093 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3095 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3096 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3097 int64_t NewOffset = OldOffset + Disp;
3098 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3099 return false;
3100 // If the old offset would fit into an LDP, but the new offset wouldn't,
3101 // bail out.
3102 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3103 return false;
3104 AM.BaseReg = AddrI.getOperand(1).getReg();
3105 AM.ScaledReg = 0;
3106 AM.Scale = 0;
3107 AM.Displacement = NewOffset;
3108 AM.Form = ExtAddrMode::Formula::Basic;
3109 return true;
3112 auto canFoldAddRegIntoAddrMode =
3113 [&](int64_t Scale,
3114 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
3115 if (MemI.getOperand(2).getImm() != 0)
3116 return false;
3117 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3118 return false;
3119 AM.BaseReg = AddrI.getOperand(1).getReg();
3120 AM.ScaledReg = AddrI.getOperand(2).getReg();
3121 AM.Scale = Scale;
3122 AM.Displacement = 0;
3123 AM.Form = Form;
3124 return true;
3127 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3128 unsigned Opcode = MemI.getOpcode();
3129 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3130 Subtarget.isSTRQroSlow();
3133 int64_t Disp = 0;
3134 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3135 switch (AddrI.getOpcode()) {
3136 default:
3137 return false;
3139 case AArch64::ADDXri:
3140 // add Xa, Xn, #N
3141 // ldr Xd, [Xa, #M]
3142 // ->
3143 // ldr Xd, [Xn, #N'+M]
3144 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3145 return canFoldAddSubImmIntoAddrMode(Disp);
3147 case AArch64::SUBXri:
3148 // sub Xa, Xn, #N
3149 // ldr Xd, [Xa, #M]
3150 // ->
3151 // ldr Xd, [Xn, #N'+M]
3152 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3153 return canFoldAddSubImmIntoAddrMode(-Disp);
3155 case AArch64::ADDXrs: {
3156 // add Xa, Xn, Xm, lsl #N
3157 // ldr Xd, [Xa]
3158 // ->
3159 // ldr Xd, [Xn, Xm, lsl #N]
3161 // Don't fold the add if the result would be slower, unless optimising for
3162 // size.
3163 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3164 if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL)
3165 return false;
3166 Shift = AArch64_AM::getShiftValue(Shift);
3167 if (!OptSize) {
3168 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3169 return false;
3170 if (avoidSlowSTRQ(MemI))
3171 return false;
3173 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3176 case AArch64::ADDXrr:
3177 // add Xa, Xn, Xm
3178 // ldr Xd, [Xa]
3179 // ->
3180 // ldr Xd, [Xn, Xm, lsl #0]
3182 // Don't fold the add if the result would be slower, unless optimising for
3183 // size.
3184 if (!OptSize && avoidSlowSTRQ(MemI))
3185 return false;
3186 return canFoldAddRegIntoAddrMode(1);
3188 case AArch64::ADDXrx:
3189 // add Xa, Xn, Wm, {s,u}xtw #N
3190 // ldr Xd, [Xa]
3191 // ->
3192 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3194 // Don't fold the add if the result would be slower, unless optimising for
3195 // size.
3196 if (!OptSize && avoidSlowSTRQ(MemI))
3197 return false;
3199 // Can fold only sign-/zero-extend of a word.
3200 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3201 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3202 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3203 return false;
3205 return canFoldAddRegIntoAddrMode(
3206 1ULL << AArch64_AM::getArithShiftValue(Imm),
3207 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3208 : ExtAddrMode::Formula::ZExtScaledReg);
3212 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3213 // return the opcode of an instruction performing the same operation, but using
3214 // the [Reg, Reg] addressing mode.
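// E.g. (illustrative) LDRXui ('ldr x0, [x1, #8]') maps to LDRXroX
// ('ldr x0, [x1, x2, lsl #3]').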
3215 static unsigned regOffsetOpcode(unsigned Opcode) {
3216 switch (Opcode) {
3217 default:
3218 llvm_unreachable("Address folding not implemented for instruction");
3220 case AArch64::LDURQi:
3221 case AArch64::LDRQui:
3222 return AArch64::LDRQroX;
3223 case AArch64::STURQi:
3224 case AArch64::STRQui:
3225 return AArch64::STRQroX;
3226 case AArch64::LDURDi:
3227 case AArch64::LDRDui:
3228 return AArch64::LDRDroX;
3229 case AArch64::STURDi:
3230 case AArch64::STRDui:
3231 return AArch64::STRDroX;
3232 case AArch64::LDURXi:
3233 case AArch64::LDRXui:
3234 return AArch64::LDRXroX;
3235 case AArch64::STURXi:
3236 case AArch64::STRXui:
3237 return AArch64::STRXroX;
3238 case AArch64::LDURWi:
3239 case AArch64::LDRWui:
3240 return AArch64::LDRWroX;
3241 case AArch64::LDURSWi:
3242 case AArch64::LDRSWui:
3243 return AArch64::LDRSWroX;
3244 case AArch64::STURWi:
3245 case AArch64::STRWui:
3246 return AArch64::STRWroX;
3247 case AArch64::LDURHi:
3248 case AArch64::LDRHui:
3249 return AArch64::LDRHroX;
3250 case AArch64::STURHi:
3251 case AArch64::STRHui:
3252 return AArch64::STRHroX;
3253 case AArch64::LDURHHi:
3254 case AArch64::LDRHHui:
3255 return AArch64::LDRHHroX;
3256 case AArch64::STURHHi:
3257 case AArch64::STRHHui:
3258 return AArch64::STRHHroX;
3259 case AArch64::LDURSHXi:
3260 case AArch64::LDRSHXui:
3261 return AArch64::LDRSHXroX;
3262 case AArch64::LDURSHWi:
3263 case AArch64::LDRSHWui:
3264 return AArch64::LDRSHWroX;
3265 case AArch64::LDURBi:
3266 case AArch64::LDRBui:
3267 return AArch64::LDRBroX;
3268 case AArch64::LDURBBi:
3269 case AArch64::LDRBBui:
3270 return AArch64::LDRBBroX;
3271 case AArch64::LDURSBXi:
3272 case AArch64::LDRSBXui:
3273 return AArch64::LDRSBXroX;
3274 case AArch64::LDURSBWi:
3275 case AArch64::LDRSBWui:
3276 return AArch64::LDRSBWroX;
3277 case AArch64::STURBi:
3278 case AArch64::STRBui:
3279 return AArch64::STRBroX;
3280 case AArch64::STURBBi:
3281 case AArch64::STRBBui:
3282 return AArch64::STRBBroX;
3286 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3287 // the opcode of an instruction performing the same operation, but using the
3288 // [Reg, #Imm] addressing mode with scaled offset.
3289 unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3290 switch (Opcode) {
3291 default:
3292 llvm_unreachable("Address folding not implemented for instruction");
3294 case AArch64::LDURQi:
3295 Scale = 16;
3296 return AArch64::LDRQui;
3297 case AArch64::STURQi:
3298 Scale = 16;
3299 return AArch64::STRQui;
3300 case AArch64::LDURDi:
3301 Scale = 8;
3302 return AArch64::LDRDui;
3303 case AArch64::STURDi:
3304 Scale = 8;
3305 return AArch64::STRDui;
3306 case AArch64::LDURXi:
3307 Scale = 8;
3308 return AArch64::LDRXui;
3309 case AArch64::STURXi:
3310 Scale = 8;
3311 return AArch64::STRXui;
3312 case AArch64::LDURWi:
3313 Scale = 4;
3314 return AArch64::LDRWui;
3315 case AArch64::LDURSWi:
3316 Scale = 4;
3317 return AArch64::LDRSWui;
3318 case AArch64::STURWi:
3319 Scale = 4;
3320 return AArch64::STRWui;
3321 case AArch64::LDURHi:
3322 Scale = 2;
3323 return AArch64::LDRHui;
3324 case AArch64::STURHi:
3325 Scale = 2;
3326 return AArch64::STRHui;
3327 case AArch64::LDURHHi:
3328 Scale = 2;
3329 return AArch64::LDRHHui;
3330 case AArch64::STURHHi:
3331 Scale = 2;
3332 return AArch64::STRHHui;
3333 case AArch64::LDURSHXi:
3334 Scale = 2;
3335 return AArch64::LDRSHXui;
3336 case AArch64::LDURSHWi:
3337 Scale = 2;
3338 return AArch64::LDRSHWui;
3339 case AArch64::LDURBi:
3340 Scale = 1;
3341 return AArch64::LDRBui;
3342 case AArch64::LDURBBi:
3343 Scale = 1;
3344 return AArch64::LDRBBui;
3345 case AArch64::LDURSBXi:
3346 Scale = 1;
3347 return AArch64::LDRSBXui;
3348 case AArch64::LDURSBWi:
3349 Scale = 1;
3350 return AArch64::LDRSBWui;
3351 case AArch64::STURBi:
3352 Scale = 1;
3353 return AArch64::STRBui;
3354 case AArch64::STURBBi:
3355 Scale = 1;
3356 return AArch64::STRBBui;
3357 case AArch64::LDRQui:
3358 case AArch64::STRQui:
3359 Scale = 16;
3360 return Opcode;
3361 case AArch64::LDRDui:
3362 case AArch64::STRDui:
3363 case AArch64::LDRXui:
3364 case AArch64::STRXui:
3365 Scale = 8;
3366 return Opcode;
3367 case AArch64::LDRWui:
3368 case AArch64::LDRSWui:
3369 case AArch64::STRWui:
3370 Scale = 4;
3371 return Opcode;
3372 case AArch64::LDRHui:
3373 case AArch64::STRHui:
3374 case AArch64::LDRHHui:
3375 case AArch64::STRHHui:
3376 case AArch64::LDRSHXui:
3377 case AArch64::LDRSHWui:
3378 Scale = 2;
3379 return Opcode;
3380 case AArch64::LDRBui:
3381 case AArch64::LDRBBui:
3382 case AArch64::LDRSBXui:
3383 case AArch64::LDRSBWui:
3384 case AArch64::STRBui:
3385 case AArch64::STRBBui:
3386 Scale = 1;
3387 return Opcode;
3391 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3392 // the opcode of an instruction performing the same operation, but using the
3393 // [Reg, #Imm] addressing mode with unscaled offset.
3394 unsigned unscaledOffsetOpcode(unsigned Opcode) {
3395 switch (Opcode) {
3396 default:
3397 llvm_unreachable("Address folding not implemented for instruction");
3399 case AArch64::LDURQi:
3400 case AArch64::STURQi:
3401 case AArch64::LDURDi:
3402 case AArch64::STURDi:
3403 case AArch64::LDURXi:
3404 case AArch64::STURXi:
3405 case AArch64::LDURWi:
3406 case AArch64::LDURSWi:
3407 case AArch64::STURWi:
3408 case AArch64::LDURHi:
3409 case AArch64::STURHi:
3410 case AArch64::LDURHHi:
3411 case AArch64::STURHHi:
3412 case AArch64::LDURSHXi:
3413 case AArch64::LDURSHWi:
3414 case AArch64::LDURBi:
3415 case AArch64::STURBi:
3416 case AArch64::LDURBBi:
3417 case AArch64::STURBBi:
3418 case AArch64::LDURSBWi:
3419 case AArch64::LDURSBXi:
3420 return Opcode;
3421 case AArch64::LDRQui:
3422 return AArch64::LDURQi;
3423 case AArch64::STRQui:
3424 return AArch64::STURQi;
3425 case AArch64::LDRDui:
3426 return AArch64::LDURDi;
3427 case AArch64::STRDui:
3428 return AArch64::STURDi;
3429 case AArch64::LDRXui:
3430 return AArch64::LDURXi;
3431 case AArch64::STRXui:
3432 return AArch64::STURXi;
3433 case AArch64::LDRWui:
3434 return AArch64::LDURWi;
3435 case AArch64::LDRSWui:
3436 return AArch64::LDURSWi;
3437 case AArch64::STRWui:
3438 return AArch64::STURWi;
3439 case AArch64::LDRHui:
3440 return AArch64::LDURHi;
3441 case AArch64::STRHui:
3442 return AArch64::STURHi;
3443 case AArch64::LDRHHui:
3444 return AArch64::LDURHHi;
3445 case AArch64::STRHHui:
3446 return AArch64::STURHHi;
3447 case AArch64::LDRSHXui:
3448 return AArch64::LDURSHXi;
3449 case AArch64::LDRSHWui:
3450 return AArch64::LDURSHWi;
3451 case AArch64::LDRBBui:
3452 return AArch64::LDURBBi;
3453 case AArch64::LDRBui:
3454 return AArch64::LDURBi;
3455 case AArch64::STRBBui:
3456 return AArch64::STURBBi;
3457 case AArch64::STRBui:
3458 return AArch64::STURBi;
3459 case AArch64::LDRSBWui:
3460 return AArch64::LDURSBWi;
3461 case AArch64::LDRSBXui:
3462 return AArch64::LDURSBXi;
3466 // Given the opcode of a memory load/store instruction, return the opcode of an
3467 // instruction performing the same operation, but using
3468 // the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3469 // offset register.
3470 static unsigned offsetExtendOpcode(unsigned Opcode) {
3471 switch (Opcode) {
3472 default:
3473 llvm_unreachable("Address folding not implemented for instruction");
3475 case AArch64::LDRQroX:
3476 case AArch64::LDURQi:
3477 case AArch64::LDRQui:
3478 return AArch64::LDRQroW;
3479 case AArch64::STRQroX:
3480 case AArch64::STURQi:
3481 case AArch64::STRQui:
3482 return AArch64::STRQroW;
3483 case AArch64::LDRDroX:
3484 case AArch64::LDURDi:
3485 case AArch64::LDRDui:
3486 return AArch64::LDRDroW;
3487 case AArch64::STRDroX:
3488 case AArch64::STURDi:
3489 case AArch64::STRDui:
3490 return AArch64::STRDroW;
3491 case AArch64::LDRXroX:
3492 case AArch64::LDURXi:
3493 case AArch64::LDRXui:
3494 return AArch64::LDRXroW;
3495 case AArch64::STRXroX:
3496 case AArch64::STURXi:
3497 case AArch64::STRXui:
3498 return AArch64::STRXroW;
3499 case AArch64::LDRWroX:
3500 case AArch64::LDURWi:
3501 case AArch64::LDRWui:
3502 return AArch64::LDRWroW;
3503 case AArch64::LDRSWroX:
3504 case AArch64::LDURSWi:
3505 case AArch64::LDRSWui:
3506 return AArch64::LDRSWroW;
3507 case AArch64::STRWroX:
3508 case AArch64::STURWi:
3509 case AArch64::STRWui:
3510 return AArch64::STRWroW;
3511 case AArch64::LDRHroX:
3512 case AArch64::LDURHi:
3513 case AArch64::LDRHui:
3514 return AArch64::LDRHroW;
3515 case AArch64::STRHroX:
3516 case AArch64::STURHi:
3517 case AArch64::STRHui:
3518 return AArch64::STRHroW;
3519 case AArch64::LDRHHroX:
3520 case AArch64::LDURHHi:
3521 case AArch64::LDRHHui:
3522 return AArch64::LDRHHroW;
3523 case AArch64::STRHHroX:
3524 case AArch64::STURHHi:
3525 case AArch64::STRHHui:
3526 return AArch64::STRHHroW;
3527 case AArch64::LDRSHXroX:
3528 case AArch64::LDURSHXi:
3529 case AArch64::LDRSHXui:
3530 return AArch64::LDRSHXroW;
3531 case AArch64::LDRSHWroX:
3532 case AArch64::LDURSHWi:
3533 case AArch64::LDRSHWui:
3534 return AArch64::LDRSHWroW;
3535 case AArch64::LDRBroX:
3536 case AArch64::LDURBi:
3537 case AArch64::LDRBui:
3538 return AArch64::LDRBroW;
3539 case AArch64::LDRBBroX:
3540 case AArch64::LDURBBi:
3541 case AArch64::LDRBBui:
3542 return AArch64::LDRBBroW;
3543 case AArch64::LDRSBXroX:
3544 case AArch64::LDURSBXi:
3545 case AArch64::LDRSBXui:
3546 return AArch64::LDRSBXroW;
3547 case AArch64::LDRSBWroX:
3548 case AArch64::LDURSBWi:
3549 case AArch64::LDRSBWui:
3550 return AArch64::LDRSBWroW;
3551 case AArch64::STRBroX:
3552 case AArch64::STURBi:
3553 case AArch64::STRBui:
3554 return AArch64::STRBroW;
3555 case AArch64::STRBBroX:
3556 case AArch64::STURBBi:
3557 case AArch64::STRBBui:
3558 return AArch64::STRBBroW;
3562 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
3563 const ExtAddrMode &AM) const {
3565 const DebugLoc &DL = MemI.getDebugLoc();
3566 MachineBasicBlock &MBB = *MemI.getParent();
3567 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
3569 if (AM.Form == ExtAddrMode::Formula::Basic) {
3570 if (AM.ScaledReg) {
3571 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
3572 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
3573 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3574 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3575 .addReg(MemI.getOperand(0).getReg(),
3576 MemI.mayLoad() ? RegState::Define : 0)
3577 .addReg(AM.BaseReg)
3578 .addReg(AM.ScaledReg)
3579 .addImm(0)
3580 .addImm(AM.Scale > 1)
3581 .setMemRefs(MemI.memoperands())
3582 .setMIFlags(MemI.getFlags());
3583 return B.getInstr();
3586 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
3587 "Addressing mode not supported for folding");
3589 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
3590 unsigned Scale = 1;
3591 unsigned Opcode = MemI.getOpcode();
3592 if (isInt<9>(AM.Displacement))
3593 Opcode = unscaledOffsetOpcode(Opcode);
3594 else
3595 Opcode = scaledOffsetOpcode(Opcode, Scale);
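// For example, a displacement of -8 fits in a signed 9-bit immediate and
// selects the unscaled LDUR form (Scale stays 1), whereas a displacement of
// 4096 keeps the scaled LDR form and is divided by the access size below.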
3597 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3598 .addReg(MemI.getOperand(0).getReg(),
3599 MemI.mayLoad() ? RegState::Define : 0)
3600 .addReg(AM.BaseReg)
3601 .addImm(AM.Displacement / Scale)
3602 .setMemRefs(MemI.memoperands())
3603 .setMIFlags(MemI.getFlags());
3604 return B.getInstr();
3607 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
3608 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
3609 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
3610 assert(AM.ScaledReg && !AM.Displacement &&
3611 "Address offset can be a register or an immediate, but not both");
3612 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
3613 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3614 // Make sure the offset register is in the correct register class.
3615 Register OffsetReg = AM.ScaledReg;
3616 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
3617 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
3618 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3619 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
3620 .addReg(AM.ScaledReg, 0, AArch64::sub_32);
3622 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3623 .addReg(MemI.getOperand(0).getReg(),
3624 MemI.mayLoad() ? RegState::Define : 0)
3625 .addReg(AM.BaseReg)
3626 .addReg(OffsetReg)
3627 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
3628 .addImm(AM.Scale != 1)
3629 .setMemRefs(MemI.memoperands())
3630 .setMIFlags(MemI.getFlags());
3632 return B.getInstr();
3635 llvm_unreachable(
3636 "Function must not be called with an addressing mode it can't handle");
3639 /// Return true if the opcode is a post-index ld/st instruction, which really
3640 /// loads from or stores to base+0.
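/// For example, LDRXpost (`ldr x0, [x1], #8`) reads from [x1] and only then
/// increments x1 by 8, so its effective memory offset is 0.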
3641 static bool isPostIndexLdStOpcode(unsigned Opcode) {
3642 switch (Opcode) {
3643 default:
3644 return false;
3645 case AArch64::LD1Fourv16b_POST:
3646 case AArch64::LD1Fourv1d_POST:
3647 case AArch64::LD1Fourv2d_POST:
3648 case AArch64::LD1Fourv2s_POST:
3649 case AArch64::LD1Fourv4h_POST:
3650 case AArch64::LD1Fourv4s_POST:
3651 case AArch64::LD1Fourv8b_POST:
3652 case AArch64::LD1Fourv8h_POST:
3653 case AArch64::LD1Onev16b_POST:
3654 case AArch64::LD1Onev1d_POST:
3655 case AArch64::LD1Onev2d_POST:
3656 case AArch64::LD1Onev2s_POST:
3657 case AArch64::LD1Onev4h_POST:
3658 case AArch64::LD1Onev4s_POST:
3659 case AArch64::LD1Onev8b_POST:
3660 case AArch64::LD1Onev8h_POST:
3661 case AArch64::LD1Rv16b_POST:
3662 case AArch64::LD1Rv1d_POST:
3663 case AArch64::LD1Rv2d_POST:
3664 case AArch64::LD1Rv2s_POST:
3665 case AArch64::LD1Rv4h_POST:
3666 case AArch64::LD1Rv4s_POST:
3667 case AArch64::LD1Rv8b_POST:
3668 case AArch64::LD1Rv8h_POST:
3669 case AArch64::LD1Threev16b_POST:
3670 case AArch64::LD1Threev1d_POST:
3671 case AArch64::LD1Threev2d_POST:
3672 case AArch64::LD1Threev2s_POST:
3673 case AArch64::LD1Threev4h_POST:
3674 case AArch64::LD1Threev4s_POST:
3675 case AArch64::LD1Threev8b_POST:
3676 case AArch64::LD1Threev8h_POST:
3677 case AArch64::LD1Twov16b_POST:
3678 case AArch64::LD1Twov1d_POST:
3679 case AArch64::LD1Twov2d_POST:
3680 case AArch64::LD1Twov2s_POST:
3681 case AArch64::LD1Twov4h_POST:
3682 case AArch64::LD1Twov4s_POST:
3683 case AArch64::LD1Twov8b_POST:
3684 case AArch64::LD1Twov8h_POST:
3685 case AArch64::LD1i16_POST:
3686 case AArch64::LD1i32_POST:
3687 case AArch64::LD1i64_POST:
3688 case AArch64::LD1i8_POST:
3689 case AArch64::LD2Rv16b_POST:
3690 case AArch64::LD2Rv1d_POST:
3691 case AArch64::LD2Rv2d_POST:
3692 case AArch64::LD2Rv2s_POST:
3693 case AArch64::LD2Rv4h_POST:
3694 case AArch64::LD2Rv4s_POST:
3695 case AArch64::LD2Rv8b_POST:
3696 case AArch64::LD2Rv8h_POST:
3697 case AArch64::LD2Twov16b_POST:
3698 case AArch64::LD2Twov2d_POST:
3699 case AArch64::LD2Twov2s_POST:
3700 case AArch64::LD2Twov4h_POST:
3701 case AArch64::LD2Twov4s_POST:
3702 case AArch64::LD2Twov8b_POST:
3703 case AArch64::LD2Twov8h_POST:
3704 case AArch64::LD2i16_POST:
3705 case AArch64::LD2i32_POST:
3706 case AArch64::LD2i64_POST:
3707 case AArch64::LD2i8_POST:
3708 case AArch64::LD3Rv16b_POST:
3709 case AArch64::LD3Rv1d_POST:
3710 case AArch64::LD3Rv2d_POST:
3711 case AArch64::LD3Rv2s_POST:
3712 case AArch64::LD3Rv4h_POST:
3713 case AArch64::LD3Rv4s_POST:
3714 case AArch64::LD3Rv8b_POST:
3715 case AArch64::LD3Rv8h_POST:
3716 case AArch64::LD3Threev16b_POST:
3717 case AArch64::LD3Threev2d_POST:
3718 case AArch64::LD3Threev2s_POST:
3719 case AArch64::LD3Threev4h_POST:
3720 case AArch64::LD3Threev4s_POST:
3721 case AArch64::LD3Threev8b_POST:
3722 case AArch64::LD3Threev8h_POST:
3723 case AArch64::LD3i16_POST:
3724 case AArch64::LD3i32_POST:
3725 case AArch64::LD3i64_POST:
3726 case AArch64::LD3i8_POST:
3727 case AArch64::LD4Fourv16b_POST:
3728 case AArch64::LD4Fourv2d_POST:
3729 case AArch64::LD4Fourv2s_POST:
3730 case AArch64::LD4Fourv4h_POST:
3731 case AArch64::LD4Fourv4s_POST:
3732 case AArch64::LD4Fourv8b_POST:
3733 case AArch64::LD4Fourv8h_POST:
3734 case AArch64::LD4Rv16b_POST:
3735 case AArch64::LD4Rv1d_POST:
3736 case AArch64::LD4Rv2d_POST:
3737 case AArch64::LD4Rv2s_POST:
3738 case AArch64::LD4Rv4h_POST:
3739 case AArch64::LD4Rv4s_POST:
3740 case AArch64::LD4Rv8b_POST:
3741 case AArch64::LD4Rv8h_POST:
3742 case AArch64::LD4i16_POST:
3743 case AArch64::LD4i32_POST:
3744 case AArch64::LD4i64_POST:
3745 case AArch64::LD4i8_POST:
3746 case AArch64::LDAPRWpost:
3747 case AArch64::LDAPRXpost:
3748 case AArch64::LDIAPPWpost:
3749 case AArch64::LDIAPPXpost:
3750 case AArch64::LDPDpost:
3751 case AArch64::LDPQpost:
3752 case AArch64::LDPSWpost:
3753 case AArch64::LDPSpost:
3754 case AArch64::LDPWpost:
3755 case AArch64::LDPXpost:
3756 case AArch64::LDRBBpost:
3757 case AArch64::LDRBpost:
3758 case AArch64::LDRDpost:
3759 case AArch64::LDRHHpost:
3760 case AArch64::LDRHpost:
3761 case AArch64::LDRQpost:
3762 case AArch64::LDRSBWpost:
3763 case AArch64::LDRSBXpost:
3764 case AArch64::LDRSHWpost:
3765 case AArch64::LDRSHXpost:
3766 case AArch64::LDRSWpost:
3767 case AArch64::LDRSpost:
3768 case AArch64::LDRWpost:
3769 case AArch64::LDRXpost:
3770 case AArch64::ST1Fourv16b_POST:
3771 case AArch64::ST1Fourv1d_POST:
3772 case AArch64::ST1Fourv2d_POST:
3773 case AArch64::ST1Fourv2s_POST:
3774 case AArch64::ST1Fourv4h_POST:
3775 case AArch64::ST1Fourv4s_POST:
3776 case AArch64::ST1Fourv8b_POST:
3777 case AArch64::ST1Fourv8h_POST:
3778 case AArch64::ST1Onev16b_POST:
3779 case AArch64::ST1Onev1d_POST:
3780 case AArch64::ST1Onev2d_POST:
3781 case AArch64::ST1Onev2s_POST:
3782 case AArch64::ST1Onev4h_POST:
3783 case AArch64::ST1Onev4s_POST:
3784 case AArch64::ST1Onev8b_POST:
3785 case AArch64::ST1Onev8h_POST:
3786 case AArch64::ST1Threev16b_POST:
3787 case AArch64::ST1Threev1d_POST:
3788 case AArch64::ST1Threev2d_POST:
3789 case AArch64::ST1Threev2s_POST:
3790 case AArch64::ST1Threev4h_POST:
3791 case AArch64::ST1Threev4s_POST:
3792 case AArch64::ST1Threev8b_POST:
3793 case AArch64::ST1Threev8h_POST:
3794 case AArch64::ST1Twov16b_POST:
3795 case AArch64::ST1Twov1d_POST:
3796 case AArch64::ST1Twov2d_POST:
3797 case AArch64::ST1Twov2s_POST:
3798 case AArch64::ST1Twov4h_POST:
3799 case AArch64::ST1Twov4s_POST:
3800 case AArch64::ST1Twov8b_POST:
3801 case AArch64::ST1Twov8h_POST:
3802 case AArch64::ST1i16_POST:
3803 case AArch64::ST1i32_POST:
3804 case AArch64::ST1i64_POST:
3805 case AArch64::ST1i8_POST:
3806 case AArch64::ST2GPostIndex:
3807 case AArch64::ST2Twov16b_POST:
3808 case AArch64::ST2Twov2d_POST:
3809 case AArch64::ST2Twov2s_POST:
3810 case AArch64::ST2Twov4h_POST:
3811 case AArch64::ST2Twov4s_POST:
3812 case AArch64::ST2Twov8b_POST:
3813 case AArch64::ST2Twov8h_POST:
3814 case AArch64::ST2i16_POST:
3815 case AArch64::ST2i32_POST:
3816 case AArch64::ST2i64_POST:
3817 case AArch64::ST2i8_POST:
3818 case AArch64::ST3Threev16b_POST:
3819 case AArch64::ST3Threev2d_POST:
3820 case AArch64::ST3Threev2s_POST:
3821 case AArch64::ST3Threev4h_POST:
3822 case AArch64::ST3Threev4s_POST:
3823 case AArch64::ST3Threev8b_POST:
3824 case AArch64::ST3Threev8h_POST:
3825 case AArch64::ST3i16_POST:
3826 case AArch64::ST3i32_POST:
3827 case AArch64::ST3i64_POST:
3828 case AArch64::ST3i8_POST:
3829 case AArch64::ST4Fourv16b_POST:
3830 case AArch64::ST4Fourv2d_POST:
3831 case AArch64::ST4Fourv2s_POST:
3832 case AArch64::ST4Fourv4h_POST:
3833 case AArch64::ST4Fourv4s_POST:
3834 case AArch64::ST4Fourv8b_POST:
3835 case AArch64::ST4Fourv8h_POST:
3836 case AArch64::ST4i16_POST:
3837 case AArch64::ST4i32_POST:
3838 case AArch64::ST4i64_POST:
3839 case AArch64::ST4i8_POST:
3840 case AArch64::STGPostIndex:
3841 case AArch64::STGPpost:
3842 case AArch64::STPDpost:
3843 case AArch64::STPQpost:
3844 case AArch64::STPSpost:
3845 case AArch64::STPWpost:
3846 case AArch64::STPXpost:
3847 case AArch64::STRBBpost:
3848 case AArch64::STRBpost:
3849 case AArch64::STRDpost:
3850 case AArch64::STRHHpost:
3851 case AArch64::STRHpost:
3852 case AArch64::STRQpost:
3853 case AArch64::STRSpost:
3854 case AArch64::STRWpost:
3855 case AArch64::STRXpost:
3856 case AArch64::STZ2GPostIndex:
3857 case AArch64::STZGPostIndex:
3858 return true;
3862 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
3863 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
3864 bool &OffsetIsScalable, TypeSize &Width,
3865 const TargetRegisterInfo *TRI) const {
3866 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3867 // Handle only loads/stores with base register followed by immediate offset.
3868 if (LdSt.getNumExplicitOperands() == 3) {
3869 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
3870 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
3871 !LdSt.getOperand(2).isImm())
3872 return false;
3873 } else if (LdSt.getNumExplicitOperands() == 4) {
3874 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
3875 if (!LdSt.getOperand(1).isReg() ||
3876 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
3877 !LdSt.getOperand(3).isImm())
3878 return false;
3879 } else
3880 return false;
3882 // Get the scaling factor for the instruction and set the width of the
3883 // memory access.
3884 TypeSize Scale(0U, false);
3885 int64_t Dummy1, Dummy2;
3887 // If this returns false, then it's an instruction we don't want to handle.
3888 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
3889 return false;
3891 // Compute the offset. The offset is the immediate operand multiplied by the
3892 // scaling factor; unscaled instructions have a scaling factor of 1.
3893 // Post-index instructions are a special case and have an offset of 0.
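// For example, LDRXui with an immediate operand of 2 (Scale = 8) yields
// Offset = 16, while LDURXi with an immediate of -8 (Scale = 1) yields -8.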
3894 if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
3895 BaseOp = &LdSt.getOperand(2);
3896 Offset = 0;
3897 } else if (LdSt.getNumExplicitOperands() == 3) {
3898 BaseOp = &LdSt.getOperand(1);
3899 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
3900 } else {
3901 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
3902 BaseOp = &LdSt.getOperand(2);
3903 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
3905 OffsetIsScalable = Scale.isScalable();
3907 return BaseOp->isReg() || BaseOp->isFI();
3910 MachineOperand &
3911 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
3912 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3913 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
3914 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
3915 return OfsOp;
3918 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
3919 TypeSize &Width, int64_t &MinOffset,
3920 int64_t &MaxOffset) {
3921 switch (Opcode) {
3922 // Not a memory operation, or not one we want to handle.
3923 default:
3924 Scale = TypeSize::getFixed(0);
3925 Width = TypeSize::getFixed(0);
3926 MinOffset = MaxOffset = 0;
3927 return false;
3928 // LDR / STR
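// Scaled unsigned-offset forms: the immediate is in units of the access size,
// so e.g. LDRQui covers byte offsets 0 to 16 * 4095.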
3929 case AArch64::LDRQui:
3930 case AArch64::STRQui:
3931 Scale = TypeSize::getFixed(16);
3932 Width = TypeSize::getFixed(16);
3933 MinOffset = 0;
3934 MaxOffset = 4095;
3935 break;
3936 case AArch64::LDRXui:
3937 case AArch64::LDRDui:
3938 case AArch64::STRXui:
3939 case AArch64::STRDui:
3940 case AArch64::PRFMui:
3941 Scale = TypeSize::getFixed(8);
3942 Width = TypeSize::getFixed(8);
3943 MinOffset = 0;
3944 MaxOffset = 4095;
3945 break;
3946 case AArch64::LDRWui:
3947 case AArch64::LDRSui:
3948 case AArch64::LDRSWui:
3949 case AArch64::STRWui:
3950 case AArch64::STRSui:
3951 Scale = TypeSize::getFixed(4);
3952 Width = TypeSize::getFixed(4);
3953 MinOffset = 0;
3954 MaxOffset = 4095;
3955 break;
3956 case AArch64::LDRHui:
3957 case AArch64::LDRHHui:
3958 case AArch64::LDRSHWui:
3959 case AArch64::LDRSHXui:
3960 case AArch64::STRHui:
3961 case AArch64::STRHHui:
3962 Scale = TypeSize::getFixed(2);
3963 Width = TypeSize::getFixed(2);
3964 MinOffset = 0;
3965 MaxOffset = 4095;
3966 break;
3967 case AArch64::LDRBui:
3968 case AArch64::LDRBBui:
3969 case AArch64::LDRSBWui:
3970 case AArch64::LDRSBXui:
3971 case AArch64::STRBui:
3972 case AArch64::STRBBui:
3973 Scale = TypeSize::getFixed(1);
3974 Width = TypeSize::getFixed(1);
3975 MinOffset = 0;
3976 MaxOffset = 4095;
3977 break;
3978 // post/pre inc
3979 case AArch64::STRQpre:
3980 case AArch64::LDRQpost:
3981 Scale = TypeSize::getFixed(1);
3982 Width = TypeSize::getFixed(16);
3983 MinOffset = -256;
3984 MaxOffset = 255;
3985 break;
3986 case AArch64::LDRDpost:
3987 case AArch64::LDRDpre:
3988 case AArch64::LDRXpost:
3989 case AArch64::LDRXpre:
3990 case AArch64::STRDpost:
3991 case AArch64::STRDpre:
3992 case AArch64::STRXpost:
3993 case AArch64::STRXpre:
3994 Scale = TypeSize::getFixed(1);
3995 Width = TypeSize::getFixed(8);
3996 MinOffset = -256;
3997 MaxOffset = 255;
3998 break;
3999 case AArch64::STRWpost:
4000 case AArch64::STRWpre:
4001 case AArch64::LDRWpost:
4002 case AArch64::LDRWpre:
4003 case AArch64::STRSpost:
4004 case AArch64::STRSpre:
4005 case AArch64::LDRSpost:
4006 case AArch64::LDRSpre:
4007 Scale = TypeSize::getFixed(1);
4008 Width = TypeSize::getFixed(4);
4009 MinOffset = -256;
4010 MaxOffset = 255;
4011 break;
4012 case AArch64::LDRHpost:
4013 case AArch64::LDRHpre:
4014 case AArch64::STRHpost:
4015 case AArch64::STRHpre:
4016 case AArch64::LDRHHpost:
4017 case AArch64::LDRHHpre:
4018 case AArch64::STRHHpost:
4019 case AArch64::STRHHpre:
4020 Scale = TypeSize::getFixed(1);
4021 Width = TypeSize::getFixed(2);
4022 MinOffset = -256;
4023 MaxOffset = 255;
4024 break;
4025 case AArch64::LDRBpost:
4026 case AArch64::LDRBpre:
4027 case AArch64::STRBpost:
4028 case AArch64::STRBpre:
4029 case AArch64::LDRBBpost:
4030 case AArch64::LDRBBpre:
4031 case AArch64::STRBBpost:
4032 case AArch64::STRBBpre:
4033 Scale = TypeSize::getFixed(1);
4034 Width = TypeSize::getFixed(1);
4035 MinOffset = -256;
4036 MaxOffset = 255;
4037 break;
4038 // Unscaled
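// Unscaled forms take a signed 9-bit byte offset ([-256, 255]) regardless of
// the access size.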
4039 case AArch64::LDURQi:
4040 case AArch64::STURQi:
4041 Scale = TypeSize::getFixed(1);
4042 Width = TypeSize::getFixed(16);
4043 MinOffset = -256;
4044 MaxOffset = 255;
4045 break;
4046 case AArch64::LDURXi:
4047 case AArch64::LDURDi:
4048 case AArch64::LDAPURXi:
4049 case AArch64::STURXi:
4050 case AArch64::STURDi:
4051 case AArch64::STLURXi:
4052 case AArch64::PRFUMi:
4053 Scale = TypeSize::getFixed(1);
4054 Width = TypeSize::getFixed(8);
4055 MinOffset = -256;
4056 MaxOffset = 255;
4057 break;
4058 case AArch64::LDURWi:
4059 case AArch64::LDURSi:
4060 case AArch64::LDURSWi:
4061 case AArch64::LDAPURi:
4062 case AArch64::LDAPURSWi:
4063 case AArch64::STURWi:
4064 case AArch64::STURSi:
4065 case AArch64::STLURWi:
4066 Scale = TypeSize::getFixed(1);
4067 Width = TypeSize::getFixed(4);
4068 MinOffset = -256;
4069 MaxOffset = 255;
4070 break;
4071 case AArch64::LDURHi:
4072 case AArch64::LDURHHi:
4073 case AArch64::LDURSHXi:
4074 case AArch64::LDURSHWi:
4075 case AArch64::LDAPURHi:
4076 case AArch64::LDAPURSHWi:
4077 case AArch64::LDAPURSHXi:
4078 case AArch64::STURHi:
4079 case AArch64::STURHHi:
4080 case AArch64::STLURHi:
4081 Scale = TypeSize::getFixed(1);
4082 Width = TypeSize::getFixed(2);
4083 MinOffset = -256;
4084 MaxOffset = 255;
4085 break;
4086 case AArch64::LDURBi:
4087 case AArch64::LDURBBi:
4088 case AArch64::LDURSBXi:
4089 case AArch64::LDURSBWi:
4090 case AArch64::LDAPURBi:
4091 case AArch64::LDAPURSBWi:
4092 case AArch64::LDAPURSBXi:
4093 case AArch64::STURBi:
4094 case AArch64::STURBBi:
4095 case AArch64::STLURBi:
4096 Scale = TypeSize::getFixed(1);
4097 Width = TypeSize::getFixed(1);
4098 MinOffset = -256;
4099 MaxOffset = 255;
4100 break;
4101 // LDP / STP (including pre/post inc)
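// Paired forms use a signed 7-bit immediate scaled by the register size,
// e.g. LDPQi spans byte offsets -1024 to 1008 in steps of 16.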
4102 case AArch64::LDPQi:
4103 case AArch64::LDNPQi:
4104 case AArch64::STPQi:
4105 case AArch64::STNPQi:
4106 case AArch64::LDPQpost:
4107 case AArch64::LDPQpre:
4108 case AArch64::STPQpost:
4109 case AArch64::STPQpre:
4110 Scale = TypeSize::getFixed(16);
4111 Width = TypeSize::getFixed(16 * 2);
4112 MinOffset = -64;
4113 MaxOffset = 63;
4114 break;
4115 case AArch64::LDPXi:
4116 case AArch64::LDPDi:
4117 case AArch64::LDNPXi:
4118 case AArch64::LDNPDi:
4119 case AArch64::STPXi:
4120 case AArch64::STPDi:
4121 case AArch64::STNPXi:
4122 case AArch64::STNPDi:
4123 case AArch64::LDPDpost:
4124 case AArch64::LDPDpre:
4125 case AArch64::LDPXpost:
4126 case AArch64::LDPXpre:
4127 case AArch64::STPDpost:
4128 case AArch64::STPDpre:
4129 case AArch64::STPXpost:
4130 case AArch64::STPXpre:
4131 Scale = TypeSize::getFixed(8);
4132 Width = TypeSize::getFixed(8 * 2);
4133 MinOffset = -64;
4134 MaxOffset = 63;
4135 break;
4136 case AArch64::LDPWi:
4137 case AArch64::LDPSi:
4138 case AArch64::LDNPWi:
4139 case AArch64::LDNPSi:
4140 case AArch64::STPWi:
4141 case AArch64::STPSi:
4142 case AArch64::STNPWi:
4143 case AArch64::STNPSi:
4144 case AArch64::LDPSpost:
4145 case AArch64::LDPSpre:
4146 case AArch64::LDPWpost:
4147 case AArch64::LDPWpre:
4148 case AArch64::STPSpost:
4149 case AArch64::STPSpre:
4150 case AArch64::STPWpost:
4151 case AArch64::STPWpre:
4152 Scale = TypeSize::getFixed(4);
4153 Width = TypeSize::getFixed(4 * 2);
4154 MinOffset = -64;
4155 MaxOffset = 63;
4156 break;
4157 case AArch64::StoreSwiftAsyncContext:
4158 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4159 Scale = TypeSize::getFixed(1);
4160 Width = TypeSize::getFixed(8);
4161 MinOffset = 0;
4162 MaxOffset = 4095;
4163 break;
4164 case AArch64::ADDG:
4165 Scale = TypeSize::getFixed(16);
4166 Width = TypeSize::getFixed(0);
4167 MinOffset = 0;
4168 MaxOffset = 63;
4169 break;
4170 case AArch64::TAGPstack:
4171 Scale = TypeSize::getFixed(16);
4172 Width = TypeSize::getFixed(0);
4173 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4174 // of 63 (not 64!).
4175 MinOffset = -63;
4176 MaxOffset = 63;
4177 break;
4178 case AArch64::LDG:
4179 case AArch64::STGi:
4180 case AArch64::STGPreIndex:
4181 case AArch64::STGPostIndex:
4182 case AArch64::STZGi:
4183 case AArch64::STZGPreIndex:
4184 case AArch64::STZGPostIndex:
4185 Scale = TypeSize::getFixed(16);
4186 Width = TypeSize::getFixed(16);
4187 MinOffset = -256;
4188 MaxOffset = 255;
4189 break;
4190 // SVE
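// SVE vector/predicate fill-spill and structured load/store offsets are
// expressed in multiples of the vector length (`#imm, MUL VL`), hence the
// scalable Scale and Width values.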
4191 case AArch64::STR_ZZZZXI:
4192 case AArch64::LDR_ZZZZXI:
4193 Scale = TypeSize::getScalable(16);
4194 Width = TypeSize::getScalable(16 * 4);
4195 MinOffset = -256;
4196 MaxOffset = 252;
4197 break;
4198 case AArch64::STR_ZZZXI:
4199 case AArch64::LDR_ZZZXI:
4200 Scale = TypeSize::getScalable(16);
4201 Width = TypeSize::getScalable(16 * 3);
4202 MinOffset = -256;
4203 MaxOffset = 253;
4204 break;
4205 case AArch64::STR_ZZXI:
4206 case AArch64::LDR_ZZXI:
4207 Scale = TypeSize::getScalable(16);
4208 Width = TypeSize::getScalable(16 * 2);
4209 MinOffset = -256;
4210 MaxOffset = 254;
4211 break;
4212 case AArch64::LDR_PXI:
4213 case AArch64::STR_PXI:
4214 Scale = TypeSize::getScalable(2);
4215 Width = TypeSize::getScalable(2);
4216 MinOffset = -256;
4217 MaxOffset = 255;
4218 break;
4219 case AArch64::LDR_PPXI:
4220 case AArch64::STR_PPXI:
4221 Scale = TypeSize::getScalable(2);
4222 Width = TypeSize::getScalable(2 * 2);
4223 MinOffset = -256;
4224 MaxOffset = 254;
4225 break;
4226 case AArch64::LDR_ZXI:
4227 case AArch64::STR_ZXI:
4228 Scale = TypeSize::getScalable(16);
4229 Width = TypeSize::getScalable(16);
4230 MinOffset = -256;
4231 MaxOffset = 255;
4232 break;
4233 case AArch64::LD1B_IMM:
4234 case AArch64::LD1H_IMM:
4235 case AArch64::LD1W_IMM:
4236 case AArch64::LD1D_IMM:
4237 case AArch64::LDNT1B_ZRI:
4238 case AArch64::LDNT1H_ZRI:
4239 case AArch64::LDNT1W_ZRI:
4240 case AArch64::LDNT1D_ZRI:
4241 case AArch64::ST1B_IMM:
4242 case AArch64::ST1H_IMM:
4243 case AArch64::ST1W_IMM:
4244 case AArch64::ST1D_IMM:
4245 case AArch64::STNT1B_ZRI:
4246 case AArch64::STNT1H_ZRI:
4247 case AArch64::STNT1W_ZRI:
4248 case AArch64::STNT1D_ZRI:
4249 case AArch64::LDNF1B_IMM:
4250 case AArch64::LDNF1H_IMM:
4251 case AArch64::LDNF1W_IMM:
4252 case AArch64::LDNF1D_IMM:
4253 // A full vector's worth of data
4254 // Width = mbytes * elements
4255 Scale = TypeSize::getScalable(16);
4256 Width = TypeSize::getScalable(16);
4257 MinOffset = -8;
4258 MaxOffset = 7;
4259 break;
4260 case AArch64::LD2B_IMM:
4261 case AArch64::LD2H_IMM:
4262 case AArch64::LD2W_IMM:
4263 case AArch64::LD2D_IMM:
4264 case AArch64::ST2B_IMM:
4265 case AArch64::ST2H_IMM:
4266 case AArch64::ST2W_IMM:
4267 case AArch64::ST2D_IMM:
4268 Scale = TypeSize::getScalable(32);
4269 Width = TypeSize::getScalable(16 * 2);
4270 MinOffset = -8;
4271 MaxOffset = 7;
4272 break;
4273 case AArch64::LD3B_IMM:
4274 case AArch64::LD3H_IMM:
4275 case AArch64::LD3W_IMM:
4276 case AArch64::LD3D_IMM:
4277 case AArch64::ST3B_IMM:
4278 case AArch64::ST3H_IMM:
4279 case AArch64::ST3W_IMM:
4280 case AArch64::ST3D_IMM:
4281 Scale = TypeSize::getScalable(48);
4282 Width = TypeSize::getScalable(16 * 3);
4283 MinOffset = -8;
4284 MaxOffset = 7;
4285 break;
4286 case AArch64::LD4B_IMM:
4287 case AArch64::LD4H_IMM:
4288 case AArch64::LD4W_IMM:
4289 case AArch64::LD4D_IMM:
4290 case AArch64::ST4B_IMM:
4291 case AArch64::ST4H_IMM:
4292 case AArch64::ST4W_IMM:
4293 case AArch64::ST4D_IMM:
4294 Scale = TypeSize::getScalable(64);
4295 Width = TypeSize::getScalable(16 * 4);
4296 MinOffset = -8;
4297 MaxOffset = 7;
4298 break;
4299 case AArch64::LD1B_H_IMM:
4300 case AArch64::LD1SB_H_IMM:
4301 case AArch64::LD1H_S_IMM:
4302 case AArch64::LD1SH_S_IMM:
4303 case AArch64::LD1W_D_IMM:
4304 case AArch64::LD1SW_D_IMM:
4305 case AArch64::ST1B_H_IMM:
4306 case AArch64::ST1H_S_IMM:
4307 case AArch64::ST1W_D_IMM:
4308 case AArch64::LDNF1B_H_IMM:
4309 case AArch64::LDNF1SB_H_IMM:
4310 case AArch64::LDNF1H_S_IMM:
4311 case AArch64::LDNF1SH_S_IMM:
4312 case AArch64::LDNF1W_D_IMM:
4313 case AArch64::LDNF1SW_D_IMM:
4314 // A half vector's worth of data
4315 // Width = mbytes * elements
4316 Scale = TypeSize::getScalable(8);
4317 Width = TypeSize::getScalable(8);
4318 MinOffset = -8;
4319 MaxOffset = 7;
4320 break;
4321 case AArch64::LD1B_S_IMM:
4322 case AArch64::LD1SB_S_IMM:
4323 case AArch64::LD1H_D_IMM:
4324 case AArch64::LD1SH_D_IMM:
4325 case AArch64::ST1B_S_IMM:
4326 case AArch64::ST1H_D_IMM:
4327 case AArch64::LDNF1B_S_IMM:
4328 case AArch64::LDNF1SB_S_IMM:
4329 case AArch64::LDNF1H_D_IMM:
4330 case AArch64::LDNF1SH_D_IMM:
4331 // A quarter vector's worth of data
4332 // Width = mbytes * elements
4333 Scale = TypeSize::getScalable(4);
4334 Width = TypeSize::getScalable(4);
4335 MinOffset = -8;
4336 MaxOffset = 7;
4337 break;
4338 case AArch64::LD1B_D_IMM:
4339 case AArch64::LD1SB_D_IMM:
4340 case AArch64::ST1B_D_IMM:
4341 case AArch64::LDNF1B_D_IMM:
4342 case AArch64::LDNF1SB_D_IMM:
4343 // An eighth vector's worth of data
4344 // Width = mbytes * elements
4345 Scale = TypeSize::getScalable(2);
4346 Width = TypeSize::getScalable(2);
4347 MinOffset = -8;
4348 MaxOffset = 7;
4349 break;
4350 case AArch64::ST2Gi:
4351 case AArch64::ST2GPreIndex:
4352 case AArch64::ST2GPostIndex:
4353 case AArch64::STZ2Gi:
4354 case AArch64::STZ2GPreIndex:
4355 case AArch64::STZ2GPostIndex:
4356 Scale = TypeSize::getFixed(16);
4357 Width = TypeSize::getFixed(32);
4358 MinOffset = -256;
4359 MaxOffset = 255;
4360 break;
4361 case AArch64::STGPi:
4362 case AArch64::STGPpost:
4363 case AArch64::STGPpre:
4364 Scale = TypeSize::getFixed(16);
4365 Width = TypeSize::getFixed(16);
4366 MinOffset = -64;
4367 MaxOffset = 63;
4368 break;
4369 case AArch64::LD1RB_IMM:
4370 case AArch64::LD1RB_H_IMM:
4371 case AArch64::LD1RB_S_IMM:
4372 case AArch64::LD1RB_D_IMM:
4373 case AArch64::LD1RSB_H_IMM:
4374 case AArch64::LD1RSB_S_IMM:
4375 case AArch64::LD1RSB_D_IMM:
4376 Scale = TypeSize::getFixed(1);
4377 Width = TypeSize::getFixed(1);
4378 MinOffset = 0;
4379 MaxOffset = 63;
4380 break;
4381 case AArch64::LD1RH_IMM:
4382 case AArch64::LD1RH_S_IMM:
4383 case AArch64::LD1RH_D_IMM:
4384 case AArch64::LD1RSH_S_IMM:
4385 case AArch64::LD1RSH_D_IMM:
4386 Scale = TypeSize::getFixed(2);
4387 Width = TypeSize::getFixed(2);
4388 MinOffset = 0;
4389 MaxOffset = 63;
4390 break;
4391 case AArch64::LD1RW_IMM:
4392 case AArch64::LD1RW_D_IMM:
4393 case AArch64::LD1RSW_IMM:
4394 Scale = TypeSize::getFixed(4);
4395 Width = TypeSize::getFixed(4);
4396 MinOffset = 0;
4397 MaxOffset = 63;
4398 break;
4399 case AArch64::LD1RD_IMM:
4400 Scale = TypeSize::getFixed(8);
4401 Width = TypeSize::getFixed(8);
4402 MinOffset = 0;
4403 MaxOffset = 63;
4404 break;
4407 return true;
4410 // Scaling factor for unscaled load or store.
4411 int AArch64InstrInfo::getMemScale(unsigned Opc) {
4412 switch (Opc) {
4413 default:
4414 llvm_unreachable("Opcode has unknown scale!");
4415 case AArch64::LDRBBui:
4416 case AArch64::LDURBBi:
4417 case AArch64::LDRSBWui:
4418 case AArch64::LDURSBWi:
4419 case AArch64::STRBBui:
4420 case AArch64::STURBBi:
4421 return 1;
4422 case AArch64::LDRHHui:
4423 case AArch64::LDURHHi:
4424 case AArch64::LDRSHWui:
4425 case AArch64::LDURSHWi:
4426 case AArch64::STRHHui:
4427 case AArch64::STURHHi:
4428 return 2;
4429 case AArch64::LDRSui:
4430 case AArch64::LDURSi:
4431 case AArch64::LDRSpre:
4432 case AArch64::LDRSWui:
4433 case AArch64::LDURSWi:
4434 case AArch64::LDRSWpre:
4435 case AArch64::LDRWpre:
4436 case AArch64::LDRWui:
4437 case AArch64::LDURWi:
4438 case AArch64::STRSui:
4439 case AArch64::STURSi:
4440 case AArch64::STRSpre:
4441 case AArch64::STRWui:
4442 case AArch64::STURWi:
4443 case AArch64::STRWpre:
4444 case AArch64::LDPSi:
4445 case AArch64::LDPSWi:
4446 case AArch64::LDPWi:
4447 case AArch64::STPSi:
4448 case AArch64::STPWi:
4449 return 4;
4450 case AArch64::LDRDui:
4451 case AArch64::LDURDi:
4452 case AArch64::LDRDpre:
4453 case AArch64::LDRXui:
4454 case AArch64::LDURXi:
4455 case AArch64::LDRXpre:
4456 case AArch64::STRDui:
4457 case AArch64::STURDi:
4458 case AArch64::STRDpre:
4459 case AArch64::STRXui:
4460 case AArch64::STURXi:
4461 case AArch64::STRXpre:
4462 case AArch64::LDPDi:
4463 case AArch64::LDPXi:
4464 case AArch64::STPDi:
4465 case AArch64::STPXi:
4466 return 8;
4467 case AArch64::LDRQui:
4468 case AArch64::LDURQi:
4469 case AArch64::STRQui:
4470 case AArch64::STURQi:
4471 case AArch64::STRQpre:
4472 case AArch64::LDPQi:
4473 case AArch64::LDRQpre:
4474 case AArch64::STPQi:
4475 case AArch64::STGi:
4476 case AArch64::STZGi:
4477 case AArch64::ST2Gi:
4478 case AArch64::STZ2Gi:
4479 case AArch64::STGPi:
4480 return 16;
4484 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
4485 switch (MI.getOpcode()) {
4486 default:
4487 return false;
4488 case AArch64::LDRWpre:
4489 case AArch64::LDRXpre:
4490 case AArch64::LDRSWpre:
4491 case AArch64::LDRSpre:
4492 case AArch64::LDRDpre:
4493 case AArch64::LDRQpre:
4494 return true;
4498 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
4499 switch (MI.getOpcode()) {
4500 default:
4501 return false;
4502 case AArch64::STRWpre:
4503 case AArch64::STRXpre:
4504 case AArch64::STRSpre:
4505 case AArch64::STRDpre:
4506 case AArch64::STRQpre:
4507 return true;
4511 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
4512 return isPreLd(MI) || isPreSt(MI);
4515 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
4516 switch (MI.getOpcode()) {
4517 default:
4518 return false;
4519 case AArch64::LDPSi:
4520 case AArch64::LDPSWi:
4521 case AArch64::LDPDi:
4522 case AArch64::LDPQi:
4523 case AArch64::LDPWi:
4524 case AArch64::LDPXi:
4525 case AArch64::STPSi:
4526 case AArch64::STPDi:
4527 case AArch64::STPQi:
4528 case AArch64::STPWi:
4529 case AArch64::STPXi:
4530 case AArch64::STGPi:
4531 return true;
4535 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
4536 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
4537 unsigned Idx =
4538 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
4539 : 1;
4540 return MI.getOperand(Idx);
4543 const MachineOperand &
4544 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
4545 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
4546 unsigned Idx =
4547 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
4548 : 2;
4549 return MI.getOperand(Idx);
4552 const MachineOperand &
4553 AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
4554 switch (MI.getOpcode()) {
4555 default:
4556 llvm_unreachable("Unexpected opcode");
4557 case AArch64::LDRBroX:
4558 case AArch64::LDRBBroX:
4559 case AArch64::LDRSBXroX:
4560 case AArch64::LDRSBWroX:
4561 case AArch64::LDRHroX:
4562 case AArch64::LDRHHroX:
4563 case AArch64::LDRSHXroX:
4564 case AArch64::LDRSHWroX:
4565 case AArch64::LDRWroX:
4566 case AArch64::LDRSroX:
4567 case AArch64::LDRSWroX:
4568 case AArch64::LDRDroX:
4569 case AArch64::LDRXroX:
4570 case AArch64::LDRQroX:
4571 return MI.getOperand(4);
4575 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
4576 Register Reg) {
4577 if (MI.getParent() == nullptr)
4578 return nullptr;
4579 const MachineFunction *MF = MI.getParent()->getParent();
4580 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
4583 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
4584 auto IsHFPR = [&](const MachineOperand &Op) {
4585 if (!Op.isReg())
4586 return false;
4587 auto Reg = Op.getReg();
4588 if (Reg.isPhysical())
4589 return AArch64::FPR16RegClass.contains(Reg);
4590 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4591 return TRC == &AArch64::FPR16RegClass ||
4592 TRC == &AArch64::FPR16_loRegClass;
4594 return llvm::any_of(MI.operands(), IsHFPR);
4597 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
4598 auto IsQFPR = [&](const MachineOperand &Op) {
4599 if (!Op.isReg())
4600 return false;
4601 auto Reg = Op.getReg();
4602 if (Reg.isPhysical())
4603 return AArch64::FPR128RegClass.contains(Reg);
4604 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4605 return TRC == &AArch64::FPR128RegClass ||
4606 TRC == &AArch64::FPR128_loRegClass;
4608 return llvm::any_of(MI.operands(), IsQFPR);
4611 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
4612 switch (MI.getOpcode()) {
4613 case AArch64::BRK:
4614 case AArch64::HLT:
4615 case AArch64::PACIASP:
4616 case AArch64::PACIBSP:
4617 // Implicit BTI behavior.
4618 return true;
4619 case AArch64::PAUTH_PROLOGUE:
4620 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
4621 return true;
4622 case AArch64::HINT: {
4623 unsigned Imm = MI.getOperand(0).getImm();
4624 // Explicit BTI instruction.
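// (HINT #32/#34/#36/#38 encode BTI, BTI c, BTI j and BTI jc respectively.)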
4625 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
4626 return true;
4627 // PACI(A|B)SP instructions.
4628 if (Imm == 25 || Imm == 27)
4629 return true;
4630 return false;
4632 default:
4633 return false;
4637 bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
4638 if (Reg == 0)
4639 return false;
4640 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
4641 return AArch64::FPR128RegClass.contains(Reg) ||
4642 AArch64::FPR64RegClass.contains(Reg) ||
4643 AArch64::FPR32RegClass.contains(Reg) ||
4644 AArch64::FPR16RegClass.contains(Reg) ||
4645 AArch64::FPR8RegClass.contains(Reg);
4648 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
4649 auto IsFPR = [&](const MachineOperand &Op) {
4650 if (!Op.isReg())
4651 return false;
4652 auto Reg = Op.getReg();
4653 if (Reg.isPhysical())
4654 return isFpOrNEON(Reg);
4656 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4657 return TRC == &AArch64::FPR128RegClass ||
4658 TRC == &AArch64::FPR128_loRegClass ||
4659 TRC == &AArch64::FPR64RegClass ||
4660 TRC == &AArch64::FPR64_loRegClass ||
4661 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
4662 TRC == &AArch64::FPR8RegClass;
4664 return llvm::any_of(MI.operands(), IsFPR);
4667 // Scale the unscaled offset. Returns false if the unscaled offset can't be
4668 // scaled.
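// For example, an LDURXi byte offset of 24 becomes an element offset of 3,
// while an offset of 20 is rejected because it is not a multiple of 8.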
4669 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
4670 int Scale = AArch64InstrInfo::getMemScale(Opc);
4672 // If the byte-offset isn't a multiple of the stride, we can't scale this
4673 // offset.
4674 if (Offset % Scale != 0)
4675 return false;
4677 // Convert the byte-offset used by unscaled into an "element" offset used
4678 // by the scaled pair load/store instructions.
4679 Offset /= Scale;
4680 return true;
4683 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
4684 if (FirstOpc == SecondOpc)
4685 return true;
4686 // We can also pair sign-ext and zero-ext instructions.
4687 switch (FirstOpc) {
4688 default:
4689 return false;
4690 case AArch64::STRSui:
4691 case AArch64::STURSi:
4692 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
4693 case AArch64::STRDui:
4694 case AArch64::STURDi:
4695 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
4696 case AArch64::STRQui:
4697 case AArch64::STURQi:
4698 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
4699 case AArch64::STRWui:
4700 case AArch64::STURWi:
4701 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
4702 case AArch64::STRXui:
4703 case AArch64::STURXi:
4704 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
4705 case AArch64::LDRSui:
4706 case AArch64::LDURSi:
4707 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
4708 case AArch64::LDRDui:
4709 case AArch64::LDURDi:
4710 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
4711 case AArch64::LDRQui:
4712 case AArch64::LDURQi:
4713 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
4714 case AArch64::LDRWui:
4715 case AArch64::LDURWi:
4716 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
4717 case AArch64::LDRSWui:
4718 case AArch64::LDURSWi:
4719 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
4720 case AArch64::LDRXui:
4721 case AArch64::LDURXi:
4722 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
4724 // These instructions can't be paired based on their opcodes.
4725 return false;
4728 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
4729 int64_t Offset1, unsigned Opcode1, int FI2,
4730 int64_t Offset2, unsigned Opcode2) {
4731 // Accesses through fixed stack object frame indices may access a different
4732 // fixed stack slot. Check that the object offsets + offsets match.
4733 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
4734 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
4735 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
4736 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
4737 // Convert to scaled object offsets.
4738 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
4739 if (ObjectOffset1 % Scale1 != 0)
4740 return false;
4741 ObjectOffset1 /= Scale1;
4742 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
4743 if (ObjectOffset2 % Scale2 != 0)
4744 return false;
4745 ObjectOffset2 /= Scale2;
4746 ObjectOffset1 += Offset1;
4747 ObjectOffset2 += Offset2;
4748 return ObjectOffset1 + 1 == ObjectOffset2;
4751 return FI1 == FI2;
4754 /// Detect opportunities for ldp/stp formation.
4756 /// Only called for LdSt for which getMemOperandWithOffset returns true.
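/// For example, `ldr x0, [x1, #8]` followed by `ldr x2, [x1, #16]` has scaled
/// offsets 1 and 2 and, barring volatiles or pairing hints, is reported as
/// clusterable, making it a candidate for a later `ldp x0, x2, [x1, #8]`.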
4757 bool AArch64InstrInfo::shouldClusterMemOps(
4758 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
4759 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
4760 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
4761 unsigned NumBytes) const {
4762 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
4763 const MachineOperand &BaseOp1 = *BaseOps1.front();
4764 const MachineOperand &BaseOp2 = *BaseOps2.front();
4765 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
4766 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
4767 if (BaseOp1.getType() != BaseOp2.getType())
4768 return false;
4770 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
4771 "Only base registers and frame indices are supported.");
4773 // Check for both base regs and base FI.
4774 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
4775 return false;
4777 // Only cluster up to a single pair.
4778 if (ClusterSize > 2)
4779 return false;
4781 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
4782 return false;
4784 // Can we pair these instructions based on their opcodes?
4785 unsigned FirstOpc = FirstLdSt.getOpcode();
4786 unsigned SecondOpc = SecondLdSt.getOpcode();
4787 if (!canPairLdStOpc(FirstOpc, SecondOpc))
4788 return false;
4790 // Can't merge volatiles or load/stores that have a hint to avoid pair
4791 // formation, for example.
4792 if (!isCandidateToMergeOrPair(FirstLdSt) ||
4793 !isCandidateToMergeOrPair(SecondLdSt))
4794 return false;
4796 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
4797 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
4798 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
4799 return false;
4801 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
4802 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
4803 return false;
4805 // Pairwise instructions have a 7-bit signed offset field.
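// For LDPXi, for instance, this corresponds to byte offsets from -512 to 504.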
4806 if (Offset1 > 63 || Offset1 < -64)
4807 return false;
4809 // The caller should already have ordered FirstLdSt/SecondLdSt by offset,
4810 // except when the base operands are non-equal frame indices.
4811 if (BaseOp1.isFI()) {
4812 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
4813 "Caller should have ordered offsets.");
4815 const MachineFrameInfo &MFI =
4816 FirstLdSt.getParent()->getParent()->getFrameInfo();
4817 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
4818 BaseOp2.getIndex(), Offset2, SecondOpc);
4821 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
4823 return Offset1 + 1 == Offset2;
4826 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
4827 MCRegister Reg, unsigned SubIdx,
4828 unsigned State,
4829 const TargetRegisterInfo *TRI) {
4830 if (!SubIdx)
4831 return MIB.addReg(Reg, State);
4833 if (Reg.isPhysical())
4834 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
4835 return MIB.addReg(Reg, State, SubIdx);
4838 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
4839 unsigned NumRegs) {
4840 // We really want the positive remainder mod 32 here, which happens to be
4841 // easily obtainable with a mask.
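// For example, copying the tuple {D30, D31} to {D31, D0}: (31 - 30) & 0x1f is
// 1, which is < 2, so the caller copies the sub-registers in reverse order to
// avoid overwriting D31 before it has been read.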
4842 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
4845 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
4846 MachineBasicBlock::iterator I,
4847 const DebugLoc &DL, MCRegister DestReg,
4848 MCRegister SrcReg, bool KillSrc,
4849 unsigned Opcode,
4850 ArrayRef<unsigned> Indices) const {
4851 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
4852 const TargetRegisterInfo *TRI = &getRegisterInfo();
4853 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4854 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4855 unsigned NumRegs = Indices.size();
4857 int SubReg = 0, End = NumRegs, Incr = 1;
4858 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
4859 SubReg = NumRegs - 1;
4860 End = -1;
4861 Incr = -1;
4864 for (; SubReg != End; SubReg += Incr) {
4865 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4866 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4867 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
4868 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4872 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
4873 MachineBasicBlock::iterator I,
4874 const DebugLoc &DL, MCRegister DestReg,
4875 MCRegister SrcReg, bool KillSrc,
4876 unsigned Opcode, unsigned ZeroReg,
4877 llvm::ArrayRef<unsigned> Indices) const {
4878 const TargetRegisterInfo *TRI = &getRegisterInfo();
4879 unsigned NumRegs = Indices.size();
4881 #ifndef NDEBUG
4882 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4883 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4884 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
4885 "GPR reg sequences should not be able to overlap");
4886 #endif
4888 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
4889 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4890 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4891 MIB.addReg(ZeroReg);
4892 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4893 MIB.addImm(0);
4897 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
4898 MachineBasicBlock::iterator I,
4899 const DebugLoc &DL, MCRegister DestReg,
4900 MCRegister SrcReg, bool KillSrc,
4901 bool RenamableDest,
4902 bool RenamableSrc) const {
4903 if (AArch64::GPR32spRegClass.contains(DestReg) &&
4904 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
4905 const TargetRegisterInfo *TRI = &getRegisterInfo();
4907 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
4908 // If either operand is WSP, expand to ADD #0.
4909 if (Subtarget.hasZeroCycleRegMove()) {
4910 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
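// In that case the copy is emitted on the matching 64-bit super-registers,
// e.g. a copy of w1 into wsp becomes `add sp, x1, #0`.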
4911 MCRegister DestRegX = TRI->getMatchingSuperReg(
4912 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4913 MCRegister SrcRegX = TRI->getMatchingSuperReg(
4914 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4915 // This instruction is reading and writing X registers. This may upset
4916 // the register scavenger and machine verifier, so we need to indicate
4917 // that we are reading an undefined value from SrcRegX, but a proper
4918 // value from SrcReg.
4919 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
4920 .addReg(SrcRegX, RegState::Undef)
4921 .addImm(0)
4922 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
4923 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4924 } else {
4925 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
4926 .addReg(SrcReg, getKillRegState(KillSrc))
4927 .addImm(0)
4928 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4930 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
4931 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
4932 .addImm(0)
4933 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4934 } else {
4935 if (Subtarget.hasZeroCycleRegMove()) {
4936 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
4937 MCRegister DestRegX = TRI->getMatchingSuperReg(
4938 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4939 MCRegister SrcRegX = TRI->getMatchingSuperReg(
4940 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4941 // This instruction is reading and writing X registers. This may upset
4942 // the register scavenger and machine verifier, so we need to indicate
4943 // that we are reading an undefined value from SrcRegX, but a proper
4944 // value from SrcReg.
4945 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
4946 .addReg(AArch64::XZR)
4947 .addReg(SrcRegX, RegState::Undef)
4948 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4949 } else {
4950 // Otherwise, expand to ORR WZR.
4951 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
4952 .addReg(AArch64::WZR)
4953 .addReg(SrcReg, getKillRegState(KillSrc));
4956 return;
4959 // Copy a Predicate register by ORRing with itself.
4960 if (AArch64::PPRRegClass.contains(DestReg) &&
4961 AArch64::PPRRegClass.contains(SrcReg)) {
4962 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4963 "Unexpected SVE register.");
4964 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
4965 .addReg(SrcReg) // Pg
4966 .addReg(SrcReg)
4967 .addReg(SrcReg, getKillRegState(KillSrc));
4968 return;
4971 // Copy a predicate-as-counter register by ORRing with itself as if it
4972 // were a regular predicate (mask) register.
4973 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
4974 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
4975 if (DestIsPNR || SrcIsPNR) {
4976 auto ToPPR = [](MCRegister R) -> MCRegister {
4977 return (R - AArch64::PN0) + AArch64::P0;
4979 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg;
4980 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg;
4982 if (PPRSrcReg != PPRDestReg) {
4983 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
4984 .addReg(PPRSrcReg) // Pg
4985 .addReg(PPRSrcReg)
4986 .addReg(PPRSrcReg, getKillRegState(KillSrc));
4987 if (DestIsPNR)
4988 NewMI.addDef(DestReg, RegState::Implicit);
4990 return;
4993 // Copy a Z register by ORRing with itself.
4994 if (AArch64::ZPRRegClass.contains(DestReg) &&
4995 AArch64::ZPRRegClass.contains(SrcReg)) {
4996 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4997 "Unexpected SVE register.");
4998 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
4999 .addReg(SrcReg)
5000 .addReg(SrcReg, getKillRegState(KillSrc));
5001 return;
5004 // Copy a Z register pair by copying the individual sub-registers.
5005 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
5006 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
5007 (AArch64::ZPR2RegClass.contains(SrcReg) ||
5008 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
5009 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5010 "Unexpected SVE register.");
5011 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
5012 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5013 Indices);
5014 return;
5017 // Copy a Z register triple by copying the individual sub-registers.
5018 if (AArch64::ZPR3RegClass.contains(DestReg) &&
5019 AArch64::ZPR3RegClass.contains(SrcReg)) {
5020 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5021 "Unexpected SVE register.");
5022 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5023 AArch64::zsub2};
5024 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5025 Indices);
5026 return;
5029 // Copy a Z register quad by copying the individual sub-registers.
5030 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
5031 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
5032 (AArch64::ZPR4RegClass.contains(SrcReg) ||
5033 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
5034 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5035 "Unexpected SVE register.");
5036 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5037 AArch64::zsub2, AArch64::zsub3};
5038 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5039 Indices);
5040 return;
5043 if (AArch64::GPR64spRegClass.contains(DestReg) &&
5044 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
5045 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
5046 // If either operand is SP, expand to ADD #0.
5047 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
5048 .addReg(SrcReg, getKillRegState(KillSrc))
5049 .addImm(0)
5050 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5051 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
5052 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
5053 .addImm(0)
5054 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5055 } else {
5056 // Otherwise, expand to ORR XZR.
5057 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5058 .addReg(AArch64::XZR)
5059 .addReg(SrcReg, getKillRegState(KillSrc));
5061 return;
5064 // Copy a DDDD register quad by copying the individual sub-registers.
5065 if (AArch64::DDDDRegClass.contains(DestReg) &&
5066 AArch64::DDDDRegClass.contains(SrcReg)) {
5067 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5068 AArch64::dsub2, AArch64::dsub3};
5069 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5070 Indices);
5071 return;
5074 // Copy a DDD register triple by copying the individual sub-registers.
5075 if (AArch64::DDDRegClass.contains(DestReg) &&
5076 AArch64::DDDRegClass.contains(SrcReg)) {
5077 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5078 AArch64::dsub2};
5079 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5080 Indices);
5081 return;
5084 // Copy a DD register pair by copying the individual sub-registers.
5085 if (AArch64::DDRegClass.contains(DestReg) &&
5086 AArch64::DDRegClass.contains(SrcReg)) {
5087 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
5088 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5089 Indices);
5090 return;
5093 // Copy a QQQQ register quad by copying the individual sub-registers.
5094 if (AArch64::QQQQRegClass.contains(DestReg) &&
5095 AArch64::QQQQRegClass.contains(SrcReg)) {
5096 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5097 AArch64::qsub2, AArch64::qsub3};
5098 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5099 Indices);
5100 return;
5103 // Copy a QQQ register triple by copying the individual sub-registers.
5104 if (AArch64::QQQRegClass.contains(DestReg) &&
5105 AArch64::QQQRegClass.contains(SrcReg)) {
5106 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5107 AArch64::qsub2};
5108 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5109 Indices);
5110 return;
5113 // Copy a QQ register pair by copying the individual sub-registers.
5114 if (AArch64::QQRegClass.contains(DestReg) &&
5115 AArch64::QQRegClass.contains(SrcReg)) {
5116 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
5117 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5118 Indices);
5119 return;
5122 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
5123 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
5124 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
5125 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
5126 AArch64::XZR, Indices);
5127 return;
5130 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
5131 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
5132 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
5133 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
5134 AArch64::WZR, Indices);
5135 return;
5138 if (AArch64::FPR128RegClass.contains(DestReg) &&
5139 AArch64::FPR128RegClass.contains(SrcReg)) {
5140 if (Subtarget.isSVEorStreamingSVEAvailable() &&
5141 !Subtarget.isNeonAvailable())
5142 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
5143 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
5144 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
5145 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
5146 else if (Subtarget.isNeonAvailable())
5147 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
5148 .addReg(SrcReg)
5149 .addReg(SrcReg, getKillRegState(KillSrc));
5150 else {
5151 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
5152 .addReg(AArch64::SP, RegState::Define)
5153 .addReg(SrcReg, getKillRegState(KillSrc))
5154 .addReg(AArch64::SP)
5155 .addImm(-16);
5156 BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
5157 .addReg(AArch64::SP, RegState::Define)
5158 .addReg(DestReg, RegState::Define)
5159 .addReg(AArch64::SP)
5160 .addImm(16);
5162 return;
5165 if (AArch64::FPR64RegClass.contains(DestReg) &&
5166 AArch64::FPR64RegClass.contains(SrcReg)) {
5167 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
5168 .addReg(SrcReg, getKillRegState(KillSrc));
5169 return;
5172 if (AArch64::FPR32RegClass.contains(DestReg) &&
5173 AArch64::FPR32RegClass.contains(SrcReg)) {
5174 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5175 .addReg(SrcReg, getKillRegState(KillSrc));
5176 return;
5179 if (AArch64::FPR16RegClass.contains(DestReg) &&
5180 AArch64::FPR16RegClass.contains(SrcReg)) {
5181 DestReg =
5182 RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
5183 SrcReg =
5184 RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
5185 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5186 .addReg(SrcReg, getKillRegState(KillSrc));
5187 return;
5190 if (AArch64::FPR8RegClass.contains(DestReg) &&
5191 AArch64::FPR8RegClass.contains(SrcReg)) {
5192 DestReg =
5193 RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
5194 SrcReg =
5195 RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
5196 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5197 .addReg(SrcReg, getKillRegState(KillSrc));
5198 return;
5201 // Copies between GPR64 and FPR64.
5202 if (AArch64::FPR64RegClass.contains(DestReg) &&
5203 AArch64::GPR64RegClass.contains(SrcReg)) {
5204 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
5205 .addReg(SrcReg, getKillRegState(KillSrc));
5206 return;
5208 if (AArch64::GPR64RegClass.contains(DestReg) &&
5209 AArch64::FPR64RegClass.contains(SrcReg)) {
5210 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
5211 .addReg(SrcReg, getKillRegState(KillSrc));
5212 return;
5214 // Copies between GPR32 and FPR32.
5215 if (AArch64::FPR32RegClass.contains(DestReg) &&
5216 AArch64::GPR32RegClass.contains(SrcReg)) {
5217 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
5218 .addReg(SrcReg, getKillRegState(KillSrc));
5219 return;
5221 if (AArch64::GPR32RegClass.contains(DestReg) &&
5222 AArch64::FPR32RegClass.contains(SrcReg)) {
5223 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
5224 .addReg(SrcReg, getKillRegState(KillSrc));
5225 return;
5228 if (DestReg == AArch64::NZCV) {
5229 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
5230 BuildMI(MBB, I, DL, get(AArch64::MSR))
5231 .addImm(AArch64SysReg::NZCV)
5232 .addReg(SrcReg, getKillRegState(KillSrc))
5233 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
5234 return;
5237 if (SrcReg == AArch64::NZCV) {
5238 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
5239 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
5240 .addImm(AArch64SysReg::NZCV)
5241 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
5242 return;
5245 #ifndef NDEBUG
5246 const TargetRegisterInfo &TRI = getRegisterInfo();
5247 errs() << TRI.getRegAsmName(DestReg) << " = COPY "
5248 << TRI.getRegAsmName(SrcReg) << "\n";
5249 #endif
5250 llvm_unreachable("unimplemented reg-to-reg copy");
5253 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
5254 MachineBasicBlock &MBB,
5255 MachineBasicBlock::iterator InsertBefore,
5256 const MCInstrDesc &MCID,
5257 Register SrcReg, bool IsKill,
5258 unsigned SubIdx0, unsigned SubIdx1, int FI,
5259 MachineMemOperand *MMO) {
5260 Register SrcReg0 = SrcReg;
5261 Register SrcReg1 = SrcReg;
5262 if (SrcReg.isPhysical()) {
5263 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
5264 SubIdx0 = 0;
5265 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
5266 SubIdx1 = 0;
5268 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
5269 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
5270 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
5271 .addFrameIndex(FI)
5272 .addImm(0)
5273 .addMemOperand(MMO);
5276 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
5277 MachineBasicBlock::iterator MBBI,
5278 Register SrcReg, bool isKill, int FI,
5279 const TargetRegisterClass *RC,
5280 const TargetRegisterInfo *TRI,
5281 Register VReg,
5282 MachineInstr::MIFlag Flags) const {
5283 MachineFunction &MF = *MBB.getParent();
5284 MachineFrameInfo &MFI = MF.getFrameInfo();
5286 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
5287 MachineMemOperand *MMO =
5288 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
5289 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
5290 unsigned Opc = 0;
5291 bool Offset = true;
5292 MCRegister PNRReg = MCRegister::NoRegister;
5293 unsigned StackID = TargetStackID::Default;
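// For example, a GPR64 spill below selects STRXui, an FPR128 spill selects
// STRQui, and a ZPR spill selects STR_ZXI and additionally marks the frame
// index as a ScalableVector stack object.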
5294 switch (TRI->getSpillSize(*RC)) {
5295 case 1:
5296 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
5297 Opc = AArch64::STRBui;
5298 break;
5299 case 2: {
5300 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
5301 Opc = AArch64::STRHui;
5302 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
5303 AArch64::PPRRegClass.hasSubClassEq(RC)) {
5304 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5305 "Unexpected register store without SVE store instructions");
5306 Opc = AArch64::STR_PXI;
5307 StackID = TargetStackID::ScalableVector;
5309 break;
5311 case 4:
5312 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5313 Opc = AArch64::STRWui;
5314 if (SrcReg.isVirtual())
5315 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
5316 else
5317 assert(SrcReg != AArch64::WSP);
5318 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5319 Opc = AArch64::STRSui;
5320 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5321 Opc = AArch64::STR_PPXI;
5322 StackID = TargetStackID::ScalableVector;
5324 break;
5325 case 8:
5326 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5327 Opc = AArch64::STRXui;
5328 if (SrcReg.isVirtual())
5329 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5330 else
5331 assert(SrcReg != AArch64::SP);
5332 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5333 Opc = AArch64::STRDui;
5334 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5335 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
5336 get(AArch64::STPWi), SrcReg, isKill,
5337 AArch64::sube32, AArch64::subo32, FI, MMO);
5338 return;
5340 break;
5341 case 16:
5342 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5343 Opc = AArch64::STRQui;
5344 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5345 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5346 Opc = AArch64::ST1Twov1d;
5347 Offset = false;
5348 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5349 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
5350 get(AArch64::STPXi), SrcReg, isKill,
5351 AArch64::sube64, AArch64::subo64, FI, MMO);
5352 return;
5353 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5354 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5355 "Unexpected register store without SVE store instructions");
5356 Opc = AArch64::STR_ZXI;
5357 StackID = TargetStackID::ScalableVector;
5359 break;
5360 case 24:
5361 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5362 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5363 Opc = AArch64::ST1Threev1d;
5364 Offset = false;
5366 break;
5367 case 32:
5368 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5369 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5370 Opc = AArch64::ST1Fourv1d;
5371 Offset = false;
5372 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5373 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5374 Opc = AArch64::ST1Twov2d;
5375 Offset = false;
5376 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
5377 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5378 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5379 "Unexpected register store without SVE store instructions");
5380 Opc = AArch64::STR_ZZXI;
5381 StackID = TargetStackID::ScalableVector;
5383 break;
5384 case 48:
5385 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5386 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5387 Opc = AArch64::ST1Threev2d;
5388 Offset = false;
5389 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5390 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5391 "Unexpected register store without SVE store instructions");
5392 Opc = AArch64::STR_ZZZXI;
5393 StackID = TargetStackID::ScalableVector;
5395 break;
5396 case 64:
5397 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5398 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5399 Opc = AArch64::ST1Fourv2d;
5400 Offset = false;
5401 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
5402 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5403 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5404 "Unexpected register store without SVE store instructions");
5405 Opc = AArch64::STR_ZZZZXI;
5406 StackID = TargetStackID::ScalableVector;
5408 break;
5410 assert(Opc && "Unknown register class");
5411 MFI.setStackID(FI, StackID);
5413 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5414 .addReg(SrcReg, getKillRegState(isKill))
5415 .addFrameIndex(FI);
5417 if (Offset)
5418 MI.addImm(0);
5419 if (PNRReg.isValid())
5420 MI.addDef(PNRReg, RegState::Implicit);
5421 MI.addMemOperand(MMO);
5424 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
5425 MachineBasicBlock &MBB,
5426 MachineBasicBlock::iterator InsertBefore,
5427 const MCInstrDesc &MCID,
5428 Register DestReg, unsigned SubIdx0,
5429 unsigned SubIdx1, int FI,
5430 MachineMemOperand *MMO) {
5431 Register DestReg0 = DestReg;
5432 Register DestReg1 = DestReg;
5433 bool IsUndef = true;
5434 if (DestReg.isPhysical()) {
5435 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
5436 SubIdx0 = 0;
5437 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
5438 SubIdx1 = 0;
5439 IsUndef = false;
5441 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
5442 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
5443 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
5444 .addFrameIndex(FI)
5445 .addImm(0)
5446 .addMemOperand(MMO);
5449 void AArch64InstrInfo::loadRegFromStackSlot(
5450 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
5451 int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
5452 Register VReg, MachineInstr::MIFlag Flags) const {
5453 MachineFunction &MF = *MBB.getParent();
5454 MachineFrameInfo &MFI = MF.getFrameInfo();
5455 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
5456 MachineMemOperand *MMO =
5457 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
5458 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
5460 unsigned Opc = 0;
5461 bool Offset = true;
5462 unsigned StackID = TargetStackID::Default;
5463 Register PNRReg = MCRegister::NoRegister;
5464 switch (TRI->getSpillSize(*RC)) {
5465 case 1:
5466 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
5467 Opc = AArch64::LDRBui;
5468 break;
5469 case 2: {
5470 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
5471 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
5472 Opc = AArch64::LDRHui;
5473 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
5474 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5475 "Unexpected register load without SVE load instructions");
5476 if (IsPNR)
5477 PNRReg = DestReg;
5478 Opc = AArch64::LDR_PXI;
5479 StackID = TargetStackID::ScalableVector;
5481 break;
5483 case 4:
5484 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5485 Opc = AArch64::LDRWui;
5486 if (DestReg.isVirtual())
5487 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
5488 else
5489 assert(DestReg != AArch64::WSP);
5490 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5491 Opc = AArch64::LDRSui;
5492 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5493 Opc = AArch64::LDR_PPXI;
5494 StackID = TargetStackID::ScalableVector;
5496 break;
5497 case 8:
5498 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5499 Opc = AArch64::LDRXui;
5500 if (DestReg.isVirtual())
5501 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
5502 else
5503 assert(DestReg != AArch64::SP);
5504 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5505 Opc = AArch64::LDRDui;
5506 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5507 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5508 get(AArch64::LDPWi), DestReg, AArch64::sube32,
5509 AArch64::subo32, FI, MMO);
5510 return;
5512 break;
5513 case 16:
5514 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5515 Opc = AArch64::LDRQui;
5516 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5517 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5518 Opc = AArch64::LD1Twov1d;
5519 Offset = false;
5520 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5521 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5522 get(AArch64::LDPXi), DestReg, AArch64::sube64,
5523 AArch64::subo64, FI, MMO);
5524 return;
5525 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5526 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5527 "Unexpected register load without SVE load instructions");
5528 Opc = AArch64::LDR_ZXI;
5529 StackID = TargetStackID::ScalableVector;
5531 break;
5532 case 24:
5533 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5534 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5535 Opc = AArch64::LD1Threev1d;
5536 Offset = false;
5538 break;
5539 case 32:
5540 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5541 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5542 Opc = AArch64::LD1Fourv1d;
5543 Offset = false;
5544 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5545 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5546 Opc = AArch64::LD1Twov2d;
5547 Offset = false;
5548 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
5549 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5550 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5551 "Unexpected register load without SVE load instructions");
5552 Opc = AArch64::LDR_ZZXI;
5553 StackID = TargetStackID::ScalableVector;
5555 break;
5556 case 48:
5557 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5558 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5559 Opc = AArch64::LD1Threev2d;
5560 Offset = false;
5561 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5562 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5563 "Unexpected register load without SVE load instructions");
5564 Opc = AArch64::LDR_ZZZXI;
5565 StackID = TargetStackID::ScalableVector;
5567 break;
5568 case 64:
5569 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5570 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5571 Opc = AArch64::LD1Fourv2d;
5572 Offset = false;
5573 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
5574 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5575 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5576 "Unexpected register load without SVE load instructions");
5577 Opc = AArch64::LDR_ZZZZXI;
5578 StackID = TargetStackID::ScalableVector;
5580 break;
5583 assert(Opc && "Unknown register class");
5584 MFI.setStackID(FI, StackID);
5586 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5587 .addReg(DestReg, getDefRegState(true))
5588 .addFrameIndex(FI);
5589 if (Offset)
5590 MI.addImm(0);
5591 if (PNRReg.isValid() && !PNRReg.isVirtual())
5592 MI.addDef(PNRReg, RegState::Implicit);
5593 MI.addMemOperand(MMO);
5596 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
5597 const MachineInstr &UseMI,
5598 const TargetRegisterInfo *TRI) {
5599 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
5600 UseMI.getIterator()),
5601 [TRI](const MachineInstr &I) {
5602 return I.modifiesRegister(AArch64::NZCV, TRI) ||
5603 I.readsRegister(AArch64::NZCV, TRI);
5607 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5608 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
5609 // The smallest scalable element supported by scaled SVE addressing
5610 // modes is a predicate, which is 2 scalable bytes in size. So the scalable
5611 // byte offset must always be a multiple of 2.
5612 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5614 // VGSized offsets are divided by '2', because the VG register is the
5615 // number of 64-bit granules as opposed to 128-bit vector chunks,
5616 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
5617 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
5618 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
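// For example, a StackOffset of { Fixed = 16, Scalable = 8 }, i.e.
// 16 + 8 * vscale bytes, decomposes into ByteSized = 16 and VGSized = 4:
// with VG = 2 * vscale, an offset of 4 * VG covers exactly 8 * vscale bytes.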
5619 ByteSized = Offset.getFixed();
5620 VGSized = Offset.getScalable() / 2;
5623 /// Returns the offset in parts to which this frame offset can be
5624 /// decomposed for the purpose of describing a frame offset.
5625 /// For non-scalable offsets this is simply its byte size.
5626 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5627 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
5628 int64_t &NumDataVectors) {
5629 // The smallest scalable element supported by scaled SVE addressing
5630 // modes is a predicate, which is 2 scalable bytes in size. So the scalable
5631 // byte offset must always be a multiple of 2.
5632 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5634 NumBytes = Offset.getFixed();
5635 NumDataVectors = 0;
5636 NumPredicateVectors = Offset.getScalable() / 2;
5637 // This method is used to get the offsets to adjust the frame offset.
5638 // If the function requires ADDPL to be used and needs more than two ADDPL
5639 // instructions, part of the offset is folded into NumDataVectors so that it
5640 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
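// For example, a purely scalable offset of 160 bytes initially gives
// NumPredicateVectors = 80; since 80 % 8 == 0 it is folded into
// NumDataVectors = 10 and NumPredicateVectors = 0, so a single ADDVL can be
// used instead of several ADDPLs.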
5641 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
5642 NumPredicateVectors > 62) {
5643 NumDataVectors = NumPredicateVectors / 8;
5644 NumPredicateVectors -= NumDataVectors * 8;
5648 // Convenience function to create a DWARF expression for
5649 // Expr + NumBytes + NumVGScaledBytes * AArch64::VG
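// For instance, NumBytes = -16 and NumVGScaledBytes = -8 append the DWARF
// operations
//   DW_OP_consts -16, DW_OP_plus,
//   DW_OP_consts -8, DW_OP_bregx VG 0, DW_OP_mul, DW_OP_plus
// and the comment text " - 16 - 8 * VG".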
5650 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
5651 int NumVGScaledBytes, unsigned VG,
5652 llvm::raw_string_ostream &Comment) {
5653 uint8_t buffer[16];
5655 if (NumBytes) {
5656 Expr.push_back(dwarf::DW_OP_consts);
5657 Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
5658 Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5659 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
5662 if (NumVGScaledBytes) {
5663 Expr.push_back((uint8_t)dwarf::DW_OP_consts);
5664 Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
5666 Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
5667 Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
5668 Expr.push_back(0);
5670 Expr.push_back((uint8_t)dwarf::DW_OP_mul);
5671 Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5673 Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
5674 << std::abs(NumVGScaledBytes) << " * VG";
5678 // Creates an MCCFIInstruction:
5679 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
5680 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
5681 unsigned Reg,
5682 const StackOffset &Offset) {
5683 int64_t NumBytes, NumVGScaledBytes;
5684 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
5685 NumVGScaledBytes);
5686 std::string CommentBuffer;
5687 llvm::raw_string_ostream Comment(CommentBuffer);
5689 if (Reg == AArch64::SP)
5690 Comment << "sp";
5691 else if (Reg == AArch64::FP)
5692 Comment << "fp";
5693 else
5694 Comment << printReg(Reg, &TRI);
5696 // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
5697 SmallString<64> Expr;
5698 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5699 Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
5700 Expr.push_back(0);
5701 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
5702 TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5704 // Wrap this into DW_CFA_def_cfa.
5705 SmallString<64> DefCfaExpr;
5706 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
5707 uint8_t buffer[16];
5708 DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
5709 DefCfaExpr.append(Expr.str());
5710 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
5711 Comment.str());
5714 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
5715 unsigned FrameReg, unsigned Reg,
5716 const StackOffset &Offset,
5717 bool LastAdjustmentWasScalable) {
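// To sketch the three outcomes below: a purely fixed offset of 32 with
// FrameReg == Reg (and no preceding scalable adjustment) yields
// ".cfi_def_cfa_offset 32"; the same offset against another Reg such as FP
// yields ".cfi_def_cfa w29, 32"; any scalable component instead produces the
// DW_CFA_def_cfa_expression escape built by createDefCFAExpression.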
5718 if (Offset.getScalable())
5719 return createDefCFAExpression(TRI, Reg, Offset);
5721 if (FrameReg == Reg && !LastAdjustmentWasScalable)
5722 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
5724 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5725 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
5728 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
5729 unsigned Reg,
5730 const StackOffset &OffsetFromDefCFA) {
5731 int64_t NumBytes, NumVGScaledBytes;
5732 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5733 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
5735 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5737 // Non-scalable offsets can use DW_CFA_offset directly.
5738 if (!NumVGScaledBytes)
5739 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
5741 std::string CommentBuffer;
5742 llvm::raw_string_ostream Comment(CommentBuffer);
5743 Comment << printReg(Reg, &TRI) << " @ cfa";
5745 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
5746 SmallString<64> OffsetExpr;
5747 appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
5748 TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5750 // Wrap this into DW_CFA_expression
5751 SmallString<64> CfaExpr;
5752 CfaExpr.push_back(dwarf::DW_CFA_expression);
5753 uint8_t buffer[16];
5754 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
5755 CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
5756 CfaExpr.append(OffsetExpr.str());
5758 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
5759 Comment.str());
5762 // Helper function to emit a frame offset adjustment from a given
5763 // pointer (SrcReg) into DestReg. The function is explicit in that the
5764 // caller must supply the exact add/sub opcode to use.
5765 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
5766 MachineBasicBlock::iterator MBBI,
5767 const DebugLoc &DL, unsigned DestReg,
5768 unsigned SrcReg, int64_t Offset, unsigned Opc,
5769 const TargetInstrInfo *TII,
5770 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
5771 bool *HasWinCFI, bool EmitCFAOffset,
5772 StackOffset CFAOffset, unsigned FrameReg) {
5773 int Sign = 1;
5774 unsigned MaxEncoding, ShiftSize;
5775 switch (Opc) {
5776 case AArch64::ADDXri:
5777 case AArch64::ADDSXri:
5778 case AArch64::SUBXri:
5779 case AArch64::SUBSXri:
5780 MaxEncoding = 0xfff;
5781 ShiftSize = 12;
5782 break;
5783 case AArch64::ADDVL_XXI:
5784 case AArch64::ADDPL_XXI:
5785 case AArch64::ADDSVL_XXI:
5786 case AArch64::ADDSPL_XXI:
5787 MaxEncoding = 31;
5788 ShiftSize = 0;
5789 if (Offset < 0) {
5790 MaxEncoding = 32;
5791 Sign = -1;
5792 Offset = -Offset;
5794 break;
5795 default:
5796 llvm_unreachable("Unsupported opcode");
5799 // `Offset` can be in bytes or in "scalable bytes".
5800 int VScale = 1;
5801 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
5802 VScale = 16;
5803 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
5804 VScale = 2;
5806 // FIXME: If the offset won't fit in 24 bits, compute the offset into a
5807 // scratch register. If DestReg is a virtual register, use it as the
5808 // scratch register; otherwise, create a new virtual register (to be
5809 // replaced by the scavenger at the end of PEI). That case can be optimized
5810 // slightly if DestReg is SP, which is always 16-byte aligned, so the scratch
5811 // register can be loaded with offset%8 and the add/sub can use an extending
5812 // instruction with LSL#3.
5813 // Currently the function handles any offsets but generates a poor sequence
5814 // of code.
5815 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
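// As a worked example for the ADDXri path: Offset = 0x12345 is emitted as
//   ADD DestReg, SrcReg,  #0x12, LSL #12   ; consumes 0x12000
//   ADD DestReg, DestReg, #0x345
// since each iteration of the loop below encodes at most
// MaxEncoding << ShiftSize of the remaining offset, accumulating into
// DestReg (or into a scratch vreg when DestReg is XZR).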
5817 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
5818 Register TmpReg = DestReg;
5819 if (TmpReg == AArch64::XZR)
5820 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
5821 &AArch64::GPR64RegClass);
5822 do {
5823 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
5824 unsigned LocalShiftSize = 0;
5825 if (ThisVal > MaxEncoding) {
5826 ThisVal = ThisVal >> ShiftSize;
5827 LocalShiftSize = ShiftSize;
5829 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
5830 "Encoding cannot handle value that big");
5832 Offset -= ThisVal << LocalShiftSize;
5833 if (Offset == 0)
5834 TmpReg = DestReg;
5835 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
5836 .addReg(SrcReg)
5837 .addImm(Sign * (int)ThisVal);
5838 if (ShiftSize)
5839 MBI = MBI.addImm(
5840 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
5841 MBI = MBI.setMIFlag(Flag);
5843 auto Change =
5844 VScale == 1
5845 ? StackOffset::getFixed(ThisVal << LocalShiftSize)
5846 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
5847 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
5848 CFAOffset += Change;
5849 else
5850 CFAOffset -= Change;
5851 if (EmitCFAOffset && DestReg == TmpReg) {
5852 MachineFunction &MF = *MBB.getParent();
5853 const TargetSubtargetInfo &STI = MF.getSubtarget();
5854 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
5856 unsigned CFIIndex = MF.addFrameInst(
5857 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
5858 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
5859 .addCFIIndex(CFIIndex)
5860 .setMIFlags(Flag);
5863 if (NeedsWinCFI) {
5864 assert(Sign == 1 && "SEH directives should always have a positive sign");
5865 int Imm = (int)(ThisVal << LocalShiftSize);
5866 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
5867 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
5868 if (HasWinCFI)
5869 *HasWinCFI = true;
5870 if (Imm == 0)
5871 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
5872 else
5873 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
5874 .addImm(Imm)
5875 .setMIFlag(Flag);
5876 assert(Offset == 0 && "Expected remaining offset to be zero to "
5877 "emit a single SEH directive");
5878 } else if (DestReg == AArch64::SP) {
5879 if (HasWinCFI)
5880 *HasWinCFI = true;
5881 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
5882 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
5883 .addImm(Imm)
5884 .setMIFlag(Flag);
5888 SrcReg = TmpReg;
5889 } while (Offset);
5892 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
5893 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
5894 unsigned DestReg, unsigned SrcReg,
5895 StackOffset Offset, const TargetInstrInfo *TII,
5896 MachineInstr::MIFlag Flag, bool SetNZCV,
5897 bool NeedsWinCFI, bool *HasWinCFI,
5898 bool EmitCFAOffset, StackOffset CFAOffset,
5899 unsigned FrameReg) {
5900 // If a function is marked as arm_locally_streaming, then the runtime value of
5901 // vscale in the prologue/epilogue is different from the runtime value of vscale
5902 // in the function's body. To avoid having to consider multiple vscales,
5903 // we can use `addsvl` to allocate any scalable stack-slots, which under
5904 // most circumstances will be only locals, not callee-save slots.
5905 const Function &F = MBB.getParent()->getFunction();
5906 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
5908 int64_t Bytes, NumPredicateVectors, NumDataVectors;
5909 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5910 Offset, Bytes, NumPredicateVectors, NumDataVectors);
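// For example, an Offset of { Fixed = 32, Scalable = 48 } decomposes into
// Bytes = 32 and NumDataVectors = 3 and is emitted below as
//   ADD   DestReg, SrcReg,  #32
//   ADDVL DestReg, DestReg, #3
// (ADDSVL replaces ADDVL when UseSVL is set).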
5912 // First emit non-scalable frame offsets, or a simple 'mov'.
5913 if (Bytes || (!Offset && SrcReg != DestReg)) {
5914 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
5915 "SP increment/decrement not 8-byte aligned");
5916 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
5917 if (Bytes < 0) {
5918 Bytes = -Bytes;
5919 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
5921 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
5922 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
5923 FrameReg);
5924 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
5925 ? StackOffset::getFixed(-Bytes)
5926 : StackOffset::getFixed(Bytes);
5927 SrcReg = DestReg;
5928 FrameReg = DestReg;
5931 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
5932 "SetNZCV not supported with SVE vectors");
5933 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
5934 "WinCFI not supported with SVE vectors");
5936 if (NumDataVectors) {
5937 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
5938 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
5939 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5940 CFAOffset, FrameReg);
5941 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
5942 SrcReg = DestReg;
5945 if (NumPredicateVectors) {
5946 assert(DestReg != AArch64::SP && "Unaligned access to SP");
5947 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
5948 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
5949 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5950 CFAOffset, FrameReg);
5954 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
5955 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
5956 MachineBasicBlock::iterator InsertPt, int FrameIndex,
5957 LiveIntervals *LIS, VirtRegMap *VRM) const {
5958 // This is a bit of a hack. Consider this instruction:
5960 // %0 = COPY %sp; GPR64all:%0
5962 // We explicitly chose GPR64all for the virtual register so such a copy might
5963 // be eliminated by RegisterCoalescer. However, that may not be possible, and
5964 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
5965 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
5967 // To prevent that, we are going to constrain the %0 register class here.
5968 if (MI.isFullCopy()) {
5969 Register DstReg = MI.getOperand(0).getReg();
5970 Register SrcReg = MI.getOperand(1).getReg();
5971 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
5972 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
5973 return nullptr;
5975 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
5976 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5977 return nullptr;
5979 // Nothing can be folded with a copy from/to NZCV.
5980 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
5981 return nullptr;
5984 // Handle the case where a copy is being spilled or filled but the source
5985 // and destination register class don't match. For example:
5987 // %0 = COPY %xzr; GPR64common:%0
5989 // In this case we can still safely fold away the COPY and generate the
5990 // following spill code:
5992 // STRXui %xzr, %stack.0
5994 // This also eliminates spilled cross register class COPYs (e.g. between x and
5995 // d regs) of the same size. For example:
5997 // %0 = COPY %1; GPR64:%0, FPR64:%1
5999 // will be filled as
6001 // LDRDui %0, fi<#0>
6003 // instead of
6005 // LDRXui %Temp, fi<#0>
6006 // %0 = FMOV %Temp
6008 if (MI.isCopy() && Ops.size() == 1 &&
6009 // Make sure we're only folding the explicit COPY defs/uses.
6010 (Ops[0] == 0 || Ops[0] == 1)) {
6011 bool IsSpill = Ops[0] == 0;
6012 bool IsFill = !IsSpill;
6013 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
6014 const MachineRegisterInfo &MRI = MF.getRegInfo();
6015 MachineBasicBlock &MBB = *MI.getParent();
6016 const MachineOperand &DstMO = MI.getOperand(0);
6017 const MachineOperand &SrcMO = MI.getOperand(1);
6018 Register DstReg = DstMO.getReg();
6019 Register SrcReg = SrcMO.getReg();
6020 // This is slightly expensive to compute for physical regs since
6021 // getMinimalPhysRegClass is slow.
6022 auto getRegClass = [&](unsigned Reg) {
6023 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
6024 : TRI.getMinimalPhysRegClass(Reg);
6027 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
6028 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
6029 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
6030 "Mismatched register size in non subreg COPY");
6031 if (IsSpill)
6032 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
6033 getRegClass(SrcReg), &TRI, Register());
6034 else
6035 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
6036 getRegClass(DstReg), &TRI, Register());
6037 return &*--InsertPt;
6040 // Handle cases like spilling def of:
6042 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
6044 // where the physical register source can be widened and stored to the full
6045 // virtual reg destination stack slot, in this case producing:
6047 // STRXui %xzr, %stack.0
6049 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
6050 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
6051 assert(SrcMO.getSubReg() == 0 &&
6052 "Unexpected subreg on physical register");
6053 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
6054 FrameIndex, &AArch64::GPR64RegClass, &TRI,
6055 Register());
6056 return &*--InsertPt;
6059 // Handle cases like filling use of:
6061 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
6063 // where we can load the full virtual reg source stack slot into the subreg
6064 // destination, in this case producing:
6066 // LDRWui %0:sub_32<def,read-undef>, %stack.0
6068 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
6069 const TargetRegisterClass *FillRC;
6070 switch (DstMO.getSubReg()) {
6071 default:
6072 FillRC = nullptr;
6073 break;
6074 case AArch64::sub_32:
6075 FillRC = &AArch64::GPR32RegClass;
6076 break;
6077 case AArch64::ssub:
6078 FillRC = &AArch64::FPR32RegClass;
6079 break;
6080 case AArch64::dsub:
6081 FillRC = &AArch64::FPR64RegClass;
6082 break;
6085 if (FillRC) {
6086 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
6087 TRI.getRegSizeInBits(*FillRC) &&
6088 "Mismatched regclass size on folded subreg COPY");
6089 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
6090 Register());
6091 MachineInstr &LoadMI = *--InsertPt;
6092 MachineOperand &LoadDst = LoadMI.getOperand(0);
6093 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
6094 LoadDst.setSubReg(DstMO.getSubReg());
6095 LoadDst.setIsUndef();
6096 return &LoadMI;
6101 // Cannot fold.
6102 return nullptr;
6105 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
6106 StackOffset &SOffset,
6107 bool *OutUseUnscaledOp,
6108 unsigned *OutUnscaledOp,
6109 int64_t *EmittableOffset) {
6110 // Set output values in case of early exit.
6111 if (EmittableOffset)
6112 *EmittableOffset = 0;
6113 if (OutUseUnscaledOp)
6114 *OutUseUnscaledOp = false;
6115 if (OutUnscaledOp)
6116 *OutUnscaledOp = 0;
6118 // Exit early for structured vector spills/fills as they can't take an
6119 // immediate offset.
6120 switch (MI.getOpcode()) {
6121 default:
6122 break;
6123 case AArch64::LD1Rv1d:
6124 case AArch64::LD1Rv2s:
6125 case AArch64::LD1Rv2d:
6126 case AArch64::LD1Rv4h:
6127 case AArch64::LD1Rv4s:
6128 case AArch64::LD1Rv8b:
6129 case AArch64::LD1Rv8h:
6130 case AArch64::LD1Rv16b:
6131 case AArch64::LD1Twov2d:
6132 case AArch64::LD1Threev2d:
6133 case AArch64::LD1Fourv2d:
6134 case AArch64::LD1Twov1d:
6135 case AArch64::LD1Threev1d:
6136 case AArch64::LD1Fourv1d:
6137 case AArch64::ST1Twov2d:
6138 case AArch64::ST1Threev2d:
6139 case AArch64::ST1Fourv2d:
6140 case AArch64::ST1Twov1d:
6141 case AArch64::ST1Threev1d:
6142 case AArch64::ST1Fourv1d:
6143 case AArch64::ST1i8:
6144 case AArch64::ST1i16:
6145 case AArch64::ST1i32:
6146 case AArch64::ST1i64:
6147 case AArch64::IRG:
6148 case AArch64::IRGstack:
6149 case AArch64::STGloop:
6150 case AArch64::STZGloop:
6151 return AArch64FrameOffsetCannotUpdate;
6154 // Get the min/max offset and the scale.
6155 TypeSize ScaleValue(0U, false), Width(0U, false);
6156 int64_t MinOff, MaxOff;
6157 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
6158 MaxOff))
6159 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6161 // Construct the complete offset.
6162 bool IsMulVL = ScaleValue.isScalable();
6163 unsigned Scale = ScaleValue.getKnownMinValue();
6164 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
6166 const MachineOperand &ImmOpnd =
6167 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
6168 Offset += ImmOpnd.getImm() * Scale;
6170 // If the offset doesn't match the scale, we rewrite the instruction to
6171 // use the unscaled instruction instead; likewise if the offset is negative
6172 // and an unscaled op is available.
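// To illustrate the rescaling: for STRXui (scale 8, unsigned 12-bit
// immediate) with an incoming fixed offset of 20 and an encoded immediate of
// 1, the combined byte offset is 28. That is not a multiple of 8, so the
// unscaled STURXi (scale 1, range [-256, 255]) is chosen, the emittable
// offset becomes 28, and the residual SOffset is zero, i.e. the result is
// AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal.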
6173 std::optional<unsigned> UnscaledOp =
6174 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
6175 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
6176 if (useUnscaledOp &&
6177 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
6178 MaxOff))
6179 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6181 Scale = ScaleValue.getKnownMinValue();
6182 assert(IsMulVL == ScaleValue.isScalable() &&
6183 "Unscaled opcode has different value for scalable");
6185 int64_t Remainder = Offset % Scale;
6186 assert(!(Remainder && useUnscaledOp) &&
6187 "Cannot have remainder when using unscaled op");
6189 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
6190 int64_t NewOffset = Offset / Scale;
6191 if (MinOff <= NewOffset && NewOffset <= MaxOff)
6192 Offset = Remainder;
6193 else {
6194 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
6195 Offset = Offset - (NewOffset * Scale);
6198 if (EmittableOffset)
6199 *EmittableOffset = NewOffset;
6200 if (OutUseUnscaledOp)
6201 *OutUseUnscaledOp = useUnscaledOp;
6202 if (OutUnscaledOp && UnscaledOp)
6203 *OutUnscaledOp = *UnscaledOp;
6205 if (IsMulVL)
6206 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
6207 else
6208 SOffset = StackOffset::get(Offset, SOffset.getScalable());
6209 return AArch64FrameOffsetCanUpdate |
6210 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
6213 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
6214 unsigned FrameReg, StackOffset &Offset,
6215 const AArch64InstrInfo *TII) {
6216 unsigned Opcode = MI.getOpcode();
6217 unsigned ImmIdx = FrameRegIdx + 1;
6219 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
6220 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
6221 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
6222 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
6223 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
6224 MI.eraseFromParent();
6225 Offset = StackOffset();
6226 return true;
6229 int64_t NewOffset;
6230 unsigned UnscaledOp;
6231 bool UseUnscaledOp;
6232 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
6233 &UnscaledOp, &NewOffset);
6234 if (Status & AArch64FrameOffsetCanUpdate) {
6235 if (Status & AArch64FrameOffsetIsLegal)
6236 // Replace the FrameIndex with FrameReg.
6237 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
6238 if (UseUnscaledOp)
6239 MI.setDesc(TII->get(UnscaledOp));
6241 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
6242 return !Offset;
6245 return false;
6248 void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
6249 MachineBasicBlock::iterator MI) const {
6250 DebugLoc DL;
6251 BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
6254 MCInst AArch64InstrInfo::getNop() const {
6255 return MCInstBuilder(AArch64::HINT).addImm(0);
6258 // AArch64 supports MachineCombiner.
6259 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
6261 // True when Opc sets flag
6262 static bool isCombineInstrSettingFlag(unsigned Opc) {
6263 switch (Opc) {
6264 case AArch64::ADDSWrr:
6265 case AArch64::ADDSWri:
6266 case AArch64::ADDSXrr:
6267 case AArch64::ADDSXri:
6268 case AArch64::SUBSWrr:
6269 case AArch64::SUBSXrr:
6270 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
6271 case AArch64::SUBSWri:
6272 case AArch64::SUBSXri:
6273 return true;
6274 default:
6275 break;
6277 return false;
6280 // 32b Opcodes that can be combined with a MUL
6281 static bool isCombineInstrCandidate32(unsigned Opc) {
6282 switch (Opc) {
6283 case AArch64::ADDWrr:
6284 case AArch64::ADDWri:
6285 case AArch64::SUBWrr:
6286 case AArch64::ADDSWrr:
6287 case AArch64::ADDSWri:
6288 case AArch64::SUBSWrr:
6289 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
6290 case AArch64::SUBWri:
6291 case AArch64::SUBSWri:
6292 return true;
6293 default:
6294 break;
6296 return false;
6299 // 64b Opcodes that can be combined with a MUL
6300 static bool isCombineInstrCandidate64(unsigned Opc) {
6301 switch (Opc) {
6302 case AArch64::ADDXrr:
6303 case AArch64::ADDXri:
6304 case AArch64::SUBXrr:
6305 case AArch64::ADDSXrr:
6306 case AArch64::ADDSXri:
6307 case AArch64::SUBSXrr:
6308 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
6309 case AArch64::SUBXri:
6310 case AArch64::SUBSXri:
6311 case AArch64::ADDv8i8:
6312 case AArch64::ADDv16i8:
6313 case AArch64::ADDv4i16:
6314 case AArch64::ADDv8i16:
6315 case AArch64::ADDv2i32:
6316 case AArch64::ADDv4i32:
6317 case AArch64::SUBv8i8:
6318 case AArch64::SUBv16i8:
6319 case AArch64::SUBv4i16:
6320 case AArch64::SUBv8i16:
6321 case AArch64::SUBv2i32:
6322 case AArch64::SUBv4i32:
6323 return true;
6324 default:
6325 break;
6327 return false;
6330 // FP Opcodes that can be combined with a FMUL.
6331 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
6332 switch (Inst.getOpcode()) {
6333 default:
6334 break;
6335 case AArch64::FADDHrr:
6336 case AArch64::FADDSrr:
6337 case AArch64::FADDDrr:
6338 case AArch64::FADDv4f16:
6339 case AArch64::FADDv8f16:
6340 case AArch64::FADDv2f32:
6341 case AArch64::FADDv2f64:
6342 case AArch64::FADDv4f32:
6343 case AArch64::FSUBHrr:
6344 case AArch64::FSUBSrr:
6345 case AArch64::FSUBDrr:
6346 case AArch64::FSUBv4f16:
6347 case AArch64::FSUBv8f16:
6348 case AArch64::FSUBv2f32:
6349 case AArch64::FSUBv2f64:
6350 case AArch64::FSUBv4f32:
6351 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
6352 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
6353 // the target options or if FADD/FSUB has the contract fast-math flag.
6354 return Options.UnsafeFPMath ||
6355 Options.AllowFPOpFusion == FPOpFusion::Fast ||
6356 Inst.getFlag(MachineInstr::FmContract);
6357 return true;
6359 return false;
6362 // Opcodes that can be combined with a MUL
6363 static bool isCombineInstrCandidate(unsigned Opc) {
6364 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
6368 // Utility routine that checks if \param MO is defined by an
6369 // \param CombineOpc instruction in the basic block \param MBB
6370 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
6371 unsigned CombineOpc, unsigned ZeroReg = 0,
6372 bool CheckZeroReg = false) {
6373 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6374 MachineInstr *MI = nullptr;
6376 if (MO.isReg() && MO.getReg().isVirtual())
6377 MI = MRI.getUniqueVRegDef(MO.getReg());
6378 // And it needs to be in the trace (otherwise, it won't have a depth).
6379 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
6380 return false;
6381 // Must only be used by the user we combine with.
6382 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
6383 return false;
6385 if (CheckZeroReg) {
6386 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
6387 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
6388 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
6389 // The third input reg must be zero.
6390 if (MI->getOperand(3).getReg() != ZeroReg)
6391 return false;
6394 if (isCombineInstrSettingFlag(CombineOpc) &&
6395 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
6396 return false;
6398 return true;
6402 // Is \param MO defined by an integer multiply and can be combined?
6403 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
6404 unsigned MulOpc, unsigned ZeroReg) {
6405 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
6409 // Is \param MO defined by a floating-point multiply and can be combined?
6410 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
6411 unsigned MulOpc) {
6412 return canCombine(MBB, MO, MulOpc);
6415 // TODO: There are many more machine instruction opcodes to match:
6416 // 1. Other data types (integer, vectors)
6417 // 2. Other math / logic operations (xor, or)
6418 // 3. Other forms of the same operation (intrinsics and other variants)
6419 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
6420 bool Invert) const {
6421 if (Invert)
6422 return false;
6423 switch (Inst.getOpcode()) {
6424 // == Floating-point types ==
6425 // -- Floating-point instructions --
6426 case AArch64::FADDHrr:
6427 case AArch64::FADDSrr:
6428 case AArch64::FADDDrr:
6429 case AArch64::FMULHrr:
6430 case AArch64::FMULSrr:
6431 case AArch64::FMULDrr:
6432 case AArch64::FMULX16:
6433 case AArch64::FMULX32:
6434 case AArch64::FMULX64:
6435 // -- Advanced SIMD instructions --
6436 case AArch64::FADDv4f16:
6437 case AArch64::FADDv8f16:
6438 case AArch64::FADDv2f32:
6439 case AArch64::FADDv4f32:
6440 case AArch64::FADDv2f64:
6441 case AArch64::FMULv4f16:
6442 case AArch64::FMULv8f16:
6443 case AArch64::FMULv2f32:
6444 case AArch64::FMULv4f32:
6445 case AArch64::FMULv2f64:
6446 case AArch64::FMULXv4f16:
6447 case AArch64::FMULXv8f16:
6448 case AArch64::FMULXv2f32:
6449 case AArch64::FMULXv4f32:
6450 case AArch64::FMULXv2f64:
6451 // -- SVE instructions --
6452 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
6453 // in the SVE instruction set (though there are predicated ones).
6454 case AArch64::FADD_ZZZ_H:
6455 case AArch64::FADD_ZZZ_S:
6456 case AArch64::FADD_ZZZ_D:
6457 case AArch64::FMUL_ZZZ_H:
6458 case AArch64::FMUL_ZZZ_S:
6459 case AArch64::FMUL_ZZZ_D:
6460 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
6461 (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
6462 Inst.getFlag(MachineInstr::MIFlag::FmNsz));
6464 // == Integer types ==
6465 // -- Base instructions --
6466 // Opcodes MULWrr and MULXrr don't exist because
6467 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
6468 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
6469 // The machine-combiner does not support three-source-operand machine
6470 // instructions, so we cannot reassociate MULs.
6471 case AArch64::ADDWrr:
6472 case AArch64::ADDXrr:
6473 case AArch64::ANDWrr:
6474 case AArch64::ANDXrr:
6475 case AArch64::ORRWrr:
6476 case AArch64::ORRXrr:
6477 case AArch64::EORWrr:
6478 case AArch64::EORXrr:
6479 case AArch64::EONWrr:
6480 case AArch64::EONXrr:
6481 // -- Advanced SIMD instructions --
6482 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
6483 // in the Advanced SIMD instruction set.
6484 case AArch64::ADDv8i8:
6485 case AArch64::ADDv16i8:
6486 case AArch64::ADDv4i16:
6487 case AArch64::ADDv8i16:
6488 case AArch64::ADDv2i32:
6489 case AArch64::ADDv4i32:
6490 case AArch64::ADDv1i64:
6491 case AArch64::ADDv2i64:
6492 case AArch64::MULv8i8:
6493 case AArch64::MULv16i8:
6494 case AArch64::MULv4i16:
6495 case AArch64::MULv8i16:
6496 case AArch64::MULv2i32:
6497 case AArch64::MULv4i32:
6498 case AArch64::ANDv8i8:
6499 case AArch64::ANDv16i8:
6500 case AArch64::ORRv8i8:
6501 case AArch64::ORRv16i8:
6502 case AArch64::EORv8i8:
6503 case AArch64::EORv16i8:
6504 // -- SVE instructions --
6505 case AArch64::ADD_ZZZ_B:
6506 case AArch64::ADD_ZZZ_H:
6507 case AArch64::ADD_ZZZ_S:
6508 case AArch64::ADD_ZZZ_D:
6509 case AArch64::MUL_ZZZ_B:
6510 case AArch64::MUL_ZZZ_H:
6511 case AArch64::MUL_ZZZ_S:
6512 case AArch64::MUL_ZZZ_D:
6513 case AArch64::AND_ZZZ:
6514 case AArch64::ORR_ZZZ:
6515 case AArch64::EOR_ZZZ:
6516 return true;
6518 default:
6519 return false;
6523 /// Find instructions that can be turned into madd.
6524 static bool getMaddPatterns(MachineInstr &Root,
6525 SmallVectorImpl<unsigned> &Patterns) {
6526 unsigned Opc = Root.getOpcode();
6527 MachineBasicBlock &MBB = *Root.getParent();
6528 bool Found = false;
6530 if (!isCombineInstrCandidate(Opc))
6531 return false;
6532 if (isCombineInstrSettingFlag(Opc)) {
6533 int Cmp_NZCV =
6534 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
6535 // When NZCV is live, bail out.
6536 if (Cmp_NZCV == -1)
6537 return false;
6538 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
6539 // When the opcode can't change, bail out.
6540 // CHECKME: do we miss any cases for opcode conversion?
6541 if (NewOpc == Opc)
6542 return false;
6543 Opc = NewOpc;
6546 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
6547 unsigned Pattern) {
6548 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
6549 Patterns.push_back(Pattern);
6550 Found = true;
6554 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
6555 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
6556 Patterns.push_back(Pattern);
6557 Found = true;
6561 typedef AArch64MachineCombinerPattern MCP;
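// For illustration: given
//   %3:gpr32 = MADDWrrr %1, %2, $wzr    ; i.e. a plain 32-bit MUL
//   %4:gpr32 = ADDWrr %3, %0
// matching operand 1 of the ADD records MCP::MULADDW_OP1, which the
// combiner later rewrites into a single MADDWrrr %1, %2, %0.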
6563 switch (Opc) {
6564 default:
6565 break;
6566 case AArch64::ADDWrr:
6567 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6568 "ADDWrr does not have register operands");
6569 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
6570 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
6571 break;
6572 case AArch64::ADDXrr:
6573 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
6574 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
6575 break;
6576 case AArch64::SUBWrr:
6577 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
6578 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
6579 break;
6580 case AArch64::SUBXrr:
6581 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
6582 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
6583 break;
6584 case AArch64::ADDWri:
6585 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
6586 break;
6587 case AArch64::ADDXri:
6588 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
6589 break;
6590 case AArch64::SUBWri:
6591 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
6592 break;
6593 case AArch64::SUBXri:
6594 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
6595 break;
6596 case AArch64::ADDv8i8:
6597 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
6598 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
6599 break;
6600 case AArch64::ADDv16i8:
6601 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
6602 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
6603 break;
6604 case AArch64::ADDv4i16:
6605 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
6606 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
6607 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
6608 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
6609 break;
6610 case AArch64::ADDv8i16:
6611 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
6612 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
6613 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
6614 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
6615 break;
6616 case AArch64::ADDv2i32:
6617 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
6618 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
6619 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
6620 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
6621 break;
6622 case AArch64::ADDv4i32:
6623 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
6624 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
6625 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
6626 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
6627 break;
6628 case AArch64::SUBv8i8:
6629 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
6630 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
6631 break;
6632 case AArch64::SUBv16i8:
6633 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
6634 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
6635 break;
6636 case AArch64::SUBv4i16:
6637 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
6638 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
6639 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
6640 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
6641 break;
6642 case AArch64::SUBv8i16:
6643 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
6644 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
6645 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
6646 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
6647 break;
6648 case AArch64::SUBv2i32:
6649 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
6650 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
6651 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
6652 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
6653 break;
6654 case AArch64::SUBv4i32:
6655 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
6656 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
6657 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
6658 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
6659 break;
6661 return Found;
6663 /// Floating-Point Support
6665 /// Find instructions that can be turned into an fmadd.
6666 static bool getFMAPatterns(MachineInstr &Root,
6667 SmallVectorImpl<unsigned> &Patterns) {
6669 if (!isCombineInstrCandidateFP(Root))
6670 return false;
6672 MachineBasicBlock &MBB = *Root.getParent();
6673 bool Found = false;
6675 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
6676 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
6677 Patterns.push_back(Pattern);
6678 return true;
6680 return false;
6683 typedef AArch64MachineCombinerPattern MCP;
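// For illustration: given
//   %2:fpr32 = FMULSrr %0, %1
//   %3:fpr32 = contract FADDSrr %2, %acc
// matching operand 1 of the FADD records MCP::FMULADDS_OP1, which is later
// rewritten into a single FMADDSrrr %0, %1, %acc.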
6685 switch (Root.getOpcode()) {
6686 default:
6687 assert(false && "Unsupported FP instruction in combiner\n");
6688 break;
6689 case AArch64::FADDHrr:
6690 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6691 "FADDHrr does not have register operands");
6693 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
6694 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
6695 break;
6696 case AArch64::FADDSrr:
6697 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6698 "FADDSrr does not have register operands");
6700 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
6701 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
6703 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
6704 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
6705 break;
6706 case AArch64::FADDDrr:
6707 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
6708 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
6710 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
6711 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
6712 break;
6713 case AArch64::FADDv4f16:
6714 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
6715 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
6717 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
6718 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
6719 break;
6720 case AArch64::FADDv8f16:
6721 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
6722 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
6724 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
6725 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
6726 break;
6727 case AArch64::FADDv2f32:
6728 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
6729 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
6731 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
6732 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
6733 break;
6734 case AArch64::FADDv2f64:
6735 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
6736 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
6738 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
6739 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
6740 break;
6741 case AArch64::FADDv4f32:
6742 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
6743 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
6745 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
6746 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
6747 break;
6748 case AArch64::FSUBHrr:
6749 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
6750 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
6751 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
6752 break;
6753 case AArch64::FSUBSrr:
6754 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
6756 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
6757 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
6759 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
6760 break;
6761 case AArch64::FSUBDrr:
6762 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
6764 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
6765 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
6767 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
6768 break;
6769 case AArch64::FSUBv4f16:
6770 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
6771 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
6773 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
6774 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
6775 break;
6776 case AArch64::FSUBv8f16:
6777 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
6778 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
6780 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
6781 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
6782 break;
6783 case AArch64::FSUBv2f32:
6784 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
6785 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
6787 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
6788 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
6789 break;
6790 case AArch64::FSUBv2f64:
6791 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
6792 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
6794 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
6795 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
6796 break;
6797 case AArch64::FSUBv4f32:
6798 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
6799 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
6801 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
6802 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
6803 break;
6804 }
6805 return Found;
6806 }
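/// Find FMUL instructions whose operand is fed (possibly through a no-op COPY)
/// by a DUP-by-lane, so the pair can later be rewritten as an indexed FMUL by
/// genIndexedMultiply().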
6808 static bool getFMULPatterns(MachineInstr &Root,
6809 SmallVectorImpl<unsigned> &Patterns) {
6810 MachineBasicBlock &MBB = *Root.getParent();
6811 bool Found = false;
6813 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
6814 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6815 MachineOperand &MO = Root.getOperand(Operand);
6816 MachineInstr *MI = nullptr;
6817 if (MO.isReg() && MO.getReg().isVirtual())
6818 MI = MRI.getUniqueVRegDef(MO.getReg());
6819 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
6820 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
6821 MI->getOperand(1).getReg().isVirtual())
6822 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
6823 if (MI && MI->getOpcode() == Opcode) {
6824 Patterns.push_back(Pattern);
6825 return true;
6827 return false;
6830 typedef AArch64MachineCombinerPattern MCP;
6832 switch (Root.getOpcode()) {
6833 default:
6834 return false;
6835 case AArch64::FMULv2f32:
6836 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
6837 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
6838 break;
6839 case AArch64::FMULv2f64:
6840 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
6841 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
6842 break;
6843 case AArch64::FMULv4f16:
6844 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
6845 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
6846 break;
6847 case AArch64::FMULv4f32:
6848 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
6849 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
6850 break;
6851 case AArch64::FMULv8f16:
6852 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
6853 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
6854 break;
6857 return Found;
6858 }
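/// Match an FNEG whose operand is defined by a scalar FMADD with no other
/// non-debug uses, where both instructions carry the contract and nsz
/// fast-math flags; such a pair can be rewritten as a single FNMADD
/// (see genFNegatedMAD).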
6860 static bool getFNEGPatterns(MachineInstr &Root,
6861 SmallVectorImpl<unsigned> &Patterns) {
6862 unsigned Opc = Root.getOpcode();
6863 MachineBasicBlock &MBB = *Root.getParent();
6864 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6866 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
6867 MachineOperand &MO = Root.getOperand(1);
6868 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
6869 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
6870 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
6871 Root.getFlag(MachineInstr::MIFlag::FmContract) &&
6872 Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
6873 MI->getFlag(MachineInstr::MIFlag::FmContract) &&
6874 MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
6875 Patterns.push_back(Pattern);
6876 return true;
6878 return false;
6881 switch (Opc) {
6882 default:
6883 break;
6884 case AArch64::FNEGDr:
6885 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
6886 case AArch64::FNEGSr:
6887 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
6890 return false;
6893 /// Return true when a code sequence can improve throughput. It
6894 /// should be called only for instructions in loops.
6895 /// \param Pattern - combiner pattern
6896 bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
6897 switch (Pattern) {
6898 default:
6899 break;
6900 case AArch64MachineCombinerPattern::FMULADDH_OP1:
6901 case AArch64MachineCombinerPattern::FMULADDH_OP2:
6902 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
6903 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
6904 case AArch64MachineCombinerPattern::FMULADDS_OP1:
6905 case AArch64MachineCombinerPattern::FMULADDS_OP2:
6906 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
6907 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
6908 case AArch64MachineCombinerPattern::FMULADDD_OP1:
6909 case AArch64MachineCombinerPattern::FMULADDD_OP2:
6910 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
6911 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
6912 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
6913 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
6914 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
6915 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
6916 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
6917 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
6918 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
6919 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
6920 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
6921 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
6922 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
6923 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
6924 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
6925 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
6926 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
6927 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
6928 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
6929 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
6930 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
6931 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
6932 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
6933 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
6934 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
6935 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
6936 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
6937 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
6938 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
6939 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
6940 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
6941 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
6942 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
6943 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
6944 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
6945 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
6946 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
6947 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
6948 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
6949 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
6950 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
6951 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
6952 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
6953 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
6954 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
6955 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
6956 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
6957 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
6958 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
6959 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
6960 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
6961 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
6962 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
6963 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
6964 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
6965 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
6966 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
6967 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
6968 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
6969 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
6970 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
6971 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
6972 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
6973 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
6974 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
6975 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
6976 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
6977 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
6978 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
6979 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
6980 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
6981 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
6982 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
6983 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
6984 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
6985 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
6986 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
6987 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
6988 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
6989 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
6990 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
6991 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
6992 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
6993 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
6994 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
6995 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
6996 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
6997 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
6998 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
6999 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7000 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7001 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7002 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7003 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7004 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7005 return true;
7006 } // end switch (Pattern)
7007 return false;
7010 /// Find other MI combine patterns.
7011 static bool getMiscPatterns(MachineInstr &Root,
7012 SmallVectorImpl<unsigned> &Patterns) {
7013 // A - (B + C) ==> (A - B) - C or (A - C) - B
7014 unsigned Opc = Root.getOpcode();
7015 MachineBasicBlock &MBB = *Root.getParent();
7017 switch (Opc) {
7018 case AArch64::SUBWrr:
7019 case AArch64::SUBSWrr:
7020 case AArch64::SUBXrr:
7021 case AArch64::SUBSXrr:
7022 // Found candidate root.
7023 break;
7024 default:
7025 return false;
7028 if (isCombineInstrSettingFlag(Opc) &&
7029 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
7031 return false;
7033 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
7034 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
7035 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
7036 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
7037 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);
7038 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);
7039 return true;
7042 return false;
7043 }
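/// The SUBADD rewrites do not change the instruction count, they only
/// re-associate the subtraction, so they are treated as profitable only when
/// they are expected to reduce the critical-path depth.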
7045 CombinerObjective
7046 AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
7047 switch (Pattern) {
7048 case AArch64MachineCombinerPattern::SUBADD_OP1:
7049 case AArch64MachineCombinerPattern::SUBADD_OP2:
7050 return CombinerObjective::MustReduceDepth;
7051 default:
7052 return TargetInstrInfo::getCombinerObjective(Pattern);
7056 /// Return true when there is potentially a faster code sequence for an
7057 /// instruction chain ending in \p Root. All potential patterns are listed in
7058 /// the \p Patterns vector. Patterns should be sorted in priority order since the
7059 /// pattern evaluator stops checking as soon as it finds a faster sequence.
7061 bool AArch64InstrInfo::getMachineCombinerPatterns(
7062 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
7063 bool DoRegPressureReduce) const {
7064 // Integer patterns
7065 if (getMaddPatterns(Root, Patterns))
7066 return true;
7067 // Floating point patterns
7068 if (getFMULPatterns(Root, Patterns))
7069 return true;
7070 if (getFMAPatterns(Root, Patterns))
7071 return true;
7072 if (getFNEGPatterns(Root, Patterns))
7073 return true;
7075 // Other patterns
7076 if (getMiscPatterns(Root, Patterns))
7077 return true;
7079 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
7080 DoRegPressureReduce);
7083 enum class FMAInstKind { Default, Indexed, Accumulator };
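// FMAInstKind selects the operand order used by genFusedMultiply below:
//   Default:     scalar (F)MADD form   - mul operands first, then the addend.
//   Indexed:     by-element FMLA/FMLS  - addend, mul operands, lane immediate.
//   Accumulator: vector MLA/MLS/FMLA   - addend first, then the mul operands.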
7084 /// genFusedMultiply - Generate fused multiply instructions.
7085 /// This function supports both integer and floating point instructions.
7086 /// A typical example:
7087 /// F|MUL I=A,B,0
7088 /// F|ADD R,I,C
7089 /// ==> F|MADD R,A,B,C
7090 /// \param MF Containing MachineFunction
7091 /// \param MRI Register information
7092 /// \param TII Target information
7093 /// \param Root is the F|ADD instruction
7094 /// \param [out] InsInstrs is a vector of machine instructions and will
7095 /// contain the generated madd instruction
7096 /// \param IdxMulOpd is the index of the operand in Root that is the result of
7097 /// the F|MUL. In the example above IdxMulOpd is 1.
7098 /// \param MaddOpc the opcode of the f|madd instruction
7099 /// \param RC Register class of operands
7100 /// \param kind the kind of F|MADD instruction (addressing mode) to be generated
7101 /// \param ReplacedAddend is the result register from the instruction
7102 /// replacing the non-combined operand, if any.
7103 static MachineInstr *
7104 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
7105 const TargetInstrInfo *TII, MachineInstr &Root,
7106 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
7107 unsigned MaddOpc, const TargetRegisterClass *RC,
7108 FMAInstKind kind = FMAInstKind::Default,
7109 const Register *ReplacedAddend = nullptr) {
7110 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
7112 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
7113 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
7114 Register ResultReg = Root.getOperand(0).getReg();
7115 Register SrcReg0 = MUL->getOperand(1).getReg();
7116 bool Src0IsKill = MUL->getOperand(1).isKill();
7117 Register SrcReg1 = MUL->getOperand(2).getReg();
7118 bool Src1IsKill = MUL->getOperand(2).isKill();
7120 Register SrcReg2;
7121 bool Src2IsKill;
7122 if (ReplacedAddend) {
7123 // If we just generated a new addend, this instruction must be its only use.
7124 SrcReg2 = *ReplacedAddend;
7125 Src2IsKill = true;
7126 } else {
7127 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
7128 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
7131 if (ResultReg.isVirtual())
7132 MRI.constrainRegClass(ResultReg, RC);
7133 if (SrcReg0.isVirtual())
7134 MRI.constrainRegClass(SrcReg0, RC);
7135 if (SrcReg1.isVirtual())
7136 MRI.constrainRegClass(SrcReg1, RC);
7137 if (SrcReg2.isVirtual())
7138 MRI.constrainRegClass(SrcReg2, RC);
7140 MachineInstrBuilder MIB;
7141 if (kind == FMAInstKind::Default)
7142 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
7143 .addReg(SrcReg0, getKillRegState(Src0IsKill))
7144 .addReg(SrcReg1, getKillRegState(Src1IsKill))
7145 .addReg(SrcReg2, getKillRegState(Src2IsKill));
7146 else if (kind == FMAInstKind::Indexed)
7147 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
7148 .addReg(SrcReg2, getKillRegState(Src2IsKill))
7149 .addReg(SrcReg0, getKillRegState(Src0IsKill))
7150 .addReg(SrcReg1, getKillRegState(Src1IsKill))
7151 .addImm(MUL->getOperand(3).getImm());
7152 else if (kind == FMAInstKind::Accumulator)
7153 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
7154 .addReg(SrcReg2, getKillRegState(Src2IsKill))
7155 .addReg(SrcReg0, getKillRegState(Src0IsKill))
7156 .addReg(SrcReg1, getKillRegState(Src1IsKill));
7157 else
7158 assert(false && "Invalid FMA instruction kind \n");
7159 // Insert the fused multiply (MADD, FMADD, FMSUB, FMLA, FMLS)
7160 InsInstrs.push_back(MIB);
7161 return MUL;
7162 }
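/// genFNegatedMAD - Fold an FNEG of a scalar FMADD into a single FNMADD
/// (FNMADDSrrr / FNMADDDrrr). This is only reached via the FNMADD pattern from
/// getFNEGPatterns(), which requires the contract and nsz fast-math flags on
/// both instructions.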
7164 static MachineInstr *
7165 genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
7166 const TargetInstrInfo *TII, MachineInstr &Root,
7167 SmallVectorImpl<MachineInstr *> &InsInstrs) {
7168 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
7170 unsigned Opc = 0;
7171 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
7172 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
7173 Opc = AArch64::FNMADDSrrr;
7174 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
7175 Opc = AArch64::FNMADDDrrr;
7176 else
7177 return nullptr;
7179 Register ResultReg = Root.getOperand(0).getReg();
7180 Register SrcReg0 = MAD->getOperand(1).getReg();
7181 Register SrcReg1 = MAD->getOperand(2).getReg();
7182 Register SrcReg2 = MAD->getOperand(3).getReg();
7183 bool Src0IsKill = MAD->getOperand(1).isKill();
7184 bool Src1IsKill = MAD->getOperand(2).isKill();
7185 bool Src2IsKill = MAD->getOperand(3).isKill();
7186 if (ResultReg.isVirtual())
7187 MRI.constrainRegClass(ResultReg, RC);
7188 if (SrcReg0.isVirtual())
7189 MRI.constrainRegClass(SrcReg0, RC);
7190 if (SrcReg1.isVirtual())
7191 MRI.constrainRegClass(SrcReg1, RC);
7192 if (SrcReg2.isVirtual())
7193 MRI.constrainRegClass(SrcReg2, RC);
7195 MachineInstrBuilder MIB =
7196 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
7197 .addReg(SrcReg0, getKillRegState(Src0IsKill))
7198 .addReg(SrcReg1, getKillRegState(Src1IsKill))
7199 .addReg(SrcReg2, getKillRegState(Src2IsKill));
7200 InsInstrs.push_back(MIB);
7202 return MAD;
7205 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
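/// For example (a sketch with illustrative virtual registers):
///   %2:fpr128 = DUPv4i32lane %1, 0
///   %3:fpr128 = FMULv4f32 %0, %2
/// becomes
///   %3:fpr128 = FMULv4i32_indexed %0, %1, 0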
7206 static MachineInstr *
7207 genIndexedMultiply(MachineInstr &Root,
7208 SmallVectorImpl<MachineInstr *> &InsInstrs,
7209 unsigned IdxDupOp, unsigned MulOpc,
7210 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
7211 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
7212 "Invalid index of FMUL operand");
7214 MachineFunction &MF = *Root.getMF();
7215 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
7217 MachineInstr *Dup =
7218 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
7220 if (Dup->getOpcode() == TargetOpcode::COPY)
7221 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
7223 Register DupSrcReg = Dup->getOperand(1).getReg();
7224 MRI.clearKillFlags(DupSrcReg);
7225 MRI.constrainRegClass(DupSrcReg, RC);
7227 unsigned DupSrcLane = Dup->getOperand(2).getImm();
7229 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
7230 MachineOperand &MulOp = Root.getOperand(IdxMulOp);
7232 Register ResultReg = Root.getOperand(0).getReg();
7234 MachineInstrBuilder MIB;
7235 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
7236 .add(MulOp)
7237 .addReg(DupSrcReg)
7238 .addImm(DupSrcLane);
7240 InsInstrs.push_back(MIB);
7241 return &Root;
7244 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
7245 /// instructions.
7247 /// \see genFusedMultiply
7248 static MachineInstr *genFusedMultiplyAcc(
7249 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
7250 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
7251 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
7252 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
7253 FMAInstKind::Accumulator);
7256 /// genNeg - Helper to generate an intermediate negation of the second operand
7257 /// of Root
7258 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
7259 const TargetInstrInfo *TII, MachineInstr &Root,
7260 SmallVectorImpl<MachineInstr *> &InsInstrs,
7261 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
7262 unsigned MnegOpc, const TargetRegisterClass *RC) {
7263 Register NewVR = MRI.createVirtualRegister(RC);
7264 MachineInstrBuilder MIB =
7265 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
7266 .add(Root.getOperand(2));
7267 InsInstrs.push_back(MIB);
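// Record that NewVR is defined by the first instruction added to InsInstrs
// (index 0) so the combiner can locate its defining instruction.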
7269 assert(InstrIdxForVirtReg.empty());
7270 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7272 return NewVR;
7275 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
7276 /// instructions with an additional negation of the accumulator
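/// For example, for MULSUBv8i8_OP1 (a sketch with illustrative registers):
///   %3:fpr64 = MULv8i8 %1, %2
///   %4:fpr64 = SUBv8i8 %3, %0
/// becomes
///   %5:fpr64 = NEGv8i8 %0
///   %4:fpr64 = MLAv8i8 %5, %1, %2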
7277 static MachineInstr *genFusedMultiplyAccNeg(
7278 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
7279 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
7280 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
7281 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
7282 assert(IdxMulOpd == 1);
7284 Register NewVR =
7285 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
7286 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
7287 FMAInstKind::Accumulator, &NewVR);
7290 /// genFusedMultiplyIdx - Helper to generate indexed (by-element) fused
7291 /// multiply-accumulate instructions.
7293 /// \see genFusedMultiply
7294 static MachineInstr *genFusedMultiplyIdx(
7295 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
7296 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
7297 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
7298 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
7299 FMAInstKind::Indexed);
7302 /// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply-accumulate
7303 /// instructions with an additional negation of the accumulator.
7304 static MachineInstr *genFusedMultiplyIdxNeg(
7305 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
7306 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
7307 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
7308 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
7309 assert(IdxMulOpd == 1);
7311 Register NewVR =
7312 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
7314 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
7315 FMAInstKind::Indexed, &NewVR);
7318 /// genMaddR - Generate madd instruction and combine mul and add using
7319 /// an extra virtual register
7320 /// Example - an ADD intermediate needs to be stored in a register:
7321 /// MUL I=A,B,0
7322 /// ADD R,I,Imm
7323 /// ==> ORR V, ZR, Imm
7324 /// ==> MADD R,A,B,V
7325 /// \param MF Containing MachineFunction
7326 /// \param MRI Register information
7327 /// \param TII Target information
7328 /// \param Root is the ADD instruction
7329 /// \param [out] InsInstrs is a vector of machine instructions and will
7330 /// contain the generated madd instruction
7331 /// \param IdxMulOpd is the index of the operand in Root that is the result of
7332 /// the MUL. In the example above IdxMulOpd is 1.
7333 /// \param MaddOpc the opcode of the madd instruction
7334 /// \param VR is a virtual register that holds the value of an ADD operand
7335 /// (V in the example above).
7336 /// \param RC Register class of operands
7337 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
7338 const TargetInstrInfo *TII, MachineInstr &Root,
7339 SmallVectorImpl<MachineInstr *> &InsInstrs,
7340 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
7341 const TargetRegisterClass *RC) {
7342 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
7344 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
7345 Register ResultReg = Root.getOperand(0).getReg();
7346 Register SrcReg0 = MUL->getOperand(1).getReg();
7347 bool Src0IsKill = MUL->getOperand(1).isKill();
7348 Register SrcReg1 = MUL->getOperand(2).getReg();
7349 bool Src1IsKill = MUL->getOperand(2).isKill();
7351 if (ResultReg.isVirtual())
7352 MRI.constrainRegClass(ResultReg, RC);
7353 if (SrcReg0.isVirtual())
7354 MRI.constrainRegClass(SrcReg0, RC);
7355 if (SrcReg1.isVirtual())
7356 MRI.constrainRegClass(SrcReg1, RC);
7357 if (Register::isVirtualRegister(VR))
7358 MRI.constrainRegClass(VR, RC);
7360 MachineInstrBuilder MIB =
7361 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
7362 .addReg(SrcReg0, getKillRegState(Src0IsKill))
7363 .addReg(SrcReg1, getKillRegState(Src1IsKill))
7364 .addReg(VR);
7365 // Insert the MADD
7366 InsInstrs.push_back(MIB);
7367 return MUL;
7370 /// Do the following transformation
7371 /// A - (B + C) ==> (A - B) - C
7372 /// A - (B + C) ==> (A - C) - B
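/// For example, with IdxOpd1 == 1 (illustrative virtual registers):
///   %3:gpr64 = ADDXrr %1, %2
///   %4:gpr64 = SUBXrr %0, %3
/// becomes
///   %5:gpr64 = SUBXrr %0, %1
///   %4:gpr64 = SUBXrr %5, %2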
7373 static void
7374 genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
7375 const TargetInstrInfo *TII, MachineInstr &Root,
7376 SmallVectorImpl<MachineInstr *> &InsInstrs,
7377 SmallVectorImpl<MachineInstr *> &DelInstrs,
7378 unsigned IdxOpd1,
7379 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
7380 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
7381 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
7382 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
7384 Register ResultReg = Root.getOperand(0).getReg();
7385 Register RegA = Root.getOperand(1).getReg();
7386 bool RegAIsKill = Root.getOperand(1).isKill();
7387 Register RegB = AddMI->getOperand(IdxOpd1).getReg();
7388 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
7389 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
7390 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
7391 Register NewVR =
7392 MRI.createVirtualRegister(MRI.getRegClass(Root.getOperand(2).getReg()));
7394 unsigned Opcode = Root.getOpcode();
7395 if (Opcode == AArch64::SUBSWrr)
7396 Opcode = AArch64::SUBWrr;
7397 else if (Opcode == AArch64::SUBSXrr)
7398 Opcode = AArch64::SUBXrr;
7399 else
7400 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
7401 "Unexpected instruction opcode.");
7403 uint32_t Flags = Root.mergeFlagsWith(*AddMI);
7404 Flags &= ~MachineInstr::NoSWrap;
7405 Flags &= ~MachineInstr::NoUWrap;
7407 MachineInstrBuilder MIB1 =
7408 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
7409 .addReg(RegA, getKillRegState(RegAIsKill))
7410 .addReg(RegB, getKillRegState(RegBIsKill))
7411 .setMIFlags(Flags);
7412 MachineInstrBuilder MIB2 =
7413 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
7414 .addReg(NewVR, getKillRegState(true))
7415 .addReg(RegC, getKillRegState(RegCIsKill))
7416 .setMIFlags(Flags);
7418 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7419 InsInstrs.push_back(MIB1);
7420 InsInstrs.push_back(MIB2);
7421 DelInstrs.push_back(AddMI);
7422 DelInstrs.push_back(&Root);
7425 /// When getMachineCombinerPatterns() finds potential patterns,
7426 /// this function generates the instructions that could replace the
7427 /// original code sequence
7428 void AArch64InstrInfo::genAlternativeCodeSequence(
7429 MachineInstr &Root, unsigned Pattern,
7430 SmallVectorImpl<MachineInstr *> &InsInstrs,
7431 SmallVectorImpl<MachineInstr *> &DelInstrs,
7432 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
7433 MachineBasicBlock &MBB = *Root.getParent();
7434 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7435 MachineFunction &MF = *MBB.getParent();
7436 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
7438 MachineInstr *MUL = nullptr;
7439 const TargetRegisterClass *RC;
7440 unsigned Opc;
7441 switch (Pattern) {
7442 default:
7443 // Reassociate instructions.
7444 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
7445 DelInstrs, InstrIdxForVirtReg);
7446 return;
7447 case AArch64MachineCombinerPattern::SUBADD_OP1:
7448 // A - (B + C)
7449 // ==> (A - B) - C
7450 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
7451 InstrIdxForVirtReg);
7452 return;
7453 case AArch64MachineCombinerPattern::SUBADD_OP2:
7454 // A - (B + C)
7455 // ==> (A - C) - B
7456 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
7457 InstrIdxForVirtReg);
7458 return;
7459 case AArch64MachineCombinerPattern::MULADDW_OP1:
7460 case AArch64MachineCombinerPattern::MULADDX_OP1:
7461 // MUL I=A,B,0
7462 // ADD R,I,C
7463 // ==> MADD R,A,B,C
7464 // --- Create(MADD);
7465 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
7466 Opc = AArch64::MADDWrrr;
7467 RC = &AArch64::GPR32RegClass;
7468 } else {
7469 Opc = AArch64::MADDXrrr;
7470 RC = &AArch64::GPR64RegClass;
7472 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7473 break;
7474 case AArch64MachineCombinerPattern::MULADDW_OP2:
7475 case AArch64MachineCombinerPattern::MULADDX_OP2:
7476 // MUL I=A,B,0
7477 // ADD R,C,I
7478 // ==> MADD R,A,B,C
7479 // --- Create(MADD);
7480 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
7481 Opc = AArch64::MADDWrrr;
7482 RC = &AArch64::GPR32RegClass;
7483 } else {
7484 Opc = AArch64::MADDXrrr;
7485 RC = &AArch64::GPR64RegClass;
7487 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7488 break;
7489 case AArch64MachineCombinerPattern::MULADDWI_OP1:
7490 case AArch64MachineCombinerPattern::MULADDXI_OP1: {
7491 // MUL I=A,B,0
7492 // ADD R,I,Imm
7493 // ==> MOV V, Imm
7494 // ==> MADD R,A,B,V
7495 // --- Create(MADD);
7496 const TargetRegisterClass *OrrRC;
7497 unsigned BitSize, OrrOpc, ZeroReg;
7498 if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1) {
7499 OrrOpc = AArch64::ORRWri;
7500 OrrRC = &AArch64::GPR32spRegClass;
7501 BitSize = 32;
7502 ZeroReg = AArch64::WZR;
7503 Opc = AArch64::MADDWrrr;
7504 RC = &AArch64::GPR32RegClass;
7505 } else {
7506 OrrOpc = AArch64::ORRXri;
7507 OrrRC = &AArch64::GPR64spRegClass;
7508 BitSize = 64;
7509 ZeroReg = AArch64::XZR;
7510 Opc = AArch64::MADDXrrr;
7511 RC = &AArch64::GPR64RegClass;
7513 Register NewVR = MRI.createVirtualRegister(OrrRC);
7514 uint64_t Imm = Root.getOperand(2).getImm();
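// An ADD-immediate may carry an optional left shift on the immediate (e.g.
// LSL #12); fold the shift in before trying to materialize the value.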
7516 if (Root.getOperand(3).isImm()) {
7517 unsigned Val = Root.getOperand(3).getImm();
7518 Imm = Imm << Val;
7520 uint64_t UImm = SignExtend64(Imm, BitSize);
7521 // Only handle immediates that can be composed via a single instruction.
7522 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7523 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7524 if (Insn.size() != 1)
7525 return;
7526 auto MovI = Insn.begin();
7527 MachineInstrBuilder MIB1;
7528 // MOV is an alias for one of three instructions: movz, movn, and orr.
7529 if (MovI->Opcode == OrrOpc)
7530 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7531 .addReg(ZeroReg)
7532 .addImm(MovI->Op2);
7533 else {
7534 if (BitSize == 32)
7535 assert((MovI->Opcode == AArch64::MOVNWi ||
7536 MovI->Opcode == AArch64::MOVZWi) &&
7537 "Expected opcode");
7538 else
7539 assert((MovI->Opcode == AArch64::MOVNXi ||
7540 MovI->Opcode == AArch64::MOVZXi) &&
7541 "Expected opcode");
7542 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7543 .addImm(MovI->Op1)
7544 .addImm(MovI->Op2);
7546 InsInstrs.push_back(MIB1);
7547 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7548 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7549 break;
7551 case AArch64MachineCombinerPattern::MULSUBW_OP1:
7552 case AArch64MachineCombinerPattern::MULSUBX_OP1: {
7553 // MUL I=A,B,0
7554 // SUB R,I, C
7555 // ==> SUB V, 0, C
7556 // ==> MADD R,A,B,V // = -C + A*B
7557 // --- Create(MADD);
7558 const TargetRegisterClass *SubRC;
7559 unsigned SubOpc, ZeroReg;
7560 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
7561 SubOpc = AArch64::SUBWrr;
7562 SubRC = &AArch64::GPR32spRegClass;
7563 ZeroReg = AArch64::WZR;
7564 Opc = AArch64::MADDWrrr;
7565 RC = &AArch64::GPR32RegClass;
7566 } else {
7567 SubOpc = AArch64::SUBXrr;
7568 SubRC = &AArch64::GPR64spRegClass;
7569 ZeroReg = AArch64::XZR;
7570 Opc = AArch64::MADDXrrr;
7571 RC = &AArch64::GPR64RegClass;
7573 Register NewVR = MRI.createVirtualRegister(SubRC);
7574 // SUB NewVR, 0, C
7575 MachineInstrBuilder MIB1 =
7576 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
7577 .addReg(ZeroReg)
7578 .add(Root.getOperand(2));
7579 InsInstrs.push_back(MIB1);
7580 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7581 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7582 break;
7584 case AArch64MachineCombinerPattern::MULSUBW_OP2:
7585 case AArch64MachineCombinerPattern::MULSUBX_OP2:
7586 // MUL I=A,B,0
7587 // SUB R,C,I
7588 // ==> MSUB R,A,B,C (computes C - A*B)
7589 // --- Create(MSUB);
7590 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
7591 Opc = AArch64::MSUBWrrr;
7592 RC = &AArch64::GPR32RegClass;
7593 } else {
7594 Opc = AArch64::MSUBXrrr;
7595 RC = &AArch64::GPR64RegClass;
7597 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7598 break;
7599 case AArch64MachineCombinerPattern::MULSUBWI_OP1:
7600 case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
7601 // MUL I=A,B,0
7602 // SUB R,I, Imm
7603 // ==> MOV V, -Imm
7604 // ==> MADD R,A,B,V // = -Imm + A*B
7605 // --- Create(MADD);
7606 const TargetRegisterClass *OrrRC;
7607 unsigned BitSize, OrrOpc, ZeroReg;
7608 if (Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
7609 OrrOpc = AArch64::ORRWri;
7610 OrrRC = &AArch64::GPR32spRegClass;
7611 BitSize = 32;
7612 ZeroReg = AArch64::WZR;
7613 Opc = AArch64::MADDWrrr;
7614 RC = &AArch64::GPR32RegClass;
7615 } else {
7616 OrrOpc = AArch64::ORRXri;
7617 OrrRC = &AArch64::GPR64spRegClass;
7618 BitSize = 64;
7619 ZeroReg = AArch64::XZR;
7620 Opc = AArch64::MADDXrrr;
7621 RC = &AArch64::GPR64RegClass;
7623 Register NewVR = MRI.createVirtualRegister(OrrRC);
7624 uint64_t Imm = Root.getOperand(2).getImm();
7625 if (Root.getOperand(3).isImm()) {
7626 unsigned Val = Root.getOperand(3).getImm();
7627 Imm = Imm << Val;
7629 uint64_t UImm = SignExtend64(-Imm, BitSize);
7630 // Only handle immediates that can be composed via a single instruction.
7631 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7632 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7633 if (Insn.size() != 1)
7634 return;
7635 auto MovI = Insn.begin();
7636 MachineInstrBuilder MIB1;
7637 // MOV is an alias for one of three instructions: movz, movn, and orr.
7638 if (MovI->Opcode == OrrOpc)
7639 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7640 .addReg(ZeroReg)
7641 .addImm(MovI->Op2);
7642 else {
7643 if (BitSize == 32)
7644 assert((MovI->Opcode == AArch64::MOVNWi ||
7645 MovI->Opcode == AArch64::MOVZWi) &&
7646 "Expected opcode");
7647 else
7648 assert((MovI->Opcode == AArch64::MOVNXi ||
7649 MovI->Opcode == AArch64::MOVZXi) &&
7650 "Expected opcode");
7651 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7652 .addImm(MovI->Op1)
7653 .addImm(MovI->Op2);
7655 InsInstrs.push_back(MIB1);
7656 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7657 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7658 break;
7661 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
7662 Opc = AArch64::MLAv8i8;
7663 RC = &AArch64::FPR64RegClass;
7664 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7665 break;
7666 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
7667 Opc = AArch64::MLAv8i8;
7668 RC = &AArch64::FPR64RegClass;
7669 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7670 break;
7671 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
7672 Opc = AArch64::MLAv16i8;
7673 RC = &AArch64::FPR128RegClass;
7674 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7675 break;
7676 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
7677 Opc = AArch64::MLAv16i8;
7678 RC = &AArch64::FPR128RegClass;
7679 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7680 break;
7681 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
7682 Opc = AArch64::MLAv4i16;
7683 RC = &AArch64::FPR64RegClass;
7684 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7685 break;
7686 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
7687 Opc = AArch64::MLAv4i16;
7688 RC = &AArch64::FPR64RegClass;
7689 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7690 break;
7691 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
7692 Opc = AArch64::MLAv8i16;
7693 RC = &AArch64::FPR128RegClass;
7694 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7695 break;
7696 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
7697 Opc = AArch64::MLAv8i16;
7698 RC = &AArch64::FPR128RegClass;
7699 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7700 break;
7701 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
7702 Opc = AArch64::MLAv2i32;
7703 RC = &AArch64::FPR64RegClass;
7704 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7705 break;
7706 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
7707 Opc = AArch64::MLAv2i32;
7708 RC = &AArch64::FPR64RegClass;
7709 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7710 break;
7711 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
7712 Opc = AArch64::MLAv4i32;
7713 RC = &AArch64::FPR128RegClass;
7714 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7715 break;
7716 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
7717 Opc = AArch64::MLAv4i32;
7718 RC = &AArch64::FPR128RegClass;
7719 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7720 break;
7722 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
7723 Opc = AArch64::MLAv8i8;
7724 RC = &AArch64::FPR64RegClass;
7725 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7726 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
7727 RC);
7728 break;
7729 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
7730 Opc = AArch64::MLSv8i8;
7731 RC = &AArch64::FPR64RegClass;
7732 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7733 break;
7734 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
7735 Opc = AArch64::MLAv16i8;
7736 RC = &AArch64::FPR128RegClass;
7737 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7738 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
7739 RC);
7740 break;
7741 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
7742 Opc = AArch64::MLSv16i8;
7743 RC = &AArch64::FPR128RegClass;
7744 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7745 break;
7746 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
7747 Opc = AArch64::MLAv4i16;
7748 RC = &AArch64::FPR64RegClass;
7749 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7750 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7751 RC);
7752 break;
7753 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
7754 Opc = AArch64::MLSv4i16;
7755 RC = &AArch64::FPR64RegClass;
7756 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7757 break;
7758 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
7759 Opc = AArch64::MLAv8i16;
7760 RC = &AArch64::FPR128RegClass;
7761 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7762 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7763 RC);
7764 break;
7765 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
7766 Opc = AArch64::MLSv8i16;
7767 RC = &AArch64::FPR128RegClass;
7768 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7769 break;
7770 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
7771 Opc = AArch64::MLAv2i32;
7772 RC = &AArch64::FPR64RegClass;
7773 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7774 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7775 RC);
7776 break;
7777 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
7778 Opc = AArch64::MLSv2i32;
7779 RC = &AArch64::FPR64RegClass;
7780 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7781 break;
7782 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
7783 Opc = AArch64::MLAv4i32;
7784 RC = &AArch64::FPR128RegClass;
7785 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7786 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7787 RC);
7788 break;
7789 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
7790 Opc = AArch64::MLSv4i32;
7791 RC = &AArch64::FPR128RegClass;
7792 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7793 break;
7795 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
7796 Opc = AArch64::MLAv4i16_indexed;
7797 RC = &AArch64::FPR64RegClass;
7798 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7799 break;
7800 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
7801 Opc = AArch64::MLAv4i16_indexed;
7802 RC = &AArch64::FPR64RegClass;
7803 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7804 break;
7805 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
7806 Opc = AArch64::MLAv8i16_indexed;
7807 RC = &AArch64::FPR128RegClass;
7808 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7809 break;
7810 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
7811 Opc = AArch64::MLAv8i16_indexed;
7812 RC = &AArch64::FPR128RegClass;
7813 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7814 break;
7815 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
7816 Opc = AArch64::MLAv2i32_indexed;
7817 RC = &AArch64::FPR64RegClass;
7818 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7819 break;
7820 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
7821 Opc = AArch64::MLAv2i32_indexed;
7822 RC = &AArch64::FPR64RegClass;
7823 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7824 break;
7825 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
7826 Opc = AArch64::MLAv4i32_indexed;
7827 RC = &AArch64::FPR128RegClass;
7828 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7829 break;
7830 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
7831 Opc = AArch64::MLAv4i32_indexed;
7832 RC = &AArch64::FPR128RegClass;
7833 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7834 break;
7836 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
7837 Opc = AArch64::MLAv4i16_indexed;
7838 RC = &AArch64::FPR64RegClass;
7839 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7840 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7841 RC);
7842 break;
7843 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
7844 Opc = AArch64::MLSv4i16_indexed;
7845 RC = &AArch64::FPR64RegClass;
7846 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7847 break;
7848 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7849 Opc = AArch64::MLAv8i16_indexed;
7850 RC = &AArch64::FPR128RegClass;
7851 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7852 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7853 RC);
7854 break;
7855 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7856 Opc = AArch64::MLSv8i16_indexed;
7857 RC = &AArch64::FPR128RegClass;
7858 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7859 break;
7860 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7861 Opc = AArch64::MLAv2i32_indexed;
7862 RC = &AArch64::FPR64RegClass;
7863 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7864 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7865 RC);
7866 break;
7867 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7868 Opc = AArch64::MLSv2i32_indexed;
7869 RC = &AArch64::FPR64RegClass;
7870 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7871 break;
7872 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7873 Opc = AArch64::MLAv4i32_indexed;
7874 RC = &AArch64::FPR128RegClass;
7875 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7876 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7877 RC);
7878 break;
7879 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7880 Opc = AArch64::MLSv4i32_indexed;
7881 RC = &AArch64::FPR128RegClass;
7882 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7883 break;
7885 // Floating Point Support
7886 case AArch64MachineCombinerPattern::FMULADDH_OP1:
7887 Opc = AArch64::FMADDHrrr;
7888 RC = &AArch64::FPR16RegClass;
7889 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7890 break;
7891 case AArch64MachineCombinerPattern::FMULADDS_OP1:
7892 Opc = AArch64::FMADDSrrr;
7893 RC = &AArch64::FPR32RegClass;
7894 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7895 break;
7896 case AArch64MachineCombinerPattern::FMULADDD_OP1:
7897 Opc = AArch64::FMADDDrrr;
7898 RC = &AArch64::FPR64RegClass;
7899 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7900 break;
7902 case AArch64MachineCombinerPattern::FMULADDH_OP2:
7903 Opc = AArch64::FMADDHrrr;
7904 RC = &AArch64::FPR16RegClass;
7905 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7906 break;
7907 case AArch64MachineCombinerPattern::FMULADDS_OP2:
7908 Opc = AArch64::FMADDSrrr;
7909 RC = &AArch64::FPR32RegClass;
7910 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7911 break;
7912 case AArch64MachineCombinerPattern::FMULADDD_OP2:
7913 Opc = AArch64::FMADDDrrr;
7914 RC = &AArch64::FPR64RegClass;
7915 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7916 break;
7918 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
7919 Opc = AArch64::FMLAv1i32_indexed;
7920 RC = &AArch64::FPR32RegClass;
7921 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7922 FMAInstKind::Indexed);
7923 break;
7924 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
7925 Opc = AArch64::FMLAv1i32_indexed;
7926 RC = &AArch64::FPR32RegClass;
7927 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7928 FMAInstKind::Indexed);
7929 break;
7931 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
7932 Opc = AArch64::FMLAv1i64_indexed;
7933 RC = &AArch64::FPR64RegClass;
7934 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7935 FMAInstKind::Indexed);
7936 break;
7937 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
7938 Opc = AArch64::FMLAv1i64_indexed;
7939 RC = &AArch64::FPR64RegClass;
7940 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7941 FMAInstKind::Indexed);
7942 break;
7944 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
7945 RC = &AArch64::FPR64RegClass;
7946 Opc = AArch64::FMLAv4i16_indexed;
7947 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7948 FMAInstKind::Indexed);
7949 break;
7950 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
7951 RC = &AArch64::FPR64RegClass;
7952 Opc = AArch64::FMLAv4f16;
7953 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7954 FMAInstKind::Accumulator);
7955 break;
7956 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
7957 RC = &AArch64::FPR64RegClass;
7958 Opc = AArch64::FMLAv4i16_indexed;
7959 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7960 FMAInstKind::Indexed);
7961 break;
7962 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
7963 RC = &AArch64::FPR64RegClass;
7964 Opc = AArch64::FMLAv4f16;
7965 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7966 FMAInstKind::Accumulator);
7967 break;
7969 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
7970 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
7971 RC = &AArch64::FPR64RegClass;
7972 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
7973 Opc = AArch64::FMLAv2i32_indexed;
7974 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7975 FMAInstKind::Indexed);
7976 } else {
7977 Opc = AArch64::FMLAv2f32;
7978 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7979 FMAInstKind::Accumulator);
7981 break;
7982 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
7983 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
7984 RC = &AArch64::FPR64RegClass;
7985 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
7986 Opc = AArch64::FMLAv2i32_indexed;
7987 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7988 FMAInstKind::Indexed);
7989 } else {
7990 Opc = AArch64::FMLAv2f32;
7991 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7992 FMAInstKind::Accumulator);
7994 break;
7996 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
7997 RC = &AArch64::FPR128RegClass;
7998 Opc = AArch64::FMLAv8i16_indexed;
7999 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8000 FMAInstKind::Indexed);
8001 break;
8002 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
8003 RC = &AArch64::FPR128RegClass;
8004 Opc = AArch64::FMLAv8f16;
8005 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8006 FMAInstKind::Accumulator);
8007 break;
8008 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
8009 RC = &AArch64::FPR128RegClass;
8010 Opc = AArch64::FMLAv8i16_indexed;
8011 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8012 FMAInstKind::Indexed);
8013 break;
8014 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
8015 RC = &AArch64::FPR128RegClass;
8016 Opc = AArch64::FMLAv8f16;
8017 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8018 FMAInstKind::Accumulator);
8019 break;
8021 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
8022 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
8023 RC = &AArch64::FPR128RegClass;
8024 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
8025 Opc = AArch64::FMLAv2i64_indexed;
8026 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8027 FMAInstKind::Indexed);
8028 } else {
8029 Opc = AArch64::FMLAv2f64;
8030 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8031 FMAInstKind::Accumulator);
8033 break;
8034 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
8035 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
8036 RC = &AArch64::FPR128RegClass;
8037 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
8038 Opc = AArch64::FMLAv2i64_indexed;
8039 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8040 FMAInstKind::Indexed);
8041 } else {
8042 Opc = AArch64::FMLAv2f64;
8043 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8044 FMAInstKind::Accumulator);
8046 break;
8048 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
8049 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
8050 RC = &AArch64::FPR128RegClass;
8051 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
8052 Opc = AArch64::FMLAv4i32_indexed;
8053 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8054 FMAInstKind::Indexed);
8055 } else {
8056 Opc = AArch64::FMLAv4f32;
8057 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8058 FMAInstKind::Accumulator);
8060 break;
8062 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
8063 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
8064 RC = &AArch64::FPR128RegClass;
8065 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
8066 Opc = AArch64::FMLAv4i32_indexed;
8067 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8068 FMAInstKind::Indexed);
8069 } else {
8070 Opc = AArch64::FMLAv4f32;
8071 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8072 FMAInstKind::Accumulator);
8074 break;
8076 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
8077 Opc = AArch64::FNMSUBHrrr;
8078 RC = &AArch64::FPR16RegClass;
8079 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8080 break;
8081 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
8082 Opc = AArch64::FNMSUBSrrr;
8083 RC = &AArch64::FPR32RegClass;
8084 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8085 break;
8086 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
8087 Opc = AArch64::FNMSUBDrrr;
8088 RC = &AArch64::FPR64RegClass;
8089 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8090 break;
8092 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
8093 Opc = AArch64::FNMADDHrrr;
8094 RC = &AArch64::FPR16RegClass;
8095 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8096 break;
8097 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
8098 Opc = AArch64::FNMADDSrrr;
8099 RC = &AArch64::FPR32RegClass;
8100 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8101 break;
8102 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
8103 Opc = AArch64::FNMADDDrrr;
8104 RC = &AArch64::FPR64RegClass;
8105 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8106 break;
8108 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
8109 Opc = AArch64::FMSUBHrrr;
8110 RC = &AArch64::FPR16RegClass;
8111 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8112 break;
8113 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
8114 Opc = AArch64::FMSUBSrrr;
8115 RC = &AArch64::FPR32RegClass;
8116 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8117 break;
8118 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
8119 Opc = AArch64::FMSUBDrrr;
8120 RC = &AArch64::FPR64RegClass;
8121 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8122 break;
8124 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
8125 Opc = AArch64::FMLSv1i32_indexed;
8126 RC = &AArch64::FPR32RegClass;
8127 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8128 FMAInstKind::Indexed);
8129 break;
8131 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
8132 Opc = AArch64::FMLSv1i64_indexed;
8133 RC = &AArch64::FPR64RegClass;
8134 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8135 FMAInstKind::Indexed);
8136 break;
8138 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
8139 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
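// OP1: the FMUL feeds the first FSUB operand, i.e. a*b - c. Negate the
// addend c and emit an FMLA so the result becomes (-c) + a*b.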
8140 RC = &AArch64::FPR64RegClass;
8141 Register NewVR = MRI.createVirtualRegister(RC);
8142 MachineInstrBuilder MIB1 =
8143 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
8144 .add(Root.getOperand(2));
8145 InsInstrs.push_back(MIB1);
8146 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8147 if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
8148 Opc = AArch64::FMLAv4f16;
8149 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8150 FMAInstKind::Accumulator, &NewVR);
8151 } else {
8152 Opc = AArch64::FMLAv4i16_indexed;
8153 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8154 FMAInstKind::Indexed, &NewVR);
8156 break;
8158 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
8159 RC = &AArch64::FPR64RegClass;
8160 Opc = AArch64::FMLSv4f16;
8161 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8162 FMAInstKind::Accumulator);
8163 break;
8164 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
8165 RC = &AArch64::FPR64RegClass;
8166 Opc = AArch64::FMLSv4i16_indexed;
8167 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8168 FMAInstKind::Indexed);
8169 break;
8171 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
8172 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
8173 RC = &AArch64::FPR64RegClass;
8174 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
8175 Opc = AArch64::FMLSv2i32_indexed;
8176 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8177 FMAInstKind::Indexed);
8178 } else {
8179 Opc = AArch64::FMLSv2f32;
8180 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8181 FMAInstKind::Accumulator);
8183 break;
8185 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
8186 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
8187 RC = &AArch64::FPR128RegClass;
8188 Register NewVR = MRI.createVirtualRegister(RC);
8189 MachineInstrBuilder MIB1 =
8190 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
8191 .add(Root.getOperand(2));
8192 InsInstrs.push_back(MIB1);
8193 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8194 if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
8195 Opc = AArch64::FMLAv8f16;
8196 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8197 FMAInstKind::Accumulator, &NewVR);
8198 } else {
8199 Opc = AArch64::FMLAv8i16_indexed;
8200 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8201 FMAInstKind::Indexed, &NewVR);
8203 break;
8205 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
8206 RC = &AArch64::FPR128RegClass;
8207 Opc = AArch64::FMLSv8f16;
8208 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8209 FMAInstKind::Accumulator);
8210 break;
8211 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
8212 RC = &AArch64::FPR128RegClass;
8213 Opc = AArch64::FMLSv8i16_indexed;
8214 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8215 FMAInstKind::Indexed);
8216 break;
8218 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
8219 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
8220 RC = &AArch64::FPR128RegClass;
8221 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
8222 Opc = AArch64::FMLSv2i64_indexed;
8223 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8224 FMAInstKind::Indexed);
8225 } else {
8226 Opc = AArch64::FMLSv2f64;
8227 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8228 FMAInstKind::Accumulator);
8230 break;
8232 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
8233 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
8234 RC = &AArch64::FPR128RegClass;
8235 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
8236 Opc = AArch64::FMLSv4i32_indexed;
8237 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8238 FMAInstKind::Indexed);
8239 } else {
8240 Opc = AArch64::FMLSv4f32;
8241 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8242 FMAInstKind::Accumulator);
8244 break;
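// Editorial note on the *_OP1 cases below (a hedged sketch, not from the
// original source): these patterns match a subtraction whose first operand is
// the multiply, i.e. roughly
//   %mul = fmul %a, %b
//   %res = fsub %mul, %c
// Since a*b - c == a*b + (-c), the combiner first emits an FNEG of %c into
// NewVR and then an FMLA that accumulates onto NewVR.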
8245 case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
8246 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
8247 RC = &AArch64::FPR64RegClass;
8248 Register NewVR = MRI.createVirtualRegister(RC);
8249 MachineInstrBuilder MIB1 =
8250 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
8251 .add(Root.getOperand(2));
8252 InsInstrs.push_back(MIB1);
8253 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8254 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
8255 Opc = AArch64::FMLAv2i32_indexed;
8256 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8257 FMAInstKind::Indexed, &NewVR);
8258 } else {
8259 Opc = AArch64::FMLAv2f32;
8260 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8261 FMAInstKind::Accumulator, &NewVR);
8263 break;
8265 case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
8266 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
8267 RC = &AArch64::FPR128RegClass;
8268 Register NewVR = MRI.createVirtualRegister(RC);
8269 MachineInstrBuilder MIB1 =
8270 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
8271 .add(Root.getOperand(2));
8272 InsInstrs.push_back(MIB1);
8273 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8274 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
8275 Opc = AArch64::FMLAv4i32_indexed;
8276 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8277 FMAInstKind::Indexed, &NewVR);
8278 } else {
8279 Opc = AArch64::FMLAv4f32;
8280 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8281 FMAInstKind::Accumulator, &NewVR);
8283 break;
8285 case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
8286 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
8287 RC = &AArch64::FPR128RegClass;
8288 Register NewVR = MRI.createVirtualRegister(RC);
8289 MachineInstrBuilder MIB1 =
8290 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
8291 .add(Root.getOperand(2));
8292 InsInstrs.push_back(MIB1);
8293 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8294 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
8295 Opc = AArch64::FMLAv2i64_indexed;
8296 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8297 FMAInstKind::Indexed, &NewVR);
8298 } else {
8299 Opc = AArch64::FMLAv2f64;
8300 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8301 FMAInstKind::Accumulator, &NewVR);
8303 break;
8305 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
8306 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
8307 unsigned IdxDupOp =
8308 (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
8309 : 2;
8310 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
8311 &AArch64::FPR128RegClass, MRI);
8312 break;
8314 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
8315 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
8316 unsigned IdxDupOp =
8317 (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
8318 : 2;
8319 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
8320 &AArch64::FPR128RegClass, MRI);
8321 break;
8323 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
8324 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
8325 unsigned IdxDupOp =
8326 (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
8327 : 2;
8328 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
8329 &AArch64::FPR128_loRegClass, MRI);
8330 break;
8332 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
8333 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
8334 unsigned IdxDupOp =
8335 (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
8336 : 2;
8337 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
8338 &AArch64::FPR128RegClass, MRI);
8339 break;
8341 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
8342 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
8343 unsigned IdxDupOp =
8344 (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
8345 : 2;
8346 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
8347 &AArch64::FPR128_loRegClass, MRI);
8348 break;
8350 case AArch64MachineCombinerPattern::FNMADD: {
8351 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
8352 break;
8355 } // end switch (Pattern)
8356 // Record MUL and ADD/SUB for deletion
8357 if (MUL)
8358 DelInstrs.push_back(MUL);
8359 DelInstrs.push_back(&Root);
8361 // Set the flags on the inserted instructions to be the merged flags of the
8362 // instructions that we have combined.
8363 uint32_t Flags = Root.getFlags();
8364 if (MUL)
8365 Flags = Root.mergeFlagsWith(*MUL);
8366 for (auto *MI : InsInstrs)
8367 MI->setFlags(Flags);
8370 /// Replace a csinc-branch sequence with a simple conditional branch
8372 /// Examples:
8373 /// 1. \code
8374 /// csinc w9, wzr, wzr, <condition code>
8375 /// tbnz w9, #0, 0x44
8376 /// \endcode
8377 /// to
8378 /// \code
8379 /// b.<inverted condition code>
8380 /// \endcode
8382 /// 2. \code
8383 /// csinc w9, wzr, wzr, <condition code>
8384 /// tbz w9, #0, 0x44
8385 /// \endcode
8386 /// to
8387 /// \code
8388 /// b.<condition code>
8389 /// \endcode
8391 /// Replace a compare-and-branch sequence with a TBZ/TBNZ instruction when the
8392 /// compare's constant operand is a power of 2.
8394 /// Examples:
8395 /// \code
8396 /// and w8, w8, #0x400
8397 /// cbnz w8, L1
8398 /// \endcode
8399 /// to
8400 /// \code
8401 /// tbnz w8, #10, L1
8402 /// \endcode
8404 /// \param MI Conditional Branch
8405 /// \return True when the simple conditional branch is generated
8407 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
8408 bool IsNegativeBranch = false;
8409 bool IsTestAndBranch = false;
8410 unsigned TargetBBInMI = 0;
8411 switch (MI.getOpcode()) {
8412 default:
8413 llvm_unreachable("Unknown branch instruction?");
8414 case AArch64::Bcc:
8415 return false;
8416 case AArch64::CBZW:
8417 case AArch64::CBZX:
8418 TargetBBInMI = 1;
8419 break;
8420 case AArch64::CBNZW:
8421 case AArch64::CBNZX:
8422 TargetBBInMI = 1;
8423 IsNegativeBranch = true;
8424 break;
8425 case AArch64::TBZW:
8426 case AArch64::TBZX:
8427 TargetBBInMI = 2;
8428 IsTestAndBranch = true;
8429 break;
8430 case AArch64::TBNZW:
8431 case AArch64::TBNZX:
8432 TargetBBInMI = 2;
8433 IsNegativeBranch = true;
8434 IsTestAndBranch = true;
8435 break;
8437 // So we increment a zero register and test for bits other
8438 // than bit 0? Conservatively bail out in case the verifier
8439 // missed this case.
8440 if (IsTestAndBranch && MI.getOperand(1).getImm())
8441 return false;
8443 // Find Definition.
8444 assert(MI.getParent() && "Incomplete machine instruction\n");
8445 MachineBasicBlock *MBB = MI.getParent();
8446 MachineFunction *MF = MBB->getParent();
8447 MachineRegisterInfo *MRI = &MF->getRegInfo();
8448 Register VReg = MI.getOperand(0).getReg();
8449 if (!VReg.isVirtual())
8450 return false;
8452 MachineInstr *DefMI = MRI->getVRegDef(VReg);
8454 // Look through COPY instructions to find definition.
8455 while (DefMI->isCopy()) {
8456 Register CopyVReg = DefMI->getOperand(1).getReg();
8457 if (!MRI->hasOneNonDBGUse(CopyVReg))
8458 return false;
8459 if (!MRI->hasOneDef(CopyVReg))
8460 return false;
8461 DefMI = MRI->getVRegDef(CopyVReg);
8464 switch (DefMI->getOpcode()) {
8465 default:
8466 return false;
8467 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
8468 case AArch64::ANDWri:
8469 case AArch64::ANDXri: {
8470 if (IsTestAndBranch)
8471 return false;
8472 if (DefMI->getParent() != MBB)
8473 return false;
8474 if (!MRI->hasOneNonDBGUse(VReg))
8475 return false;
8477 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
8478 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
8479 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
8480 if (!isPowerOf2_64(Mask))
8481 return false;
8483 MachineOperand &MO = DefMI->getOperand(1);
8484 Register NewReg = MO.getReg();
8485 if (!NewReg.isVirtual())
8486 return false;
8488 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
8490 MachineBasicBlock &RefToMBB = *MBB;
8491 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
8492 DebugLoc DL = MI.getDebugLoc();
8493 unsigned Imm = Log2_64(Mask);
8494 unsigned Opc = (Imm < 32)
8495 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
8496 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
8497 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
8498 .addReg(NewReg)
8499 .addImm(Imm)
8500 .addMBB(TBB);
8501 // Register lives on to the TBZ/TBNZ now.
8502 MO.setIsKill(false);
8504 // For bit indices smaller than 32, we need to use the 32-bit
8505 // variant (W) in all cases, since the 64-bit variant cannot
8506 // encode them.
8507 // Therefore, if the input register is 64-bit, we need to take
8508 // its 32-bit sub-register.
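// Illustration with assumed register names (an editorial sketch, not from
// the original source):
//   and  x8, x9, #0x400      ; Mask = 0x400, Imm = Log2_64(Mask) = 10
//   cbnz x8, L1
// is folded to
//   tbnz w9, #10, L1         ; Imm < 32, so the W variant tests the sub_32
//                            ; sub-register of x9
// whereas a mask with only bit 40 set would instead produce tbnz x9, #40, L1.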
8509 if (!Is32Bit && Imm < 32)
8510 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
8511 MI.eraseFromParent();
8512 return true;
8514 // Look for CSINC
8515 case AArch64::CSINCWr:
8516 case AArch64::CSINCXr: {
8517 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
8518 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
8519 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
8520 DefMI->getOperand(2).getReg() == AArch64::XZR))
8521 return false;
8523 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
8524 true) != -1)
8525 return false;
8527 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
8528 // Convert only when the condition code is not modified between
8529 // the CSINC and the branch. The CC may be used by other
8530 // instructions in between.
8531 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
8532 return false;
8533 MachineBasicBlock &RefToMBB = *MBB;
8534 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
8535 DebugLoc DL = MI.getDebugLoc();
8536 if (IsNegativeBranch)
8537 CC = AArch64CC::getInvertedCondCode(CC);
8538 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
8539 MI.eraseFromParent();
8540 return true;
8545 std::pair<unsigned, unsigned>
8546 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8547 const unsigned Mask = AArch64II::MO_FRAGMENT;
8548 return std::make_pair(TF & Mask, TF & ~Mask);
8551 ArrayRef<std::pair<unsigned, const char *>>
8552 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8553 using namespace AArch64II;
8555 static const std::pair<unsigned, const char *> TargetFlags[] = {
8556 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
8557 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
8558 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
8559 {MO_HI12, "aarch64-hi12"}};
8560 return ArrayRef(TargetFlags);
8563 ArrayRef<std::pair<unsigned, const char *>>
8564 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
8565 using namespace AArch64II;
8567 static const std::pair<unsigned, const char *> TargetFlags[] = {
8568 {MO_COFFSTUB, "aarch64-coffstub"},
8569 {MO_GOT, "aarch64-got"},
8570 {MO_NC, "aarch64-nc"},
8571 {MO_S, "aarch64-s"},
8572 {MO_TLS, "aarch64-tls"},
8573 {MO_DLLIMPORT, "aarch64-dllimport"},
8574 {MO_PREL, "aarch64-prel"},
8575 {MO_TAGGED, "aarch64-tagged"},
8576 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
8578 return ArrayRef(TargetFlags);
8581 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8582 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8583 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8584 {{MOSuppressPair, "aarch64-suppress-pair"},
8585 {MOStridedAccess, "aarch64-strided-access"}};
8586 return ArrayRef(TargetFlags);
8589 /// Constants defining how certain sequences should be outlined.
8590 /// This encompasses how an outlined function should be called, and what kind of
8591 /// frame should be emitted for that outlined function.
8593 /// \p MachineOutlinerDefault implies that the function should be called with
8594 /// a save and restore of LR to the stack.
8596 /// That is,
8598 /// I1 Save LR OUTLINED_FUNCTION:
8599 /// I2 --> BL OUTLINED_FUNCTION I1
8600 /// I3 Restore LR I2
8601 /// I3
8602 /// RET
8604 /// * Call construction overhead: 3 (save + BL + restore)
8605 /// * Frame construction overhead: 1 (ret)
8606 /// * Requires stack fixups? Yes
8608 /// \p MachineOutlinerTailCall implies that the function is being created from
8609 /// a sequence of instructions ending in a return.
8611 /// That is,
8613 /// I1 OUTLINED_FUNCTION:
8614 /// I2 --> B OUTLINED_FUNCTION I1
8615 /// RET I2
8616 /// RET
8618 /// * Call construction overhead: 1 (B)
8619 /// * Frame construction overhead: 0 (Return included in sequence)
8620 /// * Requires stack fixups? No
8622 /// \p MachineOutlinerNoLRSave implies that the function should be called using
8623 /// a BL instruction, but doesn't require LR to be saved and restored. This
8624 /// happens when LR is known to be dead.
8626 /// That is,
8628 /// I1 OUTLINED_FUNCTION:
8629 /// I2 --> BL OUTLINED_FUNCTION I1
8630 /// I3 I2
8631 /// I3
8632 /// RET
8634 /// * Call construction overhead: 1 (BL)
8635 /// * Frame construction overhead: 1 (RET)
8636 /// * Requires stack fixups? No
8638 /// \p MachineOutlinerThunk implies that the function is being created from
8639 /// a sequence of instructions ending in a call. The outlined function is
8640 /// called with a BL instruction, and the outlined function tail-calls the
8641 /// original call destination.
8643 /// That is,
8645 /// I1 OUTLINED_FUNCTION:
8646 /// I2 --> BL OUTLINED_FUNCTION I1
8647 /// BL f I2
8648 /// B f
8649 /// * Call construction overhead: 1 (BL)
8650 /// * Frame construction overhead: 0
8651 /// * Requires stack fixups? No
8653 /// \p MachineOutlinerRegSave implies that the function should be called with a
8654 /// save and restore of LR to an available register. This allows us to avoid
8655 /// stack fixups. Note that this outlining variant is compatible with the
8656 /// NoLRSave case.
8658 /// That is,
8660 /// I1 Save LR OUTLINED_FUNCTION:
8661 /// I2 --> BL OUTLINED_FUNCTION I1
8662 /// I3 Restore LR I2
8663 /// I3
8664 /// RET
8666 /// * Call construction overhead: 3 (save + BL + restore)
8667 /// * Frame construction overhead: 1 (ret)
8668 /// * Requires stack fixups? No
8669 enum MachineOutlinerClass {
8670 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
8671 MachineOutlinerTailCall, /// Only emit a branch.
8672 MachineOutlinerNoLRSave, /// Emit a call and return.
8673 MachineOutlinerThunk, /// Emit a call and tail-call.
8674 MachineOutlinerRegSave /// Same as default, but save to a register.
8677 enum MachineOutlinerMBBFlags {
8678 LRUnavailableSomewhere = 0x2,
8679 HasCalls = 0x4,
8680 UnsafeRegsDead = 0x8
8683 Register
8684 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
8685 MachineFunction *MF = C.getMF();
8686 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
8687 const AArch64RegisterInfo *ARI =
8688 static_cast<const AArch64RegisterInfo *>(&TRI);
8689 // Check if there is an available register across the sequence that we can
8690 // use.
8691 for (unsigned Reg : AArch64::GPR64RegClass) {
8692 if (!ARI->isReservedReg(*MF, Reg) &&
8693 Reg != AArch64::LR && // LR is not reserved, but don't use it.
8694 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
8695 Reg != AArch64::X17 && // Ditto for X17.
8696 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
8697 C.isAvailableInsideSeq(Reg, TRI))
8698 return Reg;
8700 return Register();
8703 static bool
8704 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
8705 const outliner::Candidate &b) {
8706 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8707 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8709 return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
8710 MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
8713 static bool
8714 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
8715 const outliner::Candidate &b) {
8716 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8717 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8719 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
8722 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
8723 const outliner::Candidate &b) {
8724 const AArch64Subtarget &SubtargetA =
8725 a.getMF()->getSubtarget<AArch64Subtarget>();
8726 const AArch64Subtarget &SubtargetB =
8727 b.getMF()->getSubtarget<AArch64Subtarget>();
8728 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
8731 std::optional<std::unique_ptr<outliner::OutlinedFunction>>
8732 AArch64InstrInfo::getOutliningCandidateInfo(
8733 const MachineModuleInfo &MMI,
8734 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
8735 unsigned MinRepeats) const {
8736 unsigned SequenceSize = 0;
8737 for (auto &MI : RepeatedSequenceLocs[0])
8738 SequenceSize += getInstSizeInBytes(MI);
8740 unsigned NumBytesToCreateFrame = 0;
8742 // We only allow outlining for functions having exactly matching return
8743 // address signing attributes, i.e., all share the same value for the
8744 // attribute "sign-return-address" and all share the same type of key they
8745 // are signed with.
8746 // Additionally we require all functions to simultaneously either support
8747 // v8.3a features or not. Otherwise an outlined function could get signed
8748 // using dedicated v8.3 instructions and a call from a function that doesn't
8749 // support v8.3 instructions would therefore be invalid.
8750 if (std::adjacent_find(
8751 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
8752 [](const outliner::Candidate &a, const outliner::Candidate &b) {
8753 // Return true if a and b are non-equal w.r.t. return address
8754 // signing or support of v8.3a features
8755 if (outliningCandidatesSigningScopeConsensus(a, b) &&
8756 outliningCandidatesSigningKeyConsensus(a, b) &&
8757 outliningCandidatesV8_3OpsConsensus(a, b)) {
8758 return false;
8760 return true;
8761 }) != RepeatedSequenceLocs.end()) {
8762 return std::nullopt;
8765 // Since at this point all candidates agree on their return address signing
8766 // picking just one is fine. If the candidate functions potentially sign their
8767 // return addresses, the outlined function should do the same. Note that in
8768 // the case of "sign-return-address"="non-leaf" this is an assumption: it is
8769 // not certain that the outlined function will have to sign its return
8770 // address, but that decision is made later, when the decision to outline
8771 // has already been made.
8772 // The same holds for the number of additional instructions we need: On
8773 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
8774 // necessary. However, at this point we don't know if the outlined function
8775 // will have a RET instruction so we assume the worst.
8776 const TargetRegisterInfo &TRI = getRegisterInfo();
8777 // Performing a tail call may require extra checks when PAuth is enabled.
8778 // If PAuth is disabled, set it to zero for uniformity.
8779 unsigned NumBytesToCheckLRInTCEpilogue = 0;
8780 if (RepeatedSequenceLocs[0]
8781 .getMF()
8782 ->getInfo<AArch64FunctionInfo>()
8783 ->shouldSignReturnAddress(true)) {
8784 // One PAC and one AUT instruction.
8785 NumBytesToCreateFrame += 8;
8787 // PAuth is enabled - set extra tail call cost, if any.
8788 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
8789 *RepeatedSequenceLocs[0].getMF());
8790 NumBytesToCheckLRInTCEpilogue =
8791 AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
8792 // Checking the authenticated LR value may significantly impact
8793 // SequenceSize, so account for it for more precise results.
8794 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
8795 SequenceSize += NumBytesToCheckLRInTCEpilogue;
8797 // We have to check whether sp-modifying instructions would get outlined.
8798 // If so, we only allow outlining if sp is unchanged overall: matching
8799 // sub and add instructions are okay to outline, but all other sp
8800 // modifications are not.
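// Illustration (an editorial sketch, not from the original source): a
// candidate containing the matched pair
//   sub sp, sp, #16
//   ...
//   add sp, sp, #16
// nets out to SPValue == 0 and is acceptable, whereas a lone add sp, sp, #32
// or any other kind of write to sp disqualifies the candidate.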
8801 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
8802 int SPValue = 0;
8803 for (auto &MI : C) {
8804 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
8805 switch (MI.getOpcode()) {
8806 case AArch64::ADDXri:
8807 case AArch64::ADDWri:
8808 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8809 assert(MI.getOperand(2).isImm() &&
8810 "Expected operand to be immediate");
8811 assert(MI.getOperand(1).isReg() &&
8812 "Expected operand to be a register");
8813 // Check if the add just increments sp. If so, we search for
8814 // matching sub instructions that decrement sp. If not, the
8815 // modification is illegal
8816 if (MI.getOperand(1).getReg() == AArch64::SP)
8817 SPValue += MI.getOperand(2).getImm();
8818 else
8819 return true;
8820 break;
8821 case AArch64::SUBXri:
8822 case AArch64::SUBWri:
8823 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8824 assert(MI.getOperand(2).isImm() &&
8825 "Expected operand to be immediate");
8826 assert(MI.getOperand(1).isReg() &&
8827 "Expected operand to be a register");
8828 // Check if the sub just decrements sp. If so, we search for
8829 // matching add instructions that increment sp. If not, the
8830 // modification is illegal
8831 if (MI.getOperand(1).getReg() == AArch64::SP)
8832 SPValue -= MI.getOperand(2).getImm();
8833 else
8834 return true;
8835 break;
8836 default:
8837 return true;
8841 if (SPValue)
8842 return true;
8843 return false;
8845 // Remove candidates with illegal stack modifying instructions
8846 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
8848 // If the sequence doesn't have enough candidates left, then we're done.
8849 if (RepeatedSequenceLocs.size() < MinRepeats)
8850 return std::nullopt;
8853 // Properties about candidate MBBs that hold for all of them.
8854 unsigned FlagsSetInAll = 0xF;
8856 // Compute liveness information for each candidate, and set FlagsSetInAll.
8857 for (outliner::Candidate &C : RepeatedSequenceLocs)
8858 FlagsSetInAll &= C.Flags;
8860 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
8862 // Helper lambda which sets call information for every candidate.
8863 auto SetCandidateCallInfo =
8864 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
8865 for (outliner::Candidate &C : RepeatedSequenceLocs)
8866 C.setCallInfo(CallID, NumBytesForCall);
8869 unsigned FrameID = MachineOutlinerDefault;
8870 NumBytesToCreateFrame += 4;
8872 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
8873 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
8876 // We check to see if CFI Instructions are present, and if they are
8877 // we find the number of CFI Instructions in the candidates.
8878 unsigned CFICount = 0;
8879 for (auto &I : RepeatedSequenceLocs[0]) {
8880 if (I.isCFIInstruction())
8881 CFICount++;
8884 // We compare the number of found CFI Instructions to the number of CFI
8885 // instructions in the parent function for each candidate. We must check this
8886 // since if we outline one of the CFI instructions in a function, we have to
8887 // outline them all for correctness. If we do not, the address offsets will be
8888 // incorrect between the two sections of the program.
8889 for (outliner::Candidate &C : RepeatedSequenceLocs) {
8890 std::vector<MCCFIInstruction> CFIInstructions =
8891 C.getMF()->getFrameInstructions();
8893 if (CFICount > 0 && CFICount != CFIInstructions.size())
8894 return std::nullopt;
8897 // Returns true if an instruction is safe to fix up, false otherwise.
8898 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
8899 if (MI.isCall())
8900 return true;
8902 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
8903 !MI.readsRegister(AArch64::SP, &TRI))
8904 return true;
8906 // Any modification of SP will break our code to save/restore LR.
8907 // FIXME: We could handle some instructions which add a constant
8908 // offset to SP, with a bit more work.
8909 if (MI.modifiesRegister(AArch64::SP, &TRI))
8910 return false;
8912 // At this point, we have a stack instruction that we might need to
8913 // fix up. We'll handle it if it's a load or store.
8914 if (MI.mayLoadOrStore()) {
8915 const MachineOperand *Base; // Filled with the base operand of MI.
8916 int64_t Offset; // Filled with the offset of MI.
8917 bool OffsetIsScalable;
8919 // Does it allow us to offset the base operand and is the base the
8920 // register SP?
8921 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
8922 !Base->isReg() || Base->getReg() != AArch64::SP)
8923 return false;
8925 // Fix-up code below assumes byte offsets.
8926 if (OffsetIsScalable)
8927 return false;
8929 // Find the minimum/maximum offset for this instruction and check
8930 // if fixing it up would be in range.
8931 int64_t MinOffset,
8932 MaxOffset; // Unscaled offsets for the instruction.
8933 // The scale to multiply the offsets by.
8934 TypeSize Scale(0U, false), DummyWidth(0U, false);
8935 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
8937 Offset += 16; // Update the offset to what it would be if we outlined.
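// Worked example with assumed values (editorial, not from the original
// source): for ldrb w0, [sp, #4088], Scale is 1 and MaxOffset is 4095, so
// the post-outlining offset 4088 + 16 = 4104 no longer fits and the
// instruction is rejected.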
8938 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
8939 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
8940 return false;
8942 // It's in range, so we can outline it.
8943 return true;
8946 // FIXME: Add handling for instructions like "add x0, sp, #8".
8948 // We can't fix it up, so don't outline it.
8949 return false;
8952 // True if it's possible to fix up each stack instruction in this sequence.
8953 // Important for frames/call variants that modify the stack.
8954 bool AllStackInstrsSafe =
8955 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
8957 // If the last instruction in any candidate is a terminator, then we should
8958 // tail call all of the candidates.
8959 if (RepeatedSequenceLocs[0].back().isTerminator()) {
8960 FrameID = MachineOutlinerTailCall;
8961 NumBytesToCreateFrame = 0;
8962 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
8963 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
8966 else if (LastInstrOpcode == AArch64::BL ||
8967 ((LastInstrOpcode == AArch64::BLR ||
8968 LastInstrOpcode == AArch64::BLRNoIP) &&
8969 !HasBTI)) {
8970 // FIXME: Do we need to check if the code after this uses the value of LR?
8971 FrameID = MachineOutlinerThunk;
8972 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
8973 SetCandidateCallInfo(MachineOutlinerThunk, 4);
8976 else {
8977 // We need to decide how to emit calls + frames. We can always emit the same
8978 // frame if we don't need to save to the stack. If we have to save to the
8979 // stack, then we need a different frame.
8980 unsigned NumBytesNoStackCalls = 0;
8981 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
8983 // Check if we have to save LR.
8984 for (outliner::Candidate &C : RepeatedSequenceLocs) {
8985 bool LRAvailable =
8986 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
8987 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
8988 : true;
8989 // If we have a noreturn caller, then we're going to be conservative and
8990 // say that we have to save LR. If we don't have a ret at the end of the
8991 // block, then we can't reason about liveness accurately.
8993 // FIXME: We can probably do better than always disabling this in
8994 // noreturn functions by fixing up the liveness info.
8995 bool IsNoReturn =
8996 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
8998 // Is LR available? If so, we don't need a save.
8999 if (LRAvailable && !IsNoReturn) {
9000 NumBytesNoStackCalls += 4;
9001 C.setCallInfo(MachineOutlinerNoLRSave, 4);
9002 CandidatesWithoutStackFixups.push_back(C);
9005 // Is an unused register available? If so, we won't modify the stack, so
9006 // we can outline with the same frame type as those that don't save LR.
9007 else if (findRegisterToSaveLRTo(C)) {
9008 NumBytesNoStackCalls += 12;
9009 C.setCallInfo(MachineOutlinerRegSave, 12);
9010 CandidatesWithoutStackFixups.push_back(C);
9013 // Is SP used in the sequence at all? If not, we don't have to modify
9014 // the stack, so we are guaranteed to get the same frame.
9015 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
9016 NumBytesNoStackCalls += 12;
9017 C.setCallInfo(MachineOutlinerDefault, 12);
9018 CandidatesWithoutStackFixups.push_back(C);
9021 // If we outline this, we need to modify the stack. Pretend we don't
9022 // outline this by saving all of its bytes.
9023 else {
9024 NumBytesNoStackCalls += SequenceSize;
9028 // If there are no places where we have to save LR, then note that we
9029 // don't have to update the stack. Otherwise, give every candidate the
9030 // default call type, as long as it's safe to do so.
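// Worked example with hypothetical counts (editorial): for three candidates
// where two can use MachineOutlinerNoLRSave (4 bytes each) and one needs
// MachineOutlinerRegSave (12 bytes), NumBytesNoStackCalls is 4 + 4 + 12 = 20,
// which is <= 3 * 12 = 36, so the stack-fixup-free call variants are kept.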
9031 if (!AllStackInstrsSafe ||
9032 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
9033 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
9034 FrameID = MachineOutlinerNoLRSave;
9035 if (RepeatedSequenceLocs.size() < MinRepeats)
9036 return std::nullopt;
9037 } else {
9038 SetCandidateCallInfo(MachineOutlinerDefault, 12);
9040 // Bugzilla ID: 46767
9041 // TODO: Check if fixing up the stack more than once is safe so we can
9042 // outline these.
9044 // An outline resulting in a caller that requires stack fixups at the
9045 // callsite to a callee that also requires stack fixups can happen when
9046 // there are no available registers at the candidate callsite for a
9047 // candidate that itself also has calls.
9049 // In other words if function_containing_sequence in the following pseudo
9050 // assembly requires that we save LR at the point of the call, but there
9051 // are no available registers: in this case we save using SP and as a
9052 // result the SP offsets require stack fixups by multiples of 16.
9054 // function_containing_sequence:
9055 // ...
9056 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
9057 // call OUTLINED_FUNCTION_N
9058 // restore LR from SP
9059 // ...
9061 // OUTLINED_FUNCTION_N:
9062 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
9063 // ...
9064 // bl foo
9065 // restore LR from SP
9066 // ret
9068 // Because the code to handle more than one stack fixup does not
9069 // currently have the proper checks for legality, these cases will assert
9070 // in the AArch64 MachineOutliner. This is because the code to do this
9071 // needs more hardening, testing, better checks that generated code is
9072 // legal, etc., and because it is only verified to handle a single pass of
9073 // stack fixup.
9075 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
9076 // these cases until they are known to be handled. Bugzilla 46767 is
9077 // referenced in comments at the assert site.
9079 // To avoid asserting (or generating non-legal code on noassert builds)
9080 // we remove all candidates which would need more than one stack fixup by
9081 // pruning the cases where the candidate has calls while also having no
9082 // available LR and having no available general purpose registers to copy
9083 // LR to (i.e. one extra stack save/restore).
9085 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
9086 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
9087 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
9088 return (llvm::any_of(C, IsCall)) &&
9089 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
9090 !findRegisterToSaveLRTo(C));
9095 // If we dropped all of the candidates, bail out here.
9096 if (RepeatedSequenceLocs.size() < MinRepeats)
9097 return std::nullopt;
9100 // Does every candidate's MBB contain a call? If so, then we might have a call
9101 // in the range.
9102 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
9103 // Check if the range contains a call. These require a save + restore of the
9104 // link register.
9105 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
9106 bool ModStackToSaveLR = false;
9107 if (any_of(drop_end(FirstCand),
9108 [](const MachineInstr &MI) { return MI.isCall(); }))
9109 ModStackToSaveLR = true;
9111 // Handle the last instruction separately. If this is a tail call, then the
9112 // last instruction is a call. We don't want to save + restore in this case.
9113 // However, it could be possible that the last instruction is a call without
9114 // it being valid to tail call this sequence. We should consider this as
9115 // well.
9116 else if (FrameID != MachineOutlinerThunk &&
9117 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
9118 ModStackToSaveLR = true;
9120 if (ModStackToSaveLR) {
9121 // We can't fix up the stack. Bail out.
9122 if (!AllStackInstrsSafe)
9123 return std::nullopt;
9125 // Save + restore LR.
9126 NumBytesToCreateFrame += 8;
9130 // If we have CFI instructions, we can only outline if the outlined section
9131 // can be a tail call
9132 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
9133 return std::nullopt;
9135 return std::make_unique<outliner::OutlinedFunction>(
9136 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
9139 void AArch64InstrInfo::mergeOutliningCandidateAttributes(
9140 Function &F, std::vector<outliner::Candidate> &Candidates) const {
9141 // If a bunch of candidates reach this point they must agree on their return
9142 // address signing. It is therefore enough to just consider the signing
9143 // behaviour of one of them
9144 const auto &CFn = Candidates.front().getMF()->getFunction();
9146 if (CFn.hasFnAttribute("ptrauth-returns"))
9147 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
9148 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
9149 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
9150 // Since all candidates belong to the same module, just copy the
9151 // function-level attributes of an arbitrary function.
9152 if (CFn.hasFnAttribute("sign-return-address"))
9153 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
9154 if (CFn.hasFnAttribute("sign-return-address-key"))
9155 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
9157 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
9160 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
9161 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
9162 const Function &F = MF.getFunction();
9164 // Can F be deduplicated by the linker? If it can, don't outline from it.
9165 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
9166 return false;
9168 // Don't outline from functions with section markings; the program could
9169 // expect that all the code is in the named section.
9170 // FIXME: Allow outlining from multiple functions with the same section
9171 // marking.
9172 if (F.hasSection())
9173 return false;
9175 // Outlining from functions with redzones is unsafe since the outliner may
9176 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
9177 // outline from it.
9178 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
9179 if (!AFI || AFI->hasRedZone().value_or(true))
9180 return false;
9182 // FIXME: Determine whether it is safe to outline from functions which contain
9183 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
9184 // outlined together and ensure it is safe to outline with async unwind info,
9185 // required for saving & restoring VG around calls.
9186 if (AFI->hasStreamingModeChanges())
9187 return false;
9189 // FIXME: Teach the outliner to generate/handle Windows unwind info.
9190 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
9191 return false;
9193 // It's safe to outline from MF.
9194 return true;
9197 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
9198 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
9199 unsigned &Flags) const {
9200 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
9201 "Must track liveness!");
9202 SmallVector<
9203 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
9204 Ranges;
9205 // According to the AArch64 Procedure Call Standard, the following are
9206 // undefined on entry/exit from a function call:
9208 // * Registers x16, x17, (and thus w16, w17)
9209 // * Condition codes (and thus the NZCV register)
9211 // If any of these registers are used inside or live across an outlined
9212 // function, then they may be modified later, either by the compiler or
9213 // some other tool (like the linker).
9215 // To avoid outlining in these situations, partition each block into ranges
9216 // where these registers are dead. We will only outline from those ranges.
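// For example (an editorial sketch of one such tool's behaviour): a
// range-extension veneer the linker may insert between the call site and the
// outlined function typically looks like
//   adrp x16, OUTLINED_FUNCTION
//   add  x16, x16, :lo12:OUTLINED_FUNCTION
//   br   x16
// and therefore clobbers x16.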
9217 LiveRegUnits LRU(getRegisterInfo());
9218 auto AreAllUnsafeRegsDead = [&LRU]() {
9219 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
9220 LRU.available(AArch64::NZCV);
9223 // We need to know if LR is live across an outlining boundary later on in
9224 // order to decide how we'll create the outlined call, frame, etc.
9226 // It's pretty expensive to check this for *every candidate* within a block.
9227 // That's some potentially n^2 behaviour, since in the worst case, we'd need
9228 // to compute liveness from the end of the block for O(n) candidates within
9229 // the block.
9231 // So, to improve the average case, let's keep track of liveness from the end
9232 // of the block to the beginning of *every outlinable range*. If we know that
9233 // LR is available in every range we could outline from, then we know that
9234 // we don't need to check liveness for any candidate within that range.
9235 bool LRAvailableEverywhere = true;
9236 // Compute liveness bottom-up.
9237 LRU.addLiveOuts(MBB);
9238 // Update flags that require info about the entire MBB.
9239 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
9240 if (MI.isCall() && !MI.isTerminator())
9241 Flags |= MachineOutlinerMBBFlags::HasCalls;
9243 // Range: [RangeBegin, RangeEnd)
9244 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
9245 unsigned RangeLen;
9246 auto CreateNewRangeStartingAt =
9247 [&RangeBegin, &RangeEnd,
9248 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
9249 RangeBegin = NewBegin;
9250 RangeEnd = std::next(RangeBegin);
9251 RangeLen = 0;
9253 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
9254 // At least one unsafe register is not dead. We do not want to outline at
9255 // this point. If it is long enough to outline from, save the range
9256 // [RangeBegin, RangeEnd).
9257 if (RangeLen > 1)
9258 Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));
9260 // Find the first point where all unsafe registers are dead.
9261 // FIND: <safe instr> <-- end of first potential range
9262 // SKIP: <unsafe def>
9263 // SKIP: ... everything between ...
9264 // SKIP: <unsafe use>
9265 auto FirstPossibleEndPt = MBB.instr_rbegin();
9266 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
9267 LRU.stepBackward(*FirstPossibleEndPt);
9268 // Update flags that impact how we outline across the entire block,
9269 // regardless of safety.
9270 UpdateWholeMBBFlags(*FirstPossibleEndPt);
9271 if (AreAllUnsafeRegsDead())
9272 break;
9274 // If we exhausted the entire block, we have no safe ranges to outline.
9275 if (FirstPossibleEndPt == MBB.instr_rend())
9276 return Ranges;
9277 // Current range.
9278 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
9279 // FirstPossibleEndPt points to the first place where all unsafe registers
9280 // are dead (if there is any such point). Begin partitioning the MBB into
9281 // ranges.
9282 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
9283 LRU.stepBackward(MI);
9284 UpdateWholeMBBFlags(MI);
9285 if (!AreAllUnsafeRegsDead()) {
9286 SaveRangeIfNonEmpty();
9287 CreateNewRangeStartingAt(MI.getIterator());
9288 continue;
9290 LRAvailableEverywhere &= LRU.available(AArch64::LR);
9291 RangeBegin = MI.getIterator();
9292 ++RangeLen;
9294 // Above loop misses the last (or only) range. If we are still safe, then
9295 // let's save the range.
9296 if (AreAllUnsafeRegsDead())
9297 SaveRangeIfNonEmpty();
9298 if (Ranges.empty())
9299 return Ranges;
9300 // We found the ranges bottom-up. The mapping expects them top-down, so
9301 // reverse the order.
9302 std::reverse(Ranges.begin(), Ranges.end());
9303 // If there is at least one outlinable range where LR is unavailable
9304 // somewhere, remember that.
9305 if (!LRAvailableEverywhere)
9306 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
9307 return Ranges;
9310 outliner::InstrType
9311 AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
9312 MachineBasicBlock::iterator &MIT,
9313 unsigned Flags) const {
9314 MachineInstr &MI = *MIT;
9315 MachineBasicBlock *MBB = MI.getParent();
9316 MachineFunction *MF = MBB->getParent();
9317 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
9319 // Don't outline anything used for return address signing. The outlined
9320 // function will get signed later if needed
9321 switch (MI.getOpcode()) {
9322 case AArch64::PACM:
9323 case AArch64::PACIASP:
9324 case AArch64::PACIBSP:
9325 case AArch64::PACIASPPC:
9326 case AArch64::PACIBSPPC:
9327 case AArch64::AUTIASP:
9328 case AArch64::AUTIBSP:
9329 case AArch64::AUTIASPPCi:
9330 case AArch64::AUTIASPPCr:
9331 case AArch64::AUTIBSPPCi:
9332 case AArch64::AUTIBSPPCr:
9333 case AArch64::RETAA:
9334 case AArch64::RETAB:
9335 case AArch64::RETAASPPCi:
9336 case AArch64::RETAASPPCr:
9337 case AArch64::RETABSPPCi:
9338 case AArch64::RETABSPPCr:
9339 case AArch64::EMITBKEY:
9340 case AArch64::PAUTH_PROLOGUE:
9341 case AArch64::PAUTH_EPILOGUE:
9342 return outliner::InstrType::Illegal;
9345 // Don't outline LOHs.
9346 if (FuncInfo->getLOHRelated().count(&MI))
9347 return outliner::InstrType::Illegal;
9349 // We can only outline these if we will tail call the outlined function, or
9350 // fix up the CFI offsets. Currently, CFI instructions are outlined only when
9351 // the outlined sequence is a tail call.
9353 // FIXME: If the proper fixups for the offset are implemented, this should be
9354 // possible.
9355 if (MI.isCFIInstruction())
9356 return outliner::InstrType::Legal;
9358 // Is this a terminator for a basic block?
9359 if (MI.isTerminator())
9360 // TargetInstrInfo::getOutliningType has already filtered out anything
9361 // that would break this, so we can allow it here.
9362 return outliner::InstrType::Legal;
9364 // Make sure none of the operands are un-outlinable.
9365 for (const MachineOperand &MOP : MI.operands()) {
9366 // A check preventing CFI indices was here before, but only CFI
9367 // instructions should have those.
9368 assert(!MOP.isCFIIndex());
9370 // If it uses LR or W30 explicitly, then don't touch it.
9371 if (MOP.isReg() && !MOP.isImplicit() &&
9372 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
9373 return outliner::InstrType::Illegal;
9376 // Special cases for instructions that can always be outlined, but will fail
9377 // the later tests, e.g. ADRPs, which are PC-relative use LR, but can always
9378 // be outlined because they don't require a *specific* value to be in LR.
9379 if (MI.getOpcode() == AArch64::ADRP)
9380 return outliner::InstrType::Legal;
9382 // If MI is a call we might be able to outline it. We don't want to outline
9383 // any calls that rely on the position of items on the stack. When we outline
9384 // something containing a call, we have to emit a save and restore of LR in
9385 // the outlined function. Currently, this always happens by saving LR to the
9386 // stack. Thus, if we outline, say, half the parameters for a function call
9387 // plus the call, then we'll break the callee's expectations for the layout
9388 // of the stack.
9390 // FIXME: Allow calls to functions which construct a stack frame, as long
9391 // as they don't access arguments on the stack.
9392 // FIXME: Figure out some way to analyze functions defined in other modules.
9393 // We should be able to compute the memory usage based on the IR calling
9394 // convention, even if we can't see the definition.
9395 if (MI.isCall()) {
9396 // Get the function associated with the call. Look at each operand and find
9397 // the one that represents the callee and get its name.
9398 const Function *Callee = nullptr;
9399 for (const MachineOperand &MOP : MI.operands()) {
9400 if (MOP.isGlobal()) {
9401 Callee = dyn_cast<Function>(MOP.getGlobal());
9402 break;
9406 // Never outline calls to mcount. There isn't any rule that would require
9407 // this, but the Linux kernel's "ftrace" feature depends on it.
9408 if (Callee && Callee->getName() == "\01_mcount")
9409 return outliner::InstrType::Illegal;
9411 // If we don't know anything about the callee, assume it depends on the
9412 // stack layout of the caller. In that case, it's only legal to outline
9413 // as a tail-call. Explicitly list the call instructions we know about so we
9414 // don't get unexpected results with call pseudo-instructions.
9415 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
9416 if (MI.getOpcode() == AArch64::BLR ||
9417 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
9418 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
9420 if (!Callee)
9421 return UnknownCallOutlineType;
9423 // We have a function we have information about. Check whether it's something
9424 // we can safely outline.
9425 MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
9427 // We don't know what's going on with the callee at all. Don't touch it.
9428 if (!CalleeMF)
9429 return UnknownCallOutlineType;
9431 // Check if we know anything about the callee saves on the function. If we
9432 // don't, then don't touch it, since that implies that we haven't
9433 // computed anything about its stack frame yet.
9434 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
9435 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
9436 MFI.getNumObjects() > 0)
9437 return UnknownCallOutlineType;
9439 // At this point, we can say that CalleeMF ought to not pass anything on the
9440 // stack. Therefore, we can outline it.
9441 return outliner::InstrType::Legal;
9444 // Don't touch the link register or W30.
9445 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
9446 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
9447 return outliner::InstrType::Illegal;
9449 // Don't outline BTI instructions, because that will prevent the outlining
9450 // site from being indirectly callable.
9451 if (hasBTISemantics(MI))
9452 return outliner::InstrType::Illegal;
9454 return outliner::InstrType::Legal;
9457 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
9458 for (MachineInstr &MI : MBB) {
9459 const MachineOperand *Base;
9460 TypeSize Width(0, false);
9461 int64_t Offset;
9462 bool OffsetIsScalable;
9464 // Is this a load or store with an immediate offset with SP as the base?
9465 if (!MI.mayLoadOrStore() ||
9466 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
9467 &RI) ||
9468 (Base->isReg() && Base->getReg() != AArch64::SP))
9469 continue;
9471 // It is, so we have to fix it up.
9472 TypeSize Scale(0U, false);
9473 int64_t Dummy1, Dummy2;
9475 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
9476 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
9477 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
9478 assert(Scale != 0 && "Unexpected opcode!");
9479 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
9481 // We've pushed the return address to the stack, so add 16 to the offset.
9482 // This is safe, since we already checked if it would overflow when we
9483 // checked if this instruction was legal to outline.
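// Worked example with assumed values (editorial, not from the original
// source): for ldr x0, [sp, #8], Scale is 8 and Offset is 8 bytes; with LR
// spilled 16 bytes below the old sp the data now sits at sp + 24, so
// NewImm = (8 + 16) / 8 = 3 and the instruction becomes ldr x0, [sp, #24].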
9484 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
9485 StackOffsetOperand.setImm(NewImm);
9489 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
9490 const AArch64InstrInfo *TII,
9491 bool ShouldSignReturnAddr) {
9492 if (!ShouldSignReturnAddr)
9493 return;
9495 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
9496 .setMIFlag(MachineInstr::FrameSetup);
9497 BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
9498 TII->get(AArch64::PAUTH_EPILOGUE))
9499 .setMIFlag(MachineInstr::FrameDestroy);
9502 void AArch64InstrInfo::buildOutlinedFrame(
9503 MachineBasicBlock &MBB, MachineFunction &MF,
9504 const outliner::OutlinedFunction &OF) const {
9506 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
9508 if (OF.FrameConstructionID == MachineOutlinerTailCall)
9509 FI->setOutliningStyle("Tail Call");
9510 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
9511 // For thunk outlining, rewrite the last instruction from a call to a
9512 // tail-call.
9513 MachineInstr *Call = &*--MBB.instr_end();
9514 unsigned TailOpcode;
9515 if (Call->getOpcode() == AArch64::BL) {
9516 TailOpcode = AArch64::TCRETURNdi;
9517 } else {
9518 assert(Call->getOpcode() == AArch64::BLR ||
9519 Call->getOpcode() == AArch64::BLRNoIP);
9520 TailOpcode = AArch64::TCRETURNriALL;
9522 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
9523 .add(Call->getOperand(0))
9524 .addImm(0);
9525 MBB.insert(MBB.end(), TC);
9526 Call->eraseFromParent();
9528 FI->setOutliningStyle("Thunk");
9531 bool IsLeafFunction = true;
9533 // Is there a call in the outlined range?
9534 auto IsNonTailCall = [](const MachineInstr &MI) {
9535 return MI.isCall() && !MI.isReturn();
9538 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
9539 // Fix up the instructions in the range, since we're going to modify the
9540 // stack.
9542 // Bugzilla ID: 46767
9543 // TODO: Check if fixing up twice is safe so we can outline these.
9544 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
9545 "Can only fix up stack references once");
9546 fixupPostOutline(MBB);
9548 IsLeafFunction = false;
9550 // LR has to be a live in so that we can save it.
9551 if (!MBB.isLiveIn(AArch64::LR))
9552 MBB.addLiveIn(AArch64::LR);
9554 MachineBasicBlock::iterator It = MBB.begin();
9555 MachineBasicBlock::iterator Et = MBB.end();
9557 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9558 OF.FrameConstructionID == MachineOutlinerThunk)
9559 Et = std::prev(MBB.end());
9561 // Insert a save before the outlined region
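// The save/restore pair emitted here encodes roughly as (editorial sketch):
//   str x30, [sp, #-16]!   ; STRXpre: spill LR and drop sp by 16 bytes
//   ...outlined body...
//   ldr x30, [sp], #16     ; LDRXpost: reload LR and restore sp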
9562 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9563 .addReg(AArch64::SP, RegState::Define)
9564 .addReg(AArch64::LR)
9565 .addReg(AArch64::SP)
9566 .addImm(-16);
9567 It = MBB.insert(It, STRXpre);
9569 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
9570 const TargetSubtargetInfo &STI = MF.getSubtarget();
9571 const MCRegisterInfo *MRI = STI.getRegisterInfo();
9572 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
9574 // Add a CFI saying the stack was moved 16 B down.
9575 int64_t StackPosEntry =
9576 MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
9577 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9578 .addCFIIndex(StackPosEntry)
9579 .setMIFlags(MachineInstr::FrameSetup);
9581 // Add a CFI saying that the LR that we want to find is now 16 B higher
9582 // than before.
9583 int64_t LRPosEntry = MF.addFrameInst(
9584 MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
9585 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9586 .addCFIIndex(LRPosEntry)
9587 .setMIFlags(MachineInstr::FrameSetup);
9590 // Insert a restore before the terminator for the function.
9591 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9592 .addReg(AArch64::SP, RegState::Define)
9593 .addReg(AArch64::LR, RegState::Define)
9594 .addReg(AArch64::SP)
9595 .addImm(16);
9596 Et = MBB.insert(Et, LDRXpost);
9599 bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction);
9601 // If this is a tail call outlined function, then there's already a return.
9602 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9603 OF.FrameConstructionID == MachineOutlinerThunk) {
9604 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9605 return;
9608 // It's not a tail call, so we have to insert the return ourselves.
9610 // LR has to be a live in so that we can return to it.
9611 if (!MBB.isLiveIn(AArch64::LR))
9612 MBB.addLiveIn(AArch64::LR);
9614 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
9615 .addReg(AArch64::LR);
9616 MBB.insert(MBB.end(), ret);
9618 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9620 FI->setOutliningStyle("Function");
9622 // Did we have to modify the stack by saving the link register?
9623 if (OF.FrameConstructionID != MachineOutlinerDefault)
9624 return;
9626 // We modified the stack.
9627 // Walk over the basic block and fix up all the stack accesses.
9628 fixupPostOutline(MBB);
9631 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
9632 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
9633 MachineFunction &MF, outliner::Candidate &C) const {
9635 // Are we tail calling?
9636 if (C.CallConstructionID == MachineOutlinerTailCall) {
9637 // If yes, then we can just branch to the label.
9638 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
9639 .addGlobalAddress(M.getNamedValue(MF.getName()))
9640 .addImm(0));
9641 return It;
9644 // Are we saving the link register?
9645 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
9646 C.CallConstructionID == MachineOutlinerThunk) {
9647 // No, so just insert the call.
9648 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9649 .addGlobalAddress(M.getNamedValue(MF.getName())));
9650 return It;
9653 // We want to return the spot where we inserted the call.
9654 MachineBasicBlock::iterator CallPt;
9656 // Instructions for saving and restoring LR around the call instruction we're
9657 // going to insert.
9658 MachineInstr *Save;
9659 MachineInstr *Restore;
9660 // Can we save to a register?
9661 if (C.CallConstructionID == MachineOutlinerRegSave) {
9662 // FIXME: This logic should be sunk into a target-specific interface so that
9663 // we don't have to recompute the register.
9664 Register Reg = findRegisterToSaveLRTo(C);
9665 assert(Reg && "No callee-saved register available?");
9667 // LR has to be a live in so that we can save it.
9668 if (!MBB.isLiveIn(AArch64::LR))
9669 MBB.addLiveIn(AArch64::LR);
9671 // Save and restore LR from Reg.
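// The resulting call site is roughly (editorial sketch, xN standing for
// whichever register findRegisterToSaveLRTo picked):
//   mov xN, x30            ; ORRXrs Reg, xzr, lr, #0
//   bl  OUTLINED_FUNCTION
//   mov x30, xN            ; ORRXrs lr, xzr, Reg, #0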
9672 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
9673 .addReg(AArch64::XZR)
9674 .addReg(AArch64::LR)
9675 .addImm(0);
9676 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
9677 .addReg(AArch64::XZR)
9678 .addReg(Reg)
9679 .addImm(0);
9680 } else {
9681 // We have the default case. Save and restore from SP.
9682 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9683 .addReg(AArch64::SP, RegState::Define)
9684 .addReg(AArch64::LR)
9685 .addReg(AArch64::SP)
9686 .addImm(-16);
9687 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9688 .addReg(AArch64::SP, RegState::Define)
9689 .addReg(AArch64::LR, RegState::Define)
9690 .addReg(AArch64::SP)
9691 .addImm(16);
9694 It = MBB.insert(It, Save);
9695 It++;
9697 // Insert the call.
9698 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9699 .addGlobalAddress(M.getNamedValue(MF.getName())));
9700 CallPt = It;
9701 It++;
9703 It = MBB.insert(It, Restore);
9704 return CallPt;
9707 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
9708 MachineFunction &MF) const {
9709 return MF.getFunction().hasMinSize();
9712 void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
9713 MachineBasicBlock::iterator Iter,
9714 DebugLoc &DL,
9715 bool AllowSideEffects) const {
9716 const MachineFunction &MF = *MBB.getParent();
9717 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
9718 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
9720 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
9721 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
9722 } else if (STI.isSVEorStreamingSVEAvailable()) {
9723 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
9724 .addImm(0)
9725 .addImm(0);
9726 } else if (STI.isNeonAvailable()) {
9727 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
9728 .addImm(0);
9729 } else {
9730 // This is a streaming-compatible function without SVE. We don't have full
9731 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
9732 // Since `movi v..` would be illegal here, use `fmov d..` instead.
9733 assert(STI.hasNEON() && "Expected to have NEON.");
9734 Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
9735 BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
9736 }
9737 }
9739 std::optional<DestSourcePair>
9740 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
9742 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
9743 // and a zero immediate operand are used as an alias for the mov instruction.
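// Illustrative note (added, not in the upstream source): e.g. "mov x0, x1"
// is the preferred disassembly of "orr x0, xzr, x1, lsl #0".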
9744 if (((MI.getOpcode() == AArch64::ORRWrs &&
9745 MI.getOperand(1).getReg() == AArch64::WZR &&
9746 MI.getOperand(3).getImm() == 0x0) ||
9747 (MI.getOpcode() == AArch64::ORRWrr &&
9748 MI.getOperand(1).getReg() == AArch64::WZR)) &&
9749 // Check that the w->w move is not a zero-extending w->x mov.
9750 (!MI.getOperand(0).getReg().isVirtual() ||
9751 MI.getOperand(0).getSubReg() == 0) &&
9752 (!MI.getOperand(0).getReg().isPhysical() ||
9753 MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 +
9754 AArch64::X0,
9755 /*TRI=*/nullptr) == -1))
9756 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9758 if (MI.getOpcode() == AArch64::ORRXrs &&
9759 MI.getOperand(1).getReg() == AArch64::XZR &&
9760 MI.getOperand(3).getImm() == 0x0)
9761 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9763 return std::nullopt;
9764 }
9766 std::optional<DestSourcePair>
9767 AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
9768 if ((MI.getOpcode() == AArch64::ORRWrs &&
9769 MI.getOperand(1).getReg() == AArch64::WZR &&
9770 MI.getOperand(3).getImm() == 0x0) ||
9771 (MI.getOpcode() == AArch64::ORRWrr &&
9772 MI.getOperand(1).getReg() == AArch64::WZR))
9773 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9774 return std::nullopt;
9775 }
9777 std::optional<RegImmPair>
9778 AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
9779 int Sign = 1;
9780 int64_t Offset = 0;
9782 // TODO: Handle cases where Reg is a super- or sub-register of the
9783 // destination register.
9784 const MachineOperand &Op0 = MI.getOperand(0);
9785 if (!Op0.isReg() || Reg != Op0.getReg())
9786 return std::nullopt;
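// Illustrative examples (added, not in the upstream source):
// "add x0, x1, #16" yields {x1, +16}, and "sub x0, x1, #1, lsl #12"
// yields {x1, -4096}.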
9788 switch (MI.getOpcode()) {
9789 default:
9790 return std::nullopt;
9791 case AArch64::SUBWri:
9792 case AArch64::SUBXri:
9793 case AArch64::SUBSWri:
9794 case AArch64::SUBSXri:
9795 Sign *= -1;
9796 [[fallthrough]];
9797 case AArch64::ADDSWri:
9798 case AArch64::ADDSXri:
9799 case AArch64::ADDWri:
9800 case AArch64::ADDXri: {
9801 // TODO: The third operand can be a global address (usually some string).
9802 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
9803 !MI.getOperand(2).isImm())
9804 return std::nullopt;
9805 int Shift = MI.getOperand(3).getImm();
9806 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
9807 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
9808 }
9809 }
9810 return RegImmPair{MI.getOperand(1).getReg(), Offset};
9811 }
9813 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
9814 /// the destination register then, if possible, describe the value in terms of
9815 /// the source register.
9816 static std::optional<ParamLoadedValue>
9817 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
9818 const TargetInstrInfo *TII,
9819 const TargetRegisterInfo *TRI) {
9820 auto DestSrc = TII->isCopyLikeInstr(MI);
9821 if (!DestSrc)
9822 return std::nullopt;
9824 Register DestReg = DestSrc->Destination->getReg();
9825 Register SrcReg = DestSrc->Source->getReg();
9827 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
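// Illustrative examples (added, not in the upstream source): for
// "orr w0, wzr, w1" describing X0 also yields W1 (the W->W move implicitly
// zeroes the top half of X0); for "orr x0, xzr, x1" describing W0 yields W1
// via the sub_32 sub-register.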
9829 // If the described register is the destination, just return the source.
9830 if (DestReg == DescribedReg)
9831 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9833 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
9834 if (MI.getOpcode() == AArch64::ORRWrs &&
9835 TRI->isSuperRegister(DestReg, DescribedReg))
9836 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9838 // We may need to describe the lower part of a ORRXrs move.
9839 if (MI.getOpcode() == AArch64::ORRXrs &&
9840 TRI->isSubRegister(DestReg, DescribedReg)) {
9841 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
9842 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
9843 }
9845 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
9846 "Unhandled ORR[XW]rs copy case");
9848 return std::nullopt;
9849 }
9851 bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
9852 // Functions cannot be split to different sections on AArch64 if they have
9853 // a red zone. This is because relaxing a cross-section branch may require
9854 // incrementing the stack pointer to spill a register, which would overwrite
9855 // the red zone.
9856 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
9857 return false;
9859 return TargetInstrInfo::isFunctionSafeToSplit(MF);
9860 }
9862 bool AArch64InstrInfo::isMBBSafeToSplitToCold(
9863 const MachineBasicBlock &MBB) const {
9864 // Asm Goto blocks can contain conditional branches to goto labels, which can
9865 // get moved out of range of the branch instruction.
9866 auto isAsmGoto = [](const MachineInstr &MI) {
9867 return MI.getOpcode() == AArch64::INLINEASM_BR;
9868 };
9869 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
9870 return false;
9872 // Because jump tables are label-relative instead of table-relative, they all
9873 // must be in the same section or relocation fixup handling will fail.
9875 // Check if MBB is a jump table target
9876 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
9877 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
9878 return llvm::is_contained(JTE.MBBs, &MBB);
9879 };
9880 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
9881 return false;
9883 // Check if MBB contains a jump table lookup
9884 for (const MachineInstr &MI : MBB) {
9885 switch (MI.getOpcode()) {
9886 case TargetOpcode::G_BRJT:
9887 case AArch64::JumpTableDest32:
9888 case AArch64::JumpTableDest16:
9889 case AArch64::JumpTableDest8:
9890 return false;
9891 default:
9892 continue;
9893 }
9894 }
9896 // MBB isn't a special case, so it's safe to be split to the cold section.
9897 return true;
9898 }
9900 std::optional<ParamLoadedValue>
9901 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
9902 Register Reg) const {
9903 const MachineFunction *MF = MI.getMF();
9904 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
9905 switch (MI.getOpcode()) {
9906 case AArch64::MOVZWi:
9907 case AArch64::MOVZXi: {
9908 // MOVZWi may be used for producing zero-extended 32-bit immediates in
9909 // 64-bit parameters, so we need to consider super-registers.
9910 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
9911 return std::nullopt;
9913 if (!MI.getOperand(1).isImm())
9914 return std::nullopt;
9915 int64_t Immediate = MI.getOperand(1).getImm();
9916 int Shift = MI.getOperand(2).getImm();
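// Illustrative example (added, not in the upstream source):
// "movz w0, #42, lsl #16" is described as the immediate 42 << 16 == 2752512.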
9917 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
9918 nullptr);
9919 }
9920 case AArch64::ORRWrs:
9921 case AArch64::ORRXrs:
9922 return describeORRLoadedValue(MI, Reg, this, TRI);
9923 }
9925 return TargetInstrInfo::describeLoadedValue(MI, Reg);
9926 }
9928 bool AArch64InstrInfo::isExtendLikelyToBeFolded(
9929 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
9930 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
9931 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
9932 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
9934 // Anyexts are nops.
9935 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
9936 return true;
9938 Register DefReg = ExtMI.getOperand(0).getReg();
9939 if (!MRI.hasOneNonDBGUse(DefReg))
9940 return false;
9942 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
9943 // addressing mode.
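// Illustrative example (added, not in the upstream source): a G_ZEXT whose
// only use is the offset of a G_PTR_ADD can typically become an
// extended-register access such as "ldr x2, [x0, w1, uxtw]".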
9944 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
9945 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
9946 }
9948 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
9949 return get(Opc).TSFlags & AArch64::ElementSizeMask;
9950 }
9952 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
9953 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
9954 }
9956 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
9957 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
9958 }
9960 unsigned int
9961 AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
9962 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
9963 }
9965 bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
9966 unsigned Scale) const {
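// Illustrative examples (added, not in the upstream source), for NumBytes == 8:
// Offset -256 with Scale 0 fits the 9-bit signed form; Offset 32760 (8 * 4095)
// with Scale 0 fits the 12-bit scaled unsigned form; Scale 8 with Offset 0
// matches the reg + 8 * reg form; a non-zero Offset combined with a non-zero
// Scale is rejected.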
9967 if (Offset && Scale)
9968 return false;
9970 // Check Reg + Imm
9971 if (!Scale) {
9972 // 9-bit signed offset
9973 if (isInt<9>(Offset))
9974 return true;
9976 // 12-bit unsigned offset
9977 unsigned Shift = Log2_64(NumBytes);
9978 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
9979 // Must be a multiple of NumBytes (NumBytes is a power of 2)
9980 (Offset >> Shift) << Shift == Offset)
9981 return true;
9982 return false;
9983 }
9985 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
9986 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
9987 }
9989 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
9990 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
9991 return AArch64::BLRNoIP;
9992 else
9993 return AArch64::BLR;
9994 }
9996 MachineBasicBlock::iterator
9997 AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
9998 Register TargetReg, bool FrameSetup) const {
9999 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
10001 MachineBasicBlock &MBB = *MBBI->getParent();
10002 MachineFunction &MF = *MBB.getParent();
10003 const AArch64InstrInfo *TII =
10004 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
10005 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
10006 DebugLoc DL = MBB.findDebugLoc(MBBI);
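// Overview (added, not in the upstream source): the blocks built below form a
// probe loop, roughly:
//   LoopTest: sub sp, sp, #ProbeSize; cmp sp, TargetReg; b.le LoopExit
//   LoopBody: str xzr, [sp]; b LoopTest
//   LoopExit: mov sp, TargetReg; ldr xzr, [sp]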
10008 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
10009 MachineBasicBlock *LoopTestMBB =
10010 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10011 MF.insert(MBBInsertPoint, LoopTestMBB);
10012 MachineBasicBlock *LoopBodyMBB =
10013 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10014 MF.insert(MBBInsertPoint, LoopBodyMBB);
10015 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10016 MF.insert(MBBInsertPoint, ExitMBB);
10017 MachineInstr::MIFlag Flags =
10018 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
10020 // LoopTest:
10021 // SUB SP, SP, #ProbeSize
10022 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
10023 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
10025 // CMP SP, TargetReg
10026 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
10027 AArch64::XZR)
10028 .addReg(AArch64::SP)
10029 .addReg(TargetReg)
10030 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
10031 .setMIFlags(Flags);
10033 // B.<Cond> LoopExit
10034 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
10035 .addImm(AArch64CC::LE)
10036 .addMBB(ExitMBB)
10037 .setMIFlags(Flags);
10039 // STR XZR, [SP]
10040 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
10041 .addReg(AArch64::XZR)
10042 .addReg(AArch64::SP)
10043 .addImm(0)
10044 .setMIFlags(Flags);
10046 // B loop
10047 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
10048 .addMBB(LoopTestMBB)
10049 .setMIFlags(Flags);
10051 // LoopExit:
10052 // MOV SP, TargetReg
10053 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
10054 .addReg(TargetReg)
10055 .addImm(0)
10056 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
10057 .setMIFlags(Flags);
10059 // LDR XZR, [SP]
10060 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
10061 .addReg(AArch64::XZR, RegState::Define)
10062 .addReg(AArch64::SP)
10063 .addImm(0)
10064 .setMIFlags(Flags);
10066 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
10067 ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
10069 LoopTestMBB->addSuccessor(ExitMBB);
10070 LoopTestMBB->addSuccessor(LoopBodyMBB);
10071 LoopBodyMBB->addSuccessor(LoopTestMBB);
10072 MBB.addSuccessor(LoopTestMBB);
10074 // Update liveins.
10075 if (MF.getRegInfo().reservedRegsFrozen())
10076 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
10078 return ExitMBB->begin();
10079 }
10081 namespace {
10082 class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
10083 MachineFunction *MF;
10084 const TargetInstrInfo *TII;
10085 const TargetRegisterInfo *TRI;
10086 MachineRegisterInfo &MRI;
10088 /// The block of the loop
10089 MachineBasicBlock *LoopBB;
10090 /// The conditional branch of the loop
10091 MachineInstr *CondBranch;
10092 /// The compare instruction for loop control
10093 MachineInstr *Comp;
10094 /// The number of the operand of the loop counter value in Comp
10095 unsigned CompCounterOprNum;
10096 /// The instruction that updates the loop counter value
10097 MachineInstr *Update;
10098 /// The number of the operand of the loop counter value in Update
10099 unsigned UpdateCounterOprNum;
10100 /// The initial value of the loop counter
10101 Register Init;
10102 /// True iff Update is a predecessor of Comp
10103 bool IsUpdatePriorComp;
10105 /// The normalized condition used by createTripCountGreaterCondition()
10106 SmallVector<MachineOperand, 4> Cond;
10108 public:
10109 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
10110 MachineInstr *Comp, unsigned CompCounterOprNum,
10111 MachineInstr *Update, unsigned UpdateCounterOprNum,
10112 Register Init, bool IsUpdatePriorComp,
10113 const SmallVectorImpl<MachineOperand> &Cond)
10114 : MF(Comp->getParent()->getParent()),
10115 TII(MF->getSubtarget().getInstrInfo()),
10116 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
10117 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
10118 CompCounterOprNum(CompCounterOprNum), Update(Update),
10119 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
10120 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
10122 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
10123 // Ensure the instructions for loop control are placed in stage 0.
10124 // The predecessors of Comp are considered by the caller.
10125 return MI == Comp;
10126 }
10128 std::optional<bool> createTripCountGreaterCondition(
10129 int TC, MachineBasicBlock &MBB,
10130 SmallVectorImpl<MachineOperand> &CondParam) override {
10131 // A branch instruction will be inserted as "if (Cond) goto epilogue".
10132 // Cond is normalized for such use.
10133 // The predecessors of the branch are assumed to have already been inserted.
10134 CondParam = Cond;
10135 return {};
10136 }
10138 void createRemainingIterationsGreaterCondition(
10139 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
10140 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
10142 void setPreheader(MachineBasicBlock *NewPreheader) override {}
10144 void adjustTripCount(int TripCountAdjust) override {}
10146 bool isMVEExpanderSupported() override { return true; }
10147 };
10148 } // namespace
10150 /// Clone an instruction from MI. The register of ReplaceOprNum-th operand
10151 /// is replaced by ReplaceReg. The output register is newly created.
10152 /// The other operands are unchanged from MI.
10153 static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
10154 Register ReplaceReg, MachineBasicBlock &MBB,
10155 MachineBasicBlock::iterator InsertTo) {
10156 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10157 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
10158 const TargetRegisterInfo *TRI =
10159 MBB.getParent()->getSubtarget().getRegisterInfo();
10160 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
10161 Register Result = 0;
10162 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
10163 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
10164 Result = MRI.createVirtualRegister(
10165 MRI.getRegClass(NewMI->getOperand(0).getReg()));
10166 NewMI->getOperand(I).setReg(Result);
10167 } else if (I == ReplaceOprNum) {
10168 MRI.constrainRegClass(
10169 ReplaceReg,
10170 TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent()));
10171 NewMI->getOperand(I).setReg(ReplaceReg);
10172 }
10173 }
10174 MBB.insert(InsertTo, NewMI);
10175 return Result;
10176 }
10178 void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
10179 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
10180 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
10181 // Create and accumulate conditions for next TC iterations.
10182 // Example:
10183 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
10184 // # iteration of the kernel
10186 // # insert the following instructions
10187 // cond = CSINCXr 0, 0, C, implicit $nzcv
10188 // counter = ADDXri counter, 1 # clone from this->Update
10189 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
10190 // cond = CSINCXr cond, cond, C, implicit $nzcv
10191 // ... (repeat TC times)
10192 // SUBSXri cond, 0, implicit-def $nzcv
10194 assert(CondBranch->getOpcode() == AArch64::Bcc);
10195 // CondCode to exit the loop
10196 AArch64CC::CondCode CC =
10197 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
10198 if (CondBranch->getOperand(1).getMBB() == LoopBB)
10199 CC = AArch64CC::getInvertedCondCode(CC);
10201 // Accumulate conditions to exit the loop
10202 Register AccCond = AArch64::XZR;
10204 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
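// Note (added, not in the upstream source): "csinc xd, xn, xm, cond" returns
// xn when cond holds and xm + 1 otherwise, hence the inverted condition code
// passed to CSINCXr below.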
10205 auto AccumulateCond = [&](Register CurCond,
10206 AArch64CC::CondCode CC) -> Register {
10207 Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
10208 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
10209 .addReg(NewCond, RegState::Define)
10210 .addReg(CurCond)
10211 .addReg(CurCond)
10212 .addImm(AArch64CC::getInvertedCondCode(CC));
10213 return NewCond;
10214 };
10216 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
10217 // Update and Comp for I==0 already exist in MBB
10218 // (MBB is an unrolled kernel)
10219 Register Counter;
10220 for (int I = 0; I <= TC; ++I) {
10221 Register NextCounter;
10222 if (I != 0)
10223 NextCounter =
10224 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
10226 AccCond = AccumulateCond(AccCond, CC);
10228 if (I != TC) {
10229 if (I == 0) {
10230 if (Update != Comp && IsUpdatePriorComp) {
10231 Counter =
10232 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
10233 NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
10234 MBB.end());
10235 } else {
10236 // Can use the already-computed value.
10237 NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
10238 }
10239 } else if (Update != Comp) {
10240 NextCounter =
10241 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
10242 }
10243 }
10244 Counter = NextCounter;
10245 }
10246 } else {
10247 Register Counter;
10248 if (LastStage0Insts.empty()) {
10249 // Use the initial counter value (to test whether the trip count is large
10250 // enough for the pipelined code to be executed at all).
10251 Counter = Init;
10252 if (IsUpdatePriorComp)
10253 Counter =
10254 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
10255 } else {
10256 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
10257 Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
10258 }
10260 for (int I = 0; I <= TC; ++I) {
10261 Register NextCounter;
10262 NextCounter =
10263 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
10264 AccCond = AccumulateCond(AccCond, CC);
10265 if (I != TC && Update != Comp)
10266 NextCounter =
10267 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
10268 Counter = NextCounter;
10269 }
10270 }
10272 // If AccCond == 0, the remainder is greater than TC.
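// Note (added, not in the upstream source): the SUBSXri below compares AccCond
// with 0, so the EQ condition pushed into Cond holds exactly when none of the
// probes above detected the exit condition, i.e. more than TC iterations
// remain.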
10273 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
10274 .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
10275 .addReg(AccCond)
10276 .addImm(0)
10277 .addImm(0);
10278 Cond.clear();
10279 Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
10280 }
10282 static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
10283 Register &RegMBB, Register &RegOther) {
10284 assert(Phi.getNumOperands() == 5);
10285 if (Phi.getOperand(2).getMBB() == MBB) {
10286 RegMBB = Phi.getOperand(1).getReg();
10287 RegOther = Phi.getOperand(3).getReg();
10288 } else {
10289 assert(Phi.getOperand(4).getMBB() == MBB);
10290 RegMBB = Phi.getOperand(3).getReg();
10291 RegOther = Phi.getOperand(1).getReg();
10292 }
10293 }
10295 static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
10296 if (!Reg.isVirtual())
10297 return false;
10298 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10299 return MRI.getVRegDef(Reg)->getParent() != BB;
10300 }
10302 /// If Reg is an induction variable, return true and set the output parameters.
10303 static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
10304 MachineInstr *&UpdateInst,
10305 unsigned &UpdateCounterOprNum, Register &InitReg,
10306 bool &IsUpdatePriorComp) {
10307 // Example:
10309 // Preheader:
10310 // InitReg = ...
10311 // LoopBB:
10312 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
10313 // Reg = COPY Reg0 ; COPY is ignored.
10314 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
10315 // ; Reg is the value calculated in the previous
10316 // ; iteration, so IsUpdatePriorComp == false.
10318 if (LoopBB->pred_size() != 2)
10319 return false;
10320 if (!Reg.isVirtual())
10321 return false;
10322 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
10323 UpdateInst = nullptr;
10324 UpdateCounterOprNum = 0;
10325 InitReg = 0;
10326 IsUpdatePriorComp = true;
10327 Register CurReg = Reg;
10328 while (true) {
10329 MachineInstr *Def = MRI.getVRegDef(CurReg);
10330 if (Def->getParent() != LoopBB)
10331 return false;
10332 if (Def->isCopy()) {
10333 // Ignore copy instructions unless they contain subregisters
10334 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
10335 return false;
10336 CurReg = Def->getOperand(1).getReg();
10337 } else if (Def->isPHI()) {
10338 if (InitReg != 0)
10339 return false;
10340 if (!UpdateInst)
10341 IsUpdatePriorComp = false;
10342 extractPhiReg(*Def, LoopBB, CurReg, InitReg);
10343 } else {
10344 if (UpdateInst)
10345 return false;
10346 switch (Def->getOpcode()) {
10347 case AArch64::ADDSXri:
10348 case AArch64::ADDSWri:
10349 case AArch64::SUBSXri:
10350 case AArch64::SUBSWri:
10351 case AArch64::ADDXri:
10352 case AArch64::ADDWri:
10353 case AArch64::SUBXri:
10354 case AArch64::SUBWri:
10355 UpdateInst = Def;
10356 UpdateCounterOprNum = 1;
10357 break;
10358 case AArch64::ADDSXrr:
10359 case AArch64::ADDSWrr:
10360 case AArch64::SUBSXrr:
10361 case AArch64::SUBSWrr:
10362 case AArch64::ADDXrr:
10363 case AArch64::ADDWrr:
10364 case AArch64::SUBXrr:
10365 case AArch64::SUBWrr:
10366 UpdateInst = Def;
10367 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
10368 UpdateCounterOprNum = 1;
10369 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
10370 UpdateCounterOprNum = 2;
10371 else
10372 return false;
10373 break;
10374 default:
10375 return false;
10376 }
10377 CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
10378 }
10380 if (!CurReg.isVirtual())
10381 return false;
10382 if (Reg == CurReg)
10383 break;
10384 }
10386 if (!UpdateInst)
10387 return false;
10389 return true;
10390 }
10392 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
10393 AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
10394 // Accept loops that meet the following conditions
10395 // * The conditional branch is BCC
10396 // * The compare instruction is ADDS/SUBS/WHILEXX
10397 // * One operand of the compare is an induction variable and the other is a
10398 // loop invariant value
10399 // * The induction variable is incremented/decremented by a single instruction
10400 // * Does not contain CALL or instructions which have unmodeled side effects
10402 for (MachineInstr &MI : *LoopBB)
10403 if (MI.isCall() || MI.hasUnmodeledSideEffects())
10404 // This instruction may use NZCV, which interferes with the instruction to
10405 // be inserted for loop control.
10406 return nullptr;
10408 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
10409 SmallVector<MachineOperand, 4> Cond;
10410 if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
10411 return nullptr;
10413 // Infinite loops are not supported
10414 if (TBB == LoopBB && FBB == LoopBB)
10415 return nullptr;
10417 // Must be a conditional branch.
10418 if (TBB != LoopBB && FBB == nullptr)
10419 return nullptr;
10421 assert((TBB == LoopBB || FBB == LoopBB) &&
10422 "The Loop must be a single-basic-block loop");
10424 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
10425 const TargetRegisterInfo &TRI = getRegisterInfo();
10427 if (CondBranch->getOpcode() != AArch64::Bcc)
10428 return nullptr;
10430 // Normalization for createTripCountGreaterCondition()
10431 if (TBB == LoopBB)
10432 reverseBranchCondition(Cond);
10434 MachineInstr *Comp = nullptr;
10435 unsigned CompCounterOprNum = 0;
10436 for (MachineInstr &MI : reverse(*LoopBB)) {
10437 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
10438 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
10439 // operands is a loop invariant value
10441 switch (MI.getOpcode()) {
10442 case AArch64::SUBSXri:
10443 case AArch64::SUBSWri:
10444 case AArch64::ADDSXri:
10445 case AArch64::ADDSWri:
10446 Comp = &MI;
10447 CompCounterOprNum = 1;
10448 break;
10449 case AArch64::ADDSWrr:
10450 case AArch64::ADDSXrr:
10451 case AArch64::SUBSWrr:
10452 case AArch64::SUBSXrr:
10453 Comp = &MI;
10454 break;
10455 default:
10456 if (isWhileOpcode(MI.getOpcode())) {
10457 Comp = &MI;
10458 break;
10459 }
10460 return nullptr;
10461 }
10463 if (CompCounterOprNum == 0) {
10464 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
10465 CompCounterOprNum = 2;
10466 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
10467 CompCounterOprNum = 1;
10468 else
10469 return nullptr;
10470 }
10471 break;
10472 }
10473 }
10474 if (!Comp)
10475 return nullptr;
10477 MachineInstr *Update = nullptr;
10478 Register Init;
10479 bool IsUpdatePriorComp;
10480 unsigned UpdateCounterOprNum;
10481 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
10482 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
10483 return nullptr;
10485 return std::make_unique<AArch64PipelinerLoopInfo>(
10486 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
10487 Init, IsUpdatePriorComp, Cond);
10488 }
10490 /// verifyInstruction - Perform target specific instruction verification.
10491 bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
10492 StringRef &ErrInfo) const {
10494 // Verify that immediate offsets on load/store instructions are within range.
10495 // Stack objects with an FI operand are excluded as they can be fixed up
10496 // during PEI.
10497 TypeSize Scale(0U, false), Width(0U, false);
10498 int64_t MinOffset, MaxOffset;
10499 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
10500 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
10501 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
10502 int64_t Imm = MI.getOperand(ImmIdx).getImm();
10503 if (Imm < MinOffset || Imm > MaxOffset) {
10504 ErrInfo = "Unexpected immediate on load/store instruction";
10505 return false;
10506 }
10507 }
10508 }
10509 return true;
10510 }
10512 #define GET_INSTRINFO_HELPERS
10513 #define GET_INSTRMAP_INFO
10514 #include "AArch64GenInstrInfo.inc"