1 //===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of TargetFrameLowering class.
11 // On AArch64, stack frames are structured as follows:
13 // The stack grows downward.
15 // All of the individual frame areas on the frame below are optional, i.e. it's
16 // possible to create a function so that the particular area isn't present
17 // in the frame.
19 // At function entry, the "frame" looks as follows:
21 // | | Higher address
22 // |-----------------------------------|
23 // | |
24 // | arguments passed on the stack |
25 // | |
26 // |-----------------------------------| <- sp
27 // | | Lower address
30 // After the prologue has run, the frame has the following general structure.
31 // Note that this doesn't depict the case where a red-zone is used. Also,
32 // technically the last frame area (VLAs) doesn't get created until the
33 // main function body, after the prologue is run. However, it's depicted here
34 // for completeness.
36 // | | Higher address
37 // |-----------------------------------|
38 // | |
39 // | arguments passed on the stack |
40 // | |
41 // |-----------------------------------|
42 // | |
43 // | (Win64 only) varargs from reg |
44 // | |
45 // |-----------------------------------|
46 // | |
47 // | callee-saved gpr registers | <--.
48 // | | | On Darwin platforms these
49 // |- - - - - - - - - - - - - - - - - -| | callee saves are swapped,
50 // | prev_lr | | (frame record first)
51 // | prev_fp | <--'
52 // | async context if needed |
53 // | (a.k.a. "frame record") |
54 // |-----------------------------------| <- fp(=x29)
55 // | |
56 // | callee-saved fp/simd/SVE regs |
57 // | |
58 // |-----------------------------------|
59 // | |
60 // | SVE stack objects |
61 // | |
62 // |-----------------------------------|
63 // |.empty.space.to.make.part.below....|
64 // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
65 // |.the.standard.16-byte.alignment....| compile time; if present)
66 // |-----------------------------------|
67 // | |
68 // | local variables of fixed size |
69 // | including spill slots |
70 // |-----------------------------------| <- bp(not defined by ABI,
71 // |.variable-sized.local.variables....| LLVM chooses X19)
72 // |.(VLAs)............................| (size of this area is unknown at
73 // |...................................| compile time)
74 // |-----------------------------------| <- sp
75 // | | Lower address
78 // To access data in a frame, a constant offset from one of the pointers
79 // (fp, bp, sp) must be computable at compile time. The size
80 // of the areas with a dotted background cannot be computed at compile time
81 // if they are present, so all three of fp, bp and
82 // sp must be set up to be able to access all contents in the frame areas,
83 // assuming all of the frame areas are non-empty.
85 // For most functions, some of the frame areas are empty. For those functions,
86 // it may not be necessary to set up fp or bp:
87 // * A base pointer is definitely needed when there are both VLAs and local
88 // variables with more-than-default alignment requirements.
89 // * A frame pointer is definitely needed when there are local variables with
90 // more-than-default alignment requirements.
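// For example, a function with both a VLA and an over-aligned (e.g. alignas(32))
// local variable falls into both cases above and therefore sets up fp as well
// as bp.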
92 // For Darwin platforms the frame-record (fp, lr) is stored at the top of the
93 // callee-saved area, since the unwind encoding does not allow for encoding
94 // this dynamically and existing tools depend on this layout. For other
95 // platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
96 // area to allow SVE stack objects (allocated directly below the callee-saves,
97 // if available) to be accessed directly from the framepointer.
98 // The SVE spill/fill instructions have VL-scaled addressing modes such
99 // as:
100 // ldr z8, [fp, #-7 mul vl]
101 // For SVE the size of the vector length (VL) is not known at compile-time, so
102 // '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
103 // layout, we don't need to add an unscaled offset to the framepointer before
104 // accessing the SVE object in the frame.
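// For example, with a 256-bit vector length (VL = 32 bytes) the offset
// '#-7 mul vl' resolves to -224 bytes, computed by the hardware at runtime.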
106 // In some cases when a base pointer is not strictly needed, it is generated
107 // anyway when offsets from the frame pointer to access local variables become
108 // so large that the offset can't be encoded in the immediate fields of loads
109 // or stores.
111 // Outgoing function arguments must be at the bottom of the stack frame when
112 // calling another function. If we do not have variable-sized stack objects, we
113 // can allocate a "reserved call frame" area at the bottom of the local
114 // variable area, large enough for all outgoing calls. If we do have VLAs, then
115 // the stack pointer must be decremented and incremented around each call to
116 // make space for the arguments below the VLAs.
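// For example, if the largest call site needs 32 bytes of outgoing stack
// arguments and there are no VLAs, those 32 bytes are simply folded into the
// prologue's single SP decrement and no per-call SP adjustment is emitted.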
118 // FIXME: also explain the redzone concept.
120 //===----------------------------------------------------------------------===//
122 #include "AArch64FrameLowering.h"
123 #include "AArch64InstrInfo.h"
124 #include "AArch64MachineFunctionInfo.h"
125 #include "AArch64RegisterInfo.h"
126 #include "AArch64Subtarget.h"
127 #include "AArch64TargetMachine.h"
128 #include "MCTargetDesc/AArch64AddressingModes.h"
129 #include "llvm/ADT/ScopeExit.h"
130 #include "llvm/ADT/SmallVector.h"
131 #include "llvm/ADT/Statistic.h"
132 #include "llvm/CodeGen/LivePhysRegs.h"
133 #include "llvm/CodeGen/MachineBasicBlock.h"
134 #include "llvm/CodeGen/MachineFrameInfo.h"
135 #include "llvm/CodeGen/MachineFunction.h"
136 #include "llvm/CodeGen/MachineInstr.h"
137 #include "llvm/CodeGen/MachineInstrBuilder.h"
138 #include "llvm/CodeGen/MachineMemOperand.h"
139 #include "llvm/CodeGen/MachineModuleInfo.h"
140 #include "llvm/CodeGen/MachineOperand.h"
141 #include "llvm/CodeGen/MachineRegisterInfo.h"
142 #include "llvm/CodeGen/RegisterScavenging.h"
143 #include "llvm/CodeGen/TargetInstrInfo.h"
144 #include "llvm/CodeGen/TargetRegisterInfo.h"
145 #include "llvm/CodeGen/TargetSubtargetInfo.h"
146 #include "llvm/CodeGen/WinEHFuncInfo.h"
147 #include "llvm/IR/Attributes.h"
148 #include "llvm/IR/CallingConv.h"
149 #include "llvm/IR/DataLayout.h"
150 #include "llvm/IR/DebugLoc.h"
151 #include "llvm/IR/Function.h"
152 #include "llvm/MC/MCAsmInfo.h"
153 #include "llvm/MC/MCDwarf.h"
154 #include "llvm/Support/CommandLine.h"
155 #include "llvm/Support/Debug.h"
156 #include "llvm/Support/ErrorHandling.h"
157 #include "llvm/Support/LEB128.h"
158 #include "llvm/Support/MathExtras.h"
159 #include "llvm/Support/raw_ostream.h"
160 #include "llvm/Target/TargetMachine.h"
161 #include "llvm/Target/TargetOptions.h"
162 #include <cassert>
163 #include <cstdint>
164 #include <iterator>
165 #include <vector>
167 using namespace llvm;
169 #define DEBUG_TYPE "frame-info"
171 static cl::opt<bool> EnableRedZone("aarch64-redzone",
172 cl::desc("enable use of redzone on AArch64"),
173 cl::init(false), cl::Hidden);
175 static cl::opt<bool>
176 ReverseCSRRestoreSeq("reverse-csr-restore-seq",
177 cl::desc("reverse the CSR restore sequence"),
178 cl::init(false), cl::Hidden);
180 static cl::opt<bool> StackTaggingMergeSetTag(
181 "stack-tagging-merge-settag",
182 cl::desc("merge settag instruction in function epilog"), cl::init(true),
183 cl::Hidden);
185 static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
186 cl::desc("sort stack allocations"),
187 cl::init(true), cl::Hidden);
189 cl::opt<bool> EnableHomogeneousPrologEpilog(
190 "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden,
191 cl::desc("Emit homogeneous prologue and epilogue for the size "
192 "optimization (default = off)"));
194 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
196 /// Returns how much of the incoming argument stack area (in bytes) we should
197 /// clean up in an epilogue. For the C calling convention this will be 0, for
198 /// guaranteed tail call conventions it can be positive (a normal return or a
199 /// tail call to a function that uses less stack space for arguments) or
200 /// negative (for a tail call to a function that needs more stack space than us
201 /// for arguments).
202 static int64_t getArgumentStackToRestore(MachineFunction &MF,
203 MachineBasicBlock &MBB) {
204 MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
205 bool IsTailCallReturn = false;
206 if (MBB.end() != MBBI) {
207 unsigned RetOpcode = MBBI->getOpcode();
208 IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
209 RetOpcode == AArch64::TCRETURNri ||
210 RetOpcode == AArch64::TCRETURNriBTI;
212 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
214 int64_t ArgumentPopSize = 0;
215 if (IsTailCallReturn) {
216 MachineOperand &StackAdjust = MBBI->getOperand(1);
218 // For a tail-call in a callee-pops-arguments environment, some or all of
219 // the stack may actually be in use for the call's arguments; this is
220 // calculated during LowerCall and consumed here...
221 ArgumentPopSize = StackAdjust.getImm();
222 } else {
223 // ... otherwise the amount to pop is *all* of the argument space,
224 // conveniently stored in the MachineFunctionInfo by
225 // LowerFormalArguments. This will, of course, be zero for the C calling
226 // convention.
227 ArgumentPopSize = AFI->getArgumentStackToRestore();
230 return ArgumentPopSize;
233 static bool produceCompactUnwindFrame(MachineFunction &MF);
234 static bool needsWinCFI(const MachineFunction &MF);
235 static StackOffset getSVEStackSize(const MachineFunction &MF);
237 /// Returns true if homogeneous prolog or epilog code can be emitted
238 /// for the size optimization. If possible, a frame helper call is injected.
239 /// When an Exit block is given, this check is for the epilog.
240 bool AArch64FrameLowering::homogeneousPrologEpilog(
241 MachineFunction &MF, MachineBasicBlock *Exit) const {
242 if (!MF.getFunction().hasMinSize())
243 return false;
244 if (!EnableHomogeneousPrologEpilog)
245 return false;
246 if (ReverseCSRRestoreSeq)
247 return false;
248 if (EnableRedZone)
249 return false;
251 // TODO: Windows is not supported yet.
252 if (needsWinCFI(MF))
253 return false;
254 // TODO: SVE is not supported yet.
255 if (getSVEStackSize(MF))
256 return false;
258 // Bail on stack adjustment needed on return for simplicity.
259 const MachineFrameInfo &MFI = MF.getFrameInfo();
260 const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
261 if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
262 return false;
263 if (Exit && getArgumentStackToRestore(MF, *Exit))
264 return false;
266 return true;
269 /// Returns true if CSRs should be paired.
270 bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
271 return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
274 /// This is the biggest offset to the stack pointer we can encode in aarch64
275 /// instructions (without using a separate calculation and a temp register).
276 /// Note that the exceptions here are vector stores/loads, which cannot encode any
277 /// displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
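/// The value corresponds to the unscaled load/store forms (ldur/stur), whose
/// signed 9-bit immediate limits offsets to the range [-256, 255].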
278 static const unsigned DefaultSafeSPDisplacement = 255;
280 /// Look at each instruction that references stack frames and return the stack
281 /// size limit beyond which some of these instructions will require a scratch
282 /// register during their expansion later.
283 static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
284 // FIXME: For now, just conservatively guesstimate based on unscaled indexing
285 // range. We'll end up allocating an unnecessary spill slot a lot, but
286 // realistically that's not a big deal at this stage of the game.
287 for (MachineBasicBlock &MBB : MF) {
288 for (MachineInstr &MI : MBB) {
289 if (MI.isDebugInstr() || MI.isPseudo() ||
290 MI.getOpcode() == AArch64::ADDXri ||
291 MI.getOpcode() == AArch64::ADDSXri)
292 continue;
294 for (const MachineOperand &MO : MI.operands()) {
295 if (!MO.isFI())
296 continue;
298 StackOffset Offset;
299 if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
300 AArch64FrameOffsetCannotUpdate)
301 return 0;
305 return DefaultSafeSPDisplacement;
308 TargetStackID::Value
309 AArch64FrameLowering::getStackIDForScalableVectors() const {
310 return TargetStackID::ScalableVector;
313 /// Returns the size of the fixed object area (allocated next to sp on entry)
314 /// On Win64 this may include a var args area and an UnwindHelp object for EH.
315 static unsigned getFixedObjectSize(const MachineFunction &MF,
316 const AArch64FunctionInfo *AFI, bool IsWin64,
317 bool IsFunclet) {
318 if (!IsWin64 || IsFunclet) {
319 return AFI->getTailCallReservedStack();
320 } else {
321 if (AFI->getTailCallReservedStack() != 0)
322 report_fatal_error("cannot generate ABI-changing tail call for Win64");
323 // Var args are stored here in the primary function.
324 const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
325 // To support EH funclets we allocate an UnwindHelp object
326 const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
327 return alignTo(VarArgsArea + UnwindHelpObject, 16);
331 /// Returns the size of the entire SVE stackframe (calleesaves + spills).
332 static StackOffset getSVEStackSize(const MachineFunction &MF) {
333 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
334 return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
337 bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
338 if (!EnableRedZone)
339 return false;
341 // Don't use the red zone if the function explicitly asks us not to.
342 // This is typically used for kernel code.
343 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
344 const unsigned RedZoneSize =
345 Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
346 if (!RedZoneSize)
347 return false;
349 const MachineFrameInfo &MFI = MF.getFrameInfo();
350 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
351 uint64_t NumBytes = AFI->getLocalStackSize();
353 return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
354 getSVEStackSize(MF));
357 /// hasFP - Return true if the specified function should have a dedicated frame
358 /// pointer register.
359 bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
360 const MachineFrameInfo &MFI = MF.getFrameInfo();
361 const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
362 // Win64 EH requires a frame pointer if funclets are present, as the locals
363 // are accessed off the frame pointer in both the parent function and the
364 // funclets.
365 if (MF.hasEHFunclets())
366 return true;
367 // Retain behavior of always omitting the FP for leaf functions when possible.
368 if (MF.getTarget().Options.DisableFramePointerElim(MF))
369 return true;
370 if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
371 MFI.hasStackMap() || MFI.hasPatchPoint() ||
372 RegInfo->hasStackRealignment(MF))
373 return true;
374 // With large call frames around we may need to use FP to access the scavenging
375 // emergency spill slot.
377 // Unfortunately some calls to hasFP() like machine verifier ->
378 // getReservedReg() -> hasFP in the middle of global isel are too early
379 // to know the max call frame size. Hopefully conservatively returning "true"
380 // in those cases is fine.
381 // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
382 if (!MFI.isMaxCallFrameSizeComputed() ||
383 MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
384 return true;
386 return false;
389 /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
390 /// not required, we reserve argument space for call sites in the function
391 /// immediately on entry to the current function. This eliminates the need for
392 /// add/sub sp brackets around call sites. Returns true if the call frame is
393 /// included as part of the stack frame.
394 bool
395 AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
396 return !MF.getFrameInfo().hasVarSizedObjects();
399 MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
400 MachineFunction &MF, MachineBasicBlock &MBB,
401 MachineBasicBlock::iterator I) const {
402 const AArch64InstrInfo *TII =
403 static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
404 DebugLoc DL = I->getDebugLoc();
405 unsigned Opc = I->getOpcode();
406 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
407 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
409 if (!hasReservedCallFrame(MF)) {
410 int64_t Amount = I->getOperand(0).getImm();
411 Amount = alignTo(Amount, getStackAlign());
412 if (!IsDestroy)
413 Amount = -Amount;
415 // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
416 // doesn't have to pop anything), then the first operand will be zero too so
417 // this adjustment is a no-op.
418 if (CalleePopAmount == 0) {
419 // FIXME: in-function stack adjustment for calls is limited to 24-bits
420 // because there's no guaranteed temporary register available.
422 // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
423 // 1) For offsets that fit in 12 bits, we use LSL #0.
424 // 2) For offsets between 12 and 24 bits, we use two instructions. One uses
425 // LSL #0, and the other uses LSL #12.
427 // Most call frames will be allocated at the start of a function so
428 // this is OK, but it is a limitation that needs dealing with.
429 assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
430 emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
431 StackOffset::getFixed(Amount), TII);
433 } else if (CalleePopAmount != 0) {
434 // If the calling convention demands that the callee pops arguments from the
435 // stack, we want to add it back if we have a reserved call frame.
436 assert(CalleePopAmount < 0xffffff && "call frame too large");
437 emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
438 StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
440 return MBB.erase(I);
443 // Convenience function to create a DWARF expression for
444 // Expr + NumBytes + NumVGScaledBytes * AArch64::VG
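// For example, NumBytes = -16 and NumVGScaledBytes = -8 append the sequence
//   DW_OP_consts -16, DW_OP_plus, DW_OP_consts -8, DW_OP_bregx VG 0, DW_OP_mul,
//   DW_OP_plus
// and emit " - 16 - 8 * VG" into the Comment stream.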
445 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr,
446 int NumBytes, int NumVGScaledBytes, unsigned VG,
447 llvm::raw_string_ostream &Comment) {
448 uint8_t buffer[16];
450 if (NumBytes) {
451 Expr.push_back(dwarf::DW_OP_consts);
452 Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
453 Expr.push_back((uint8_t)dwarf::DW_OP_plus);
454 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
457 if (NumVGScaledBytes) {
458 Expr.push_back((uint8_t)dwarf::DW_OP_consts);
459 Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
461 Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
462 Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
463 Expr.push_back(0);
465 Expr.push_back((uint8_t)dwarf::DW_OP_mul);
466 Expr.push_back((uint8_t)dwarf::DW_OP_plus);
468 Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
469 << std::abs(NumVGScaledBytes) << " * VG";
473 // Creates an MCCFIInstruction:
474 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
475 MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP(
476 const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const {
477 int64_t NumBytes, NumVGScaledBytes;
478 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes,
479 NumVGScaledBytes);
481 std::string CommentBuffer = "sp";
482 llvm::raw_string_ostream Comment(CommentBuffer);
484 // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG)
485 SmallString<64> Expr;
486 Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31));
487 Expr.push_back(0);
488 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
489 TRI.getDwarfRegNum(AArch64::VG, true), Comment);
491 // Wrap this into DW_CFA_def_cfa.
492 SmallString<64> DefCfaExpr;
493 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
494 uint8_t buffer[16];
495 DefCfaExpr.append(buffer,
496 buffer + encodeULEB128(Expr.size(), buffer));
497 DefCfaExpr.append(Expr.str());
498 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(),
499 Comment.str());
502 MCCFIInstruction AArch64FrameLowering::createCfaOffset(
503 const TargetRegisterInfo &TRI, unsigned Reg,
504 const StackOffset &OffsetFromDefCFA) const {
505 int64_t NumBytes, NumVGScaledBytes;
506 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
507 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
509 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
511 // Non-scalable offsets can use DW_CFA_offset directly.
512 if (!NumVGScaledBytes)
513 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
515 std::string CommentBuffer;
516 llvm::raw_string_ostream Comment(CommentBuffer);
517 Comment << printReg(Reg, &TRI) << " @ cfa";
519 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
520 SmallString<64> OffsetExpr;
521 appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
522 TRI.getDwarfRegNum(AArch64::VG, true), Comment);
524 // Wrap this into DW_CFA_expression
525 SmallString<64> CfaExpr;
526 CfaExpr.push_back(dwarf::DW_CFA_expression);
527 uint8_t buffer[16];
528 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
529 CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
530 CfaExpr.append(OffsetExpr.str());
532 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str());
535 void AArch64FrameLowering::emitCalleeSavedFrameMoves(
536 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
537 MachineFunction &MF = *MBB.getParent();
538 MachineFrameInfo &MFI = MF.getFrameInfo();
539 const TargetSubtargetInfo &STI = MF.getSubtarget();
540 const TargetRegisterInfo *TRI = STI.getRegisterInfo();
541 const TargetInstrInfo *TII = STI.getInstrInfo();
542 DebugLoc DL = MBB.findDebugLoc(MBBI);
544 // Add callee saved registers to move list.
545 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
546 if (CSI.empty())
547 return;
549 for (const auto &Info : CSI) {
550 unsigned Reg = Info.getReg();
552 // Not all unwinders may know about SVE registers, so assume the lowest
553 // common denominator.
554 unsigned NewReg;
555 if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg, NewReg))
556 Reg = NewReg;
557 else
558 continue;
560 StackOffset Offset;
561 if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) {
562 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
563 Offset =
564 StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
565 StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI));
566 } else {
567 Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) -
568 getOffsetOfLocalArea());
570 unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset));
571 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
572 .addCFIIndex(CFIIndex)
573 .setMIFlags(MachineInstr::FrameSetup);
577 // Find a scratch register that we can use at the start of the prologue to
578 // re-align the stack pointer. We avoid using callee-save registers since they
579 // may appear to be free when this is called from canUseAsPrologue (during
580 // shrink wrapping), but then no longer be free when this is called from
581 // emitPrologue.
583 // FIXME: This is a bit conservative, since in the above case we could use one
584 // of the callee-save registers as a scratch temp to re-align the stack pointer,
585 // but we would then have to make sure that we were in fact saving at least one
586 // callee-save register in the prologue, which is additional complexity that
587 // doesn't seem worth the benefit.
588 static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
589 MachineFunction *MF = MBB->getParent();
591 // If MBB is an entry block, use X9 as the scratch register
592 if (&MF->front() == MBB)
593 return AArch64::X9;
595 const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
596 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
597 LivePhysRegs LiveRegs(TRI);
598 LiveRegs.addLiveIns(*MBB);
600 // Mark callee saved registers as used so we will not choose them.
601 const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
602 for (unsigned i = 0; CSRegs[i]; ++i)
603 LiveRegs.addReg(CSRegs[i]);
605 // Prefer X9 since it was historically used for the prologue scratch reg.
606 const MachineRegisterInfo &MRI = MF->getRegInfo();
607 if (LiveRegs.available(MRI, AArch64::X9))
608 return AArch64::X9;
610 for (unsigned Reg : AArch64::GPR64RegClass) {
611 if (LiveRegs.available(MRI, Reg))
612 return Reg;
614 return AArch64::NoRegister;
617 bool AArch64FrameLowering::canUseAsPrologue(
618 const MachineBasicBlock &MBB) const {
619 const MachineFunction *MF = MBB.getParent();
620 MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
621 const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
622 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
624 // Don't need a scratch register if we're not going to re-align the stack.
625 if (!RegInfo->hasStackRealignment(*MF))
626 return true;
627 // Otherwise, we can use any block as long as it has a scratch register
628 // available.
629 return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
632 static bool windowsRequiresStackProbe(MachineFunction &MF,
633 uint64_t StackSizeInBytes) {
634 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
635 if (!Subtarget.isTargetWindows())
636 return false;
637 const Function &F = MF.getFunction();
638 // TODO: When implementing stack protectors, take that into account
639 // for the probe threshold.
640 unsigned StackProbeSize = 4096;
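// The 4096-byte default can be overridden per function with the
// "stack-probe-size" string attribute, e.g. in IR:
//   attributes #0 = { "stack-probe-size"="8192" }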
641 if (F.hasFnAttribute("stack-probe-size"))
642 F.getFnAttribute("stack-probe-size")
643 .getValueAsString()
644 .getAsInteger(0, StackProbeSize);
645 return (StackSizeInBytes >= StackProbeSize) &&
646 !F.hasFnAttribute("no-stack-arg-probe");
649 static bool needsWinCFI(const MachineFunction &MF) {
650 const Function &F = MF.getFunction();
651 return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
652 F.needsUnwindTableEntry();
655 bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
656 MachineFunction &MF, uint64_t StackBumpBytes) const {
657 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
658 const MachineFrameInfo &MFI = MF.getFrameInfo();
659 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
660 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
661 if (homogeneousPrologEpilog(MF))
662 return false;
664 if (AFI->getLocalStackSize() == 0)
665 return false;
667 // For WinCFI, if optimizing for size, prefer to not combine the stack bump
668 // (to force a stp with predecrement) to match the packed unwind format,
669 // provided that there actually are any callee saved registers to merge the
670 // decrement with.
671 // This is potentially marginally slower, but allows using the packed
672 // unwind format for functions that both have a local area and callee saved
673 // registers. Using the packed unwind format notably reduces the size of
674 // the unwind info.
675 if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
676 MF.getFunction().hasOptSize())
677 return false;
679 // 512 is the maximum immediate for stp/ldp that will be used for
680 // callee-save save/restores
681 if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
682 return false;
684 if (MFI.hasVarSizedObjects())
685 return false;
687 if (RegInfo->hasStackRealignment(MF))
688 return false;
690 // This isn't strictly necessary, but it simplifies things a bit since the
691 // current RedZone handling code assumes the SP is adjusted by the
692 // callee-save save/restore code.
693 if (canUseRedZone(MF))
694 return false;
696 // When there is an SVE area on the stack, always allocate the
697 // callee-saves and spills/locals separately.
698 if (getSVEStackSize(MF))
699 return false;
701 return true;
704 bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
705 MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
706 if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
707 return false;
709 if (MBB.empty())
710 return true;
712 // Disable combined SP bump if the last instruction is an MTE tag store. It
713 // is almost always better to merge SP adjustment into those instructions.
714 MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
715 MachineBasicBlock::iterator Begin = MBB.begin();
716 while (LastI != Begin) {
717 --LastI;
718 if (LastI->isTransient())
719 continue;
720 if (!LastI->getFlag(MachineInstr::FrameDestroy))
721 break;
723 switch (LastI->getOpcode()) {
724 case AArch64::STGloop:
725 case AArch64::STZGloop:
726 case AArch64::STGOffset:
727 case AArch64::STZGOffset:
728 case AArch64::ST2GOffset:
729 case AArch64::STZ2GOffset:
730 return false;
731 default:
732 return true;
734 llvm_unreachable("unreachable");
737 // Given a load or a store instruction, generate an appropriate unwinding SEH
738 // code on Windows.
739 static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
740 const TargetInstrInfo &TII,
741 MachineInstr::MIFlag Flag) {
742 unsigned Opc = MBBI->getOpcode();
743 MachineBasicBlock *MBB = MBBI->getParent();
744 MachineFunction &MF = *MBB->getParent();
745 DebugLoc DL = MBBI->getDebugLoc();
746 unsigned ImmIdx = MBBI->getNumOperands() - 1;
747 int Imm = MBBI->getOperand(ImmIdx).getImm();
748 MachineInstrBuilder MIB;
749 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
750 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
752 switch (Opc) {
753 default:
754 llvm_unreachable("No SEH Opcode for this instruction");
755 case AArch64::LDPDpost:
756 Imm = -Imm;
757 LLVM_FALLTHROUGH;
758 case AArch64::STPDpre: {
759 unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
760 unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
761 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
762 .addImm(Reg0)
763 .addImm(Reg1)
764 .addImm(Imm * 8)
765 .setMIFlag(Flag);
766 break;
768 case AArch64::LDPXpost:
769 Imm = -Imm;
770 LLVM_FALLTHROUGH;
771 case AArch64::STPXpre: {
772 Register Reg0 = MBBI->getOperand(1).getReg();
773 Register Reg1 = MBBI->getOperand(2).getReg();
774 if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
775 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
776 .addImm(Imm * 8)
777 .setMIFlag(Flag);
778 else
779 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
780 .addImm(RegInfo->getSEHRegNum(Reg0))
781 .addImm(RegInfo->getSEHRegNum(Reg1))
782 .addImm(Imm * 8)
783 .setMIFlag(Flag);
784 break;
786 case AArch64::LDRDpost:
787 Imm = -Imm;
788 LLVM_FALLTHROUGH;
789 case AArch64::STRDpre: {
790 unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
791 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
792 .addImm(Reg)
793 .addImm(Imm)
794 .setMIFlag(Flag);
795 break;
797 case AArch64::LDRXpost:
798 Imm = -Imm;
799 LLVM_FALLTHROUGH;
800 case AArch64::STRXpre: {
801 unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
802 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
803 .addImm(Reg)
804 .addImm(Imm)
805 .setMIFlag(Flag);
806 break;
808 case AArch64::STPDi:
809 case AArch64::LDPDi: {
810 unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
811 unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
812 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
813 .addImm(Reg0)
814 .addImm(Reg1)
815 .addImm(Imm * 8)
816 .setMIFlag(Flag);
817 break;
819 case AArch64::STPXi:
820 case AArch64::LDPXi: {
821 Register Reg0 = MBBI->getOperand(0).getReg();
822 Register Reg1 = MBBI->getOperand(1).getReg();
823 if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
824 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
825 .addImm(Imm * 8)
826 .setMIFlag(Flag);
827 else
828 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
829 .addImm(RegInfo->getSEHRegNum(Reg0))
830 .addImm(RegInfo->getSEHRegNum(Reg1))
831 .addImm(Imm * 8)
832 .setMIFlag(Flag);
833 break;
835 case AArch64::STRXui:
836 case AArch64::LDRXui: {
837 int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
838 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
839 .addImm(Reg)
840 .addImm(Imm * 8)
841 .setMIFlag(Flag);
842 break;
844 case AArch64::STRDui:
845 case AArch64::LDRDui: {
846 unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
847 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
848 .addImm(Reg)
849 .addImm(Imm * 8)
850 .setMIFlag(Flag);
851 break;
854 auto I = MBB->insertAfter(MBBI, MIB);
855 return I;
858 // Fix up the SEH opcode associated with the save/restore instruction.
859 static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
860 unsigned LocalStackSize) {
861 MachineOperand *ImmOpnd = nullptr;
862 unsigned ImmIdx = MBBI->getNumOperands() - 1;
863 switch (MBBI->getOpcode()) {
864 default:
865 llvm_unreachable("Fix the offset in the SEH instruction");
866 case AArch64::SEH_SaveFPLR:
867 case AArch64::SEH_SaveRegP:
868 case AArch64::SEH_SaveReg:
869 case AArch64::SEH_SaveFRegP:
870 case AArch64::SEH_SaveFReg:
871 ImmOpnd = &MBBI->getOperand(ImmIdx);
872 break;
874 if (ImmOpnd)
875 ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
878 // Convert callee-save register save/restore instruction to do stack pointer
879 // decrement/increment to allocate/deallocate the callee-save stack area by
880 // converting store/load to use pre/post increment version.
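// For example, with a 32-byte callee-save area, a leading prologue store
//   stp x29, x30, [sp]      becomes   stp x29, x30, [sp, #-32]!
// and the matching epilogue load
//   ldp x29, x30, [sp]      becomes   ldp x29, x30, [sp], #32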
881 static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
882 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
883 const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
884 bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
885 // Ignore instructions that do not operate on SP, i.e. shadow call stack
886 // instructions and the associated CFI instruction.
887 while (MBBI->getOpcode() == AArch64::STRXpost ||
888 MBBI->getOpcode() == AArch64::LDRXpre ||
889 MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
890 if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
891 assert(MBBI->getOperand(0).getReg() != AArch64::SP);
892 ++MBBI;
894 unsigned NewOpc;
895 switch (MBBI->getOpcode()) {
896 default:
897 llvm_unreachable("Unexpected callee-save save/restore opcode!");
898 case AArch64::STPXi:
899 NewOpc = AArch64::STPXpre;
900 break;
901 case AArch64::STPDi:
902 NewOpc = AArch64::STPDpre;
903 break;
904 case AArch64::STPQi:
905 NewOpc = AArch64::STPQpre;
906 break;
907 case AArch64::STRXui:
908 NewOpc = AArch64::STRXpre;
909 break;
910 case AArch64::STRDui:
911 NewOpc = AArch64::STRDpre;
912 break;
913 case AArch64::STRQui:
914 NewOpc = AArch64::STRQpre;
915 break;
916 case AArch64::LDPXi:
917 NewOpc = AArch64::LDPXpost;
918 break;
919 case AArch64::LDPDi:
920 NewOpc = AArch64::LDPDpost;
921 break;
922 case AArch64::LDPQi:
923 NewOpc = AArch64::LDPQpost;
924 break;
925 case AArch64::LDRXui:
926 NewOpc = AArch64::LDRXpost;
927 break;
928 case AArch64::LDRDui:
929 NewOpc = AArch64::LDRDpost;
930 break;
931 case AArch64::LDRQui:
932 NewOpc = AArch64::LDRQpost;
933 break;
935 // Get rid of the SEH code associated with the old instruction.
936 if (NeedsWinCFI) {
937 auto SEH = std::next(MBBI);
938 if (AArch64InstrInfo::isSEHInstruction(*SEH))
939 SEH->eraseFromParent();
942 TypeSize Scale = TypeSize::Fixed(1);
943 unsigned Width;
944 int64_t MinOffset, MaxOffset;
945 bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
946 NewOpc, Scale, Width, MinOffset, MaxOffset);
947 (void)Success;
948 assert(Success && "unknown load/store opcode");
950 // If the first store isn't right where we want SP then we can't fold the
951 // update in, so create a normal arithmetic instruction instead.
952 if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
953 CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
954 emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
955 StackOffset::getFixed(CSStackSizeInc), TII,
956 InProlog ? MachineInstr::FrameSetup
957 : MachineInstr::FrameDestroy);
958 return std::prev(MBBI);
961 MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
962 MIB.addReg(AArch64::SP, RegState::Define);
964 // Copy all operands other than the immediate offset.
965 unsigned OpndIdx = 0;
966 for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
967 ++OpndIdx)
968 MIB.add(MBBI->getOperand(OpndIdx));
970 assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
971 "Unexpected immediate offset in first/last callee-save save/restore "
972 "instruction!");
973 assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
974 "Unexpected base register in callee-save save/restore instruction!");
975 assert(CSStackSizeInc % Scale == 0);
976 MIB.addImm(CSStackSizeInc / (int)Scale);
978 MIB.setMIFlags(MBBI->getFlags());
979 MIB.setMemRefs(MBBI->memoperands());
981 // Generate a new SEH code that corresponds to the new instruction.
982 if (NeedsWinCFI) {
983 *HasWinCFI = true;
984 InsertSEH(*MIB, *TII,
985 InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
988 return std::prev(MBB.erase(MBBI));
991 // Fixup callee-save register save/restore instructions to take into account
992 // combined SP bump by adding the local stack size to the stack offsets.
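// For example, when a 48-byte local area is folded into the callee-save bump,
//   stp x20, x19, [sp, #16]   is rewritten to   stp x20, x19, [sp, #64]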
993 static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
994 uint64_t LocalStackSize,
995 bool NeedsWinCFI,
996 bool *HasWinCFI) {
997 if (AArch64InstrInfo::isSEHInstruction(MI))
998 return;
1000 unsigned Opc = MI.getOpcode();
1002 // Ignore instructions that do not operate on SP, i.e. shadow call stack
1003 // instructions and the associated CFI instruction.
1004 if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
1005 Opc == AArch64::CFI_INSTRUCTION) {
1006 if (Opc != AArch64::CFI_INSTRUCTION)
1007 assert(MI.getOperand(0).getReg() != AArch64::SP);
1008 return;
1011 unsigned Scale;
1012 switch (Opc) {
1013 case AArch64::STPXi:
1014 case AArch64::STRXui:
1015 case AArch64::STPDi:
1016 case AArch64::STRDui:
1017 case AArch64::LDPXi:
1018 case AArch64::LDRXui:
1019 case AArch64::LDPDi:
1020 case AArch64::LDRDui:
1021 Scale = 8;
1022 break;
1023 case AArch64::STPQi:
1024 case AArch64::STRQui:
1025 case AArch64::LDPQi:
1026 case AArch64::LDRQui:
1027 Scale = 16;
1028 break;
1029 default:
1030 llvm_unreachable("Unexpected callee-save save/restore opcode!");
1033 unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
1034 assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
1035 "Unexpected base register in callee-save save/restore instruction!");
1036 // Last operand is immediate offset that needs fixing.
1037 MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
1038 // All generated opcodes have scaled offsets.
1039 assert(LocalStackSize % Scale == 0);
1040 OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
1042 if (NeedsWinCFI) {
1043 *HasWinCFI = true;
1044 auto MBBI = std::next(MachineBasicBlock::iterator(MI));
1045 assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
1046 assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
1047 "Expecting a SEH instruction");
1048 fixupSEHOpcode(MBBI, LocalStackSize);
1052 static void adaptForLdStOpt(MachineBasicBlock &MBB,
1053 MachineBasicBlock::iterator FirstSPPopI,
1054 MachineBasicBlock::iterator LastPopI) {
1055 // Sometimes (when we restore in the same order as we save), we can end up
1056 // with code like this:
1058 // ldp x26, x25, [sp]
1059 // ldp x24, x23, [sp, #16]
1060 // ldp x22, x21, [sp, #32]
1061 // ldp x20, x19, [sp, #48]
1062 // add sp, sp, #64
1064 // In this case, it is always better to put the first ldp at the end, so
1065 // that the load-store optimizer can run and merge the ldp and the add into
1066 // a post-index ldp.
1067 // If we managed to grab the first pop instruction, move it to the end.
1068 if (ReverseCSRRestoreSeq)
1069 MBB.splice(FirstSPPopI, &MBB, LastPopI);
1070 // We should end up with something like this now:
1072 // ldp x24, x23, [sp, #16]
1073 // ldp x22, x21, [sp, #32]
1074 // ldp x20, x19, [sp, #48]
1075 // ldp x26, x25, [sp]
1076 // add sp, sp, #64
1078 // and the load-store optimizer can merge the last two instructions into:
1080 // ldp x26, x25, [sp], #64
1084 static bool isTargetWindows(const MachineFunction &MF) {
1085 return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
1088 // Convenience function to determine whether I is an SVE callee save.
1089 static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
1090 switch (I->getOpcode()) {
1091 default:
1092 return false;
1093 case AArch64::STR_ZXI:
1094 case AArch64::STR_PXI:
1095 case AArch64::LDR_ZXI:
1096 case AArch64::LDR_PXI:
1097 return I->getFlag(MachineInstr::FrameSetup) ||
1098 I->getFlag(MachineInstr::FrameDestroy);
1102 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
1103 MachineBasicBlock &MBB) const {
1104 MachineBasicBlock::iterator MBBI = MBB.begin();
1105 const MachineFrameInfo &MFI = MF.getFrameInfo();
1106 const Function &F = MF.getFunction();
1107 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1108 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1109 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1110 MachineModuleInfo &MMI = MF.getMMI();
1111 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1112 bool needsFrameMoves =
1113 MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
1114 bool HasFP = hasFP(MF);
1115 bool NeedsWinCFI = needsWinCFI(MF);
1116 bool HasWinCFI = false;
1117 auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
1119 bool IsFunclet = MBB.isEHFuncletEntry();
1121 // At this point, we're going to decide whether or not the function uses a
1122 // redzone. In most cases, the function doesn't have a redzone so let's
1123 // assume that's false and set it to true in the case that there's a redzone.
1124 AFI->setHasRedZone(false);
1126 // Debug location must be unknown since the first debug location is used
1127 // to determine the end of the prologue.
1128 DebugLoc DL;
1130 const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
1131 if (MFnI.shouldSignReturnAddress()) {
1133 unsigned PACI;
1134 if (MFnI.shouldSignWithBKey()) {
1135 BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
1136 .setMIFlag(MachineInstr::FrameSetup);
1137 PACI = Subtarget.hasPAuth() ? AArch64::PACIB : AArch64::PACIBSP;
1138 } else {
1139 PACI = Subtarget.hasPAuth() ? AArch64::PACIA : AArch64::PACIASP;
1142 auto MI = BuildMI(MBB, MBBI, DL, TII->get(PACI));
1143 if (Subtarget.hasPAuth())
1144 MI.addReg(AArch64::LR, RegState::Define)
1145 .addReg(AArch64::LR)
1146 .addReg(AArch64::SP, RegState::InternalRead);
1147 MI.setMIFlag(MachineInstr::FrameSetup);
1149 unsigned CFIIndex =
1150 MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
1151 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1152 .addCFIIndex(CFIIndex)
1153 .setMIFlags(MachineInstr::FrameSetup);
1156 // We signal the presence of a Swift extended frame to external tools by
1157 // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple
1158 // ORR is sufficient; it is assumed a Swift kernel would initialize the TBI
1159 // bits so that is still true.
1160 if (HasFP && AFI->hasSwiftAsyncContext()) {
1161 // ORR x29, x29, #0x1000_0000_0000_0000
1162 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
1163 .addUse(AArch64::FP)
1164 .addImm(0x1100)
1165 .setMIFlag(MachineInstr::FrameSetup);
1168 // All calls are tail calls in GHC calling conv, and functions have no
1169 // prologue/epilogue.
1170 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1171 return;
1173 // Set tagged base pointer to the requested stack slot.
1174 // Ideally it should match SP value after prologue.
1175 Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
1176 if (TBPI)
1177 AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
1178 else
1179 AFI->setTaggedBasePointerOffset(MFI.getStackSize());
1181 const StackOffset &SVEStackSize = getSVEStackSize(MF);
1183 // getStackSize() includes all the locals in its size calculation. We don't
1184 // include these locals when computing the stack size of a funclet, as they
1185 // are allocated in the parent's stack frame and accessed via the frame
1186 // pointer from the funclet. We only save the callee saved registers in the
1187 // funclet, which are really the callee saved registers of the parent
1188 // function, including the funclet.
1189 int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1190 : MFI.getStackSize();
1191 if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
1192 assert(!HasFP && "unexpected function without stack frame but with FP");
1193 assert(!SVEStackSize &&
1194 "unexpected function without stack frame but with SVE objects");
1195 // All of the stack allocation is for locals.
1196 AFI->setLocalStackSize(NumBytes);
1197 if (!NumBytes)
1198 return;
1199 // REDZONE: If the stack size is less than 128 bytes, we don't need
1200 // to actually allocate.
1201 if (canUseRedZone(MF)) {
1202 AFI->setHasRedZone(true);
1203 ++NumRedZoneFunctions;
1204 } else {
1205 emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1206 StackOffset::getFixed(-NumBytes), TII,
1207 MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1208 if (!NeedsWinCFI && needsFrameMoves) {
1209 // Label used to tie together the PROLOG_LABEL and the MachineMoves.
1210 MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
1211 // Encode the stack size of the leaf function.
1212 unsigned CFIIndex = MF.addFrameInst(
1213 MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
1214 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1215 .addCFIIndex(CFIIndex)
1216 .setMIFlags(MachineInstr::FrameSetup);
1220 if (NeedsWinCFI) {
1221 HasWinCFI = true;
1222 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1223 .setMIFlag(MachineInstr::FrameSetup);
1226 return;
1229 bool IsWin64 =
1230 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1231 unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1233 auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1234 // All of the remaining stack allocations are for locals.
1235 AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1236 bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
1237 bool HomPrologEpilog = homogeneousPrologEpilog(MF);
1238 if (CombineSPBump) {
1239 assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1240 emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1241 StackOffset::getFixed(-NumBytes), TII,
1242 MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1243 NumBytes = 0;
1244 } else if (HomPrologEpilog) {
1245 // Stack has been already adjusted.
1246 NumBytes -= PrologueSaveSize;
1247 } else if (PrologueSaveSize != 0) {
1248 MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
1249 MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
1250 NumBytes -= PrologueSaveSize;
1252 assert(NumBytes >= 0 && "Negative stack allocation size!?");
1254 // Move past the saves of the callee-saved registers, fixing up the offsets
1255 // and pre-inc if we decided to combine the callee-save and local stack
1256 // pointer bump above.
1257 MachineBasicBlock::iterator End = MBB.end();
1258 while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
1259 !IsSVECalleeSave(MBBI)) {
1260 if (CombineSPBump)
1261 fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
1262 NeedsWinCFI, &HasWinCFI);
1263 ++MBBI;
1266 // For funclets the FP belongs to the containing function.
1267 if (!IsFunclet && HasFP) {
1268 // Only set up FP if we actually need to.
1269 int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
1271 if (CombineSPBump)
1272 FPOffset += AFI->getLocalStackSize();
1274 if (AFI->hasSwiftAsyncContext()) {
1275 // Before we update the live FP we have to ensure there's a valid (or
1276 // null) asynchronous context in its slot just before FP in the frame
1277 // record, so store it now.
1278 const auto &Attrs = MF.getFunction().getAttributes();
1279 bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
1280 if (HaveInitialContext)
1281 MBB.addLiveIn(AArch64::X22);
1282 BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
1283 .addUse(HaveInitialContext ? AArch64::X22 : AArch64::XZR)
1284 .addUse(AArch64::SP)
1285 .addImm(FPOffset - 8)
1286 .setMIFlags(MachineInstr::FrameSetup);
1289 if (HomPrologEpilog) {
1290 auto Prolog = MBBI;
1291 --Prolog;
1292 assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
1293 Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
1294 } else {
1295 // Issue sub fp, sp, FPOffset or
1296 // mov fp,sp when FPOffset is zero.
1297 // Note: All stores of callee-saved registers are marked as "FrameSetup".
1298 // This code marks the instruction(s) that set the FP also.
1299 emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
1300 StackOffset::getFixed(FPOffset), TII,
1301 MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1305 if (windowsRequiresStackProbe(MF, NumBytes)) {
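// __chkstk expects the number of 16-byte units to probe in x15; the actual
// allocation is performed afterwards by the 'sub sp, sp, x15, uxtx #4' below.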
1306 uint64_t NumWords = NumBytes >> 4;
1307 if (NeedsWinCFI) {
1308 HasWinCFI = true;
1309 // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
1310 // exceed this amount. We need to move at most 2^24 - 1 into x15.
1311 // This is at most two instructions, MOVZ followed by MOVK.
1312 // TODO: Fix to use multiple stack alloc unwind codes for stacks
1313 // exceeding 256MB in size.
1314 if (NumBytes >= (1 << 28))
1315 report_fatal_error("Stack size cannot exceed 256MB for stack "
1316 "unwinding purposes");
1318 uint32_t LowNumWords = NumWords & 0xFFFF;
1319 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
1320 .addImm(LowNumWords)
1321 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1322 .setMIFlag(MachineInstr::FrameSetup);
1323 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1324 .setMIFlag(MachineInstr::FrameSetup);
1325 if ((NumWords & 0xFFFF0000) != 0) {
1326 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
1327 .addReg(AArch64::X15)
1328 .addImm((NumWords & 0xFFFF0000) >> 16) // High half
1329 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
1330 .setMIFlag(MachineInstr::FrameSetup);
1331 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1332 .setMIFlag(MachineInstr::FrameSetup);
1334 } else {
1335 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
1336 .addImm(NumWords)
1337 .setMIFlags(MachineInstr::FrameSetup);
1340 switch (MF.getTarget().getCodeModel()) {
1341 case CodeModel::Tiny:
1342 case CodeModel::Small:
1343 case CodeModel::Medium:
1344 case CodeModel::Kernel:
1345 BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
1346 .addExternalSymbol("__chkstk")
1347 .addReg(AArch64::X15, RegState::Implicit)
1348 .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1349 .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1350 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1351 .setMIFlags(MachineInstr::FrameSetup);
1352 if (NeedsWinCFI) {
1353 HasWinCFI = true;
1354 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1355 .setMIFlag(MachineInstr::FrameSetup);
1357 break;
1358 case CodeModel::Large:
1359 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
1360 .addReg(AArch64::X16, RegState::Define)
1361 .addExternalSymbol("__chkstk")
1362 .addExternalSymbol("__chkstk")
1363 .setMIFlags(MachineInstr::FrameSetup);
1364 if (NeedsWinCFI) {
1365 HasWinCFI = true;
1366 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1367 .setMIFlag(MachineInstr::FrameSetup);
1370 BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
1371 .addReg(AArch64::X16, RegState::Kill)
1372 .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
1373 .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
1374 .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
1375 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
1376 .setMIFlags(MachineInstr::FrameSetup);
1377 if (NeedsWinCFI) {
1378 HasWinCFI = true;
1379 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1380 .setMIFlag(MachineInstr::FrameSetup);
1382 break;
1385 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
1386 .addReg(AArch64::SP, RegState::Kill)
1387 .addReg(AArch64::X15, RegState::Kill)
1388 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
1389 .setMIFlags(MachineInstr::FrameSetup);
1390 if (NeedsWinCFI) {
1391 HasWinCFI = true;
1392 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1393 .addImm(NumBytes)
1394 .setMIFlag(MachineInstr::FrameSetup);
1396 NumBytes = 0;
1399 StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
1400 MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
1402 // Process the SVE callee-saves to determine what space needs to be
1403 // allocated.
1404 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1405 // Find callee save instructions in frame.
1406 CalleeSavesBegin = MBBI;
1407 assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
1408 while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
1409 ++MBBI;
1410 CalleeSavesEnd = MBBI;
1412 AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
1413 AllocateAfter = SVEStackSize - AllocateBefore;
1416 // Allocate space for the callee saves (if any).
1417 emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
1418 -AllocateBefore, TII,
1419 MachineInstr::FrameSetup);
1421 // Finally allocate remaining SVE stack space.
1422 emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
1423 -AllocateAfter, TII,
1424 MachineInstr::FrameSetup);
1426 // Allocate space for the rest of the frame.
1427 if (NumBytes) {
1428 // Alignment is required for the parent frame, not the funclet
1429 const bool NeedsRealignment =
1430 !IsFunclet && RegInfo->hasStackRealignment(MF);
1431 unsigned scratchSPReg = AArch64::SP;
1433 if (NeedsRealignment) {
1434 scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
1435 assert(scratchSPReg != AArch64::NoRegister);
1438 // If we're a leaf function, try using the red zone.
1439 if (!canUseRedZone(MF))
1440 // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
1441 // the correct value here, as NumBytes also includes padding bytes,
1442 // which shouldn't be counted here.
1443 emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
1444 StackOffset::getFixed(-NumBytes), TII,
1445 MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1447 if (NeedsRealignment) {
1448 const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
1449 assert(NrBitsToZero > 1);
1450 assert(scratchSPReg != AArch64::SP);
1452 // SUB X9, SP, NumBytes
1453 // -- X9 is a temporary register, so shouldn't contain any live data here,
1454 // -- free to use. This is already produced by emitFrameOffset above.
1455 // AND SP, X9, 0b11111...0000
1456 // The logical immediates have a non-trivial encoding. The following
1457 // formula computes the encoded immediate with all ones but
1458 // NrBitsToZero zero bits as least significant bits.
1459 uint32_t andMaskEncoded = (1 << 12) // = N
1460 | ((64 - NrBitsToZero) << 6) // immr
1461 | ((64 - NrBitsToZero - 1) << 0); // imms
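// For example, with MaxAlign = 32 (NrBitsToZero = 5) this yields N=1, immr=59,
// imms=58, which encodes the mask 0xFFFFFFFFFFFFFFE0 (clear the low 5 bits).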
1463 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
1464 .addReg(scratchSPReg, RegState::Kill)
1465 .addImm(andMaskEncoded);
1466 AFI->setStackRealigned(true);
1467 if (NeedsWinCFI) {
1468 HasWinCFI = true;
1469 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
1470 .addImm(NumBytes & andMaskEncoded)
1471 .setMIFlag(MachineInstr::FrameSetup);
1476 // If we need a base pointer, set it up here. It's whatever the value of the
1477 // stack pointer is at this point. Any variable size objects will be allocated
1478 // after this, so we can still use the base pointer to reference locals.
1480 // FIXME: Clarify FrameSetup flags here.
1481 // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
1482 // needed.
1483 // For funclets the BP belongs to the containing function.
1484 if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
1485 TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
1486 false);
1487 if (NeedsWinCFI) {
1488 HasWinCFI = true;
1489 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1490 .setMIFlag(MachineInstr::FrameSetup);
1494 // The very last FrameSetup instruction indicates the end of prologue. Emit a
1495 // SEH opcode indicating the prologue end.
1496 if (NeedsWinCFI && HasWinCFI) {
1497 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1498 .setMIFlag(MachineInstr::FrameSetup);
1501 // SEH funclets are passed the frame pointer in X1. If the parent
1502 // function uses the base register, then the base register is used
1503 // directly, and is not retrieved from X1.
1504 if (IsFunclet && F.hasPersonalityFn()) {
1505 EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
1506 if (isAsynchronousEHPersonality(Per)) {
1507 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
1508 .addReg(AArch64::X1)
1509 .setMIFlag(MachineInstr::FrameSetup);
1510 MBB.addLiveIn(AArch64::X1);
1514 if (needsFrameMoves) {
1515 // An example of the prologue:
1517 // .globl __foo
1518 // .align 2
1519 // __foo:
1520 // Ltmp0:
1521 // .cfi_startproc
1522 // .cfi_personality 155, ___gxx_personality_v0
1523 // Leh_func_begin:
1524 // .cfi_lsda 16, Lexception33
1526 // stp xa,bx, [sp, -#offset]!
1527 // ...
1528 // stp x28, x27, [sp, #offset-32]
1529 // stp fp, lr, [sp, #offset-16]
1530 // add fp, sp, #offset - 16
1531 // sub sp, sp, #1360
1533 // The Stack:
1534 // +-------------------------------------------+
1535 // 10000 | ........ | ........ | ........ | ........ |
1536 // 10004 | ........ | ........ | ........ | ........ |
1537 // +-------------------------------------------+
1538 // 10008 | ........ | ........ | ........ | ........ |
1539 // 1000c | ........ | ........ | ........ | ........ |
1540 // +===========================================+
1541 // 10010 | X28 Register |
1542 // 10014 | X28 Register |
1543 // +-------------------------------------------+
1544 // 10018 | X27 Register |
1545 // 1001c | X27 Register |
1546 // +===========================================+
1547 // 10020 | Frame Pointer |
1548 // 10024 | Frame Pointer |
1549 // +-------------------------------------------+
1550 // 10028 | Link Register |
1551 // 1002c | Link Register |
1552 // +===========================================+
1553 // 10030 | ........ | ........ | ........ | ........ |
1554 // 10034 | ........ | ........ | ........ | ........ |
1555 // +-------------------------------------------+
1556 // 10038 | ........ | ........ | ........ | ........ |
1557 // 1003c | ........ | ........ | ........ | ........ |
1558 // +-------------------------------------------+
1560 // [sp] = 10030 :: >>initial value<<
1561 // sp = 10020 :: stp fp, lr, [sp, #-16]!
1562 // fp = sp == 10020 :: mov fp, sp
1563 // [sp] == 10020 :: stp x28, x27, [sp, #-16]!
1564 // sp == 10010 :: >>final value<<
1566 // The frame pointer (w29) points to address 10020. If we use an offset of
1567 // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
1568 // for w27, and -32 for w28:
1570 // Ltmp1:
1571 // .cfi_def_cfa w29, 16
1572 // Ltmp2:
1573 // .cfi_offset w30, -8
1574 // Ltmp3:
1575 // .cfi_offset w29, -16
1576 // Ltmp4:
1577 // .cfi_offset w27, -24
1578 // Ltmp5:
1579 // .cfi_offset w28, -32
1581 if (HasFP) {
1582 const int OffsetToFirstCalleeSaveFromFP =
1583 AFI->getCalleeSaveBaseToFrameRecordOffset() -
1584 AFI->getCalleeSavedStackSize();
1585 Register FramePtr = RegInfo->getFrameRegister(MF);
1587 // Define the current CFA rule to use the provided FP.
1588 unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
1589 unsigned CFIIndex = MF.addFrameInst(
1590 MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
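      // As a worked example tying this to the diagram above (assuming
      // FixedObject == 0, CalleeSavedStackSize == 32 and the frame record 16
      // bytes above the callee-save base): 0 - (16 - 32) == 16, which emits
      // ".cfi_def_cfa w29, 16".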
1591 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1592 .addCFIIndex(CFIIndex)
1593 .setMIFlags(MachineInstr::FrameSetup);
1594 } else {
1595 unsigned CFIIndex;
1596 if (SVEStackSize) {
1597 const TargetSubtargetInfo &STI = MF.getSubtarget();
1598 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
1599 StackOffset TotalSize =
1600 SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
1601 CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize));
1602 } else {
1603 // Encode the stack size of the leaf function.
1604 CFIIndex = MF.addFrameInst(
1605 MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
1607 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1608 .addCFIIndex(CFIIndex)
1609 .setMIFlags(MachineInstr::FrameSetup);
1612 // Now emit the moves for whatever callee saved regs we have (including FP,
1613 // LR if those are saved).
1614 emitCalleeSavedFrameMoves(MBB, MBBI);
1618 static void InsertReturnAddressAuth(MachineFunction &MF,
1619 MachineBasicBlock &MBB) {
1620 const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
1621 if (!MFI.shouldSignReturnAddress())
1622 return;
1623 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1624 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1626 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
1627 DebugLoc DL;
1628 if (MBBI != MBB.end())
1629 DL = MBBI->getDebugLoc();
1631 // The AUTIASP instruction assembles to a hint instruction before v8.3a so
1632   // this instruction can safely be used for any v8a architecture.
1633 // From v8.3a onwards there are optimised authenticate LR and return
1634 // instructions, namely RETA{A,B}, that can be used instead.
1635 if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
1636 MBBI->getOpcode() == AArch64::RET_ReallyLR) {
1637 BuildMI(MBB, MBBI, DL,
1638 TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA))
1639 .copyImplicitOps(*MBBI);
1640 MBB.erase(MBBI);
1641 } else {
1642 BuildMI(
1643 MBB, MBBI, DL,
1644 TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
1645 .setMIFlag(MachineInstr::FrameDestroy);
1649 static bool isFuncletReturnInstr(const MachineInstr &MI) {
1650 switch (MI.getOpcode()) {
1651 default:
1652 return false;
1653 case AArch64::CATCHRET:
1654 case AArch64::CLEANUPRET:
1655 return true;
1659 void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
1660 MachineBasicBlock &MBB) const {
1661 MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
1662 MachineFrameInfo &MFI = MF.getFrameInfo();
1663 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1664 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1665 DebugLoc DL;
1666 bool NeedsWinCFI = needsWinCFI(MF);
1667 bool HasWinCFI = false;
1668 bool IsFunclet = false;
1669 auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); });
1671 if (MBB.end() != MBBI) {
1672 DL = MBBI->getDebugLoc();
1673 IsFunclet = isFuncletReturnInstr(*MBBI);
1676 int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
1677 : MFI.getStackSize();
1678 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1680 // All calls are tail calls in GHC calling conv, and functions have no
1681 // prologue/epilogue.
1682 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
1683 return;
1685 // How much of the stack used by incoming arguments this function is expected
1686 // to restore in this particular epilogue.
1687 int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
1689 // The stack frame should be like below,
1691 // ---------------------- ---
1692 // | | |
1693 // | BytesInStackArgArea| CalleeArgStackSize
1694 // | (NumReusableBytes) | (of tail call)
1695 // | | ---
1696 // | | |
1697 // ---------------------| --- |
1698 // | | | |
1699 // | CalleeSavedReg | | |
1700 // | (CalleeSavedStackSize)| | |
1701 // | | | |
1702 // ---------------------| | NumBytes
1703 // | | StackSize (StackAdjustUp)
1704 // | LocalStackSize | | |
1705 // | (covering callee | | |
1706 // | args) | | |
1707 // | | | |
1708 // ---------------------- --- ---
1710 // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
1711 // = StackSize + ArgumentPopSize
1713 // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
1714 // it as the 2nd argument of AArch64ISD::TC_RETURN.
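  // Illustrative numbers only: with StackSize == 64, BytesInStackArgArea == 16
  // and a tail call that passes no stack arguments, ArgumentPopSize == 16 and
  // NumBytes == 80.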
1716 auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); });
1718 bool IsWin64 =
1719 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1720 unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1722 int64_t AfterCSRPopSize = ArgumentStackToRestore;
1723 auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1724 // We cannot rely on the local stack size set in emitPrologue if the function
1725 // has funclets, as funclets have different local stack size requirements, and
1726 // the current value set in emitPrologue may be that of the containing
1727 // function.
1728 if (MF.hasEHFunclets())
1729 AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1730 if (homogeneousPrologEpilog(MF, &MBB)) {
1731 assert(!NeedsWinCFI);
1732 auto LastPopI = MBB.getFirstTerminator();
1733 if (LastPopI != MBB.begin()) {
1734 auto HomogeneousEpilog = std::prev(LastPopI);
1735 if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
1736 LastPopI = HomogeneousEpilog;
1739 // Adjust local stack
1740 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1741 StackOffset::getFixed(AFI->getLocalStackSize()), TII,
1742 MachineInstr::FrameDestroy, false, NeedsWinCFI);
1744     // SP has already been adjusted while restoring callee save regs.
1745     // We've already bailed out of the case that adjusts SP for arguments.
1746 assert(AfterCSRPopSize == 0);
1747 return;
1749 bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
1750 // Assume we can't combine the last pop with the sp restore.
1752 if (!CombineSPBump && PrologueSaveSize != 0) {
1753 MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
1754 while (AArch64InstrInfo::isSEHInstruction(*Pop))
1755 Pop = std::prev(Pop);
1756 // Converting the last ldp to a post-index ldp is valid only if the last
1757 // ldp's offset is 0.
1758 const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
1759 // If the offset is 0 and the AfterCSR pop is not actually trying to
1760 // allocate more stack for arguments (in space that an untimely interrupt
1761 // may clobber), convert it to a post-index ldp.
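    // For instance, "ldp x22, x21, [sp]" followed by "add sp, sp, #48" can be
    // folded into "ldp x22, x21, [sp], #48" (with #48 standing in for
    // PrologueSaveSize in this sketch).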
1762 if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0)
1763 convertCalleeSaveRestoreToSPPrePostIncDec(
1764 MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
1765 else {
1766 // If not, make sure to emit an add after the last ldp.
1767       // We're doing this by transferring the size to be restored from the
1768 // adjustment *before* the CSR pops to the adjustment *after* the CSR
1769 // pops.
1770 AfterCSRPopSize += PrologueSaveSize;
1774 // Move past the restores of the callee-saved registers.
1775 // If we plan on combining the sp bump of the local stack size and the callee
1776 // save stack size, we might need to adjust the CSR save and restore offsets.
1777 MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
1778 MachineBasicBlock::iterator Begin = MBB.begin();
1779 while (LastPopI != Begin) {
1780 --LastPopI;
1781 if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
1782 IsSVECalleeSave(LastPopI)) {
1783 ++LastPopI;
1784 break;
1785 } else if (CombineSPBump)
1786 fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
1787 NeedsWinCFI, &HasWinCFI);
1790 if (MF.hasWinCFI()) {
1791 // If the prologue didn't contain any SEH opcodes and didn't set the
1792 // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the
1793 // EpilogStart - to avoid generating CFI for functions that don't need it.
1794 // (And as we didn't generate any prologue at all, it would be asymmetrical
1795 // to the epilogue.) By the end of the function, we assert that
1796 // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption.
1797 HasWinCFI = true;
1798 BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
1799 .setMIFlag(MachineInstr::FrameDestroy);
1802 if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
1803 // We need to reset FP to its untagged state on return. Bit 60 is currently
1804 // used to show the presence of an extended frame.
1806 // BIC x29, x29, #0x1000_0000_0000_0000
1807 BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
1808 AArch64::FP)
1809 .addUse(AArch64::FP)
1810 .addImm(0x10fe)
1811 .setMIFlag(MachineInstr::FrameDestroy);
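    // (The logical immediate 0x10fe decodes as N=1, immr=3, imms=62: 63 ones
    // rotated right by 3, i.e. ~(1ULL << 60), so the AND clears only bit 60.)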
1814 const StackOffset &SVEStackSize = getSVEStackSize(MF);
1816 // If there is a single SP update, insert it before the ret and we're done.
1817 if (CombineSPBump) {
1818 assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1819 emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
1820 StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
1821 TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
1822 &HasWinCFI);
1823 if (HasWinCFI)
1824 BuildMI(MBB, MBB.getFirstTerminator(), DL,
1825 TII->get(AArch64::SEH_EpilogEnd))
1826 .setMIFlag(MachineInstr::FrameDestroy);
1827 return;
1830 NumBytes -= PrologueSaveSize;
1831 assert(NumBytes >= 0 && "Negative stack allocation size!?");
1833 // Process the SVE callee-saves to determine what space needs to be
1834 // deallocated.
1835 StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
1836 MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
1837 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
1838 RestoreBegin = std::prev(RestoreEnd);
1839 while (RestoreBegin != MBB.begin() &&
1840 IsSVECalleeSave(std::prev(RestoreBegin)))
1841 --RestoreBegin;
1843 assert(IsSVECalleeSave(RestoreBegin) &&
1844 IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
1846 StackOffset CalleeSavedSizeAsOffset =
1847 StackOffset::getScalable(CalleeSavedSize);
1848 DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
1849 DeallocateAfter = CalleeSavedSizeAsOffset;
1852 // Deallocate the SVE area.
1853 if (SVEStackSize) {
1854 if (AFI->isStackRealigned()) {
1855 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
1856 // Set SP to start of SVE callee-save area from which they can
1857 // be reloaded. The code below will deallocate the stack space
1858         // by moving FP -> SP.
1859 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
1860 StackOffset::getScalable(-CalleeSavedSize), TII,
1861 MachineInstr::FrameDestroy);
1862 } else {
1863 if (AFI->getSVECalleeSavedStackSize()) {
1864 // Deallocate the non-SVE locals first before we can deallocate (and
1865 // restore callee saves) from the SVE area.
1866 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1867 StackOffset::getFixed(NumBytes), TII,
1868 MachineInstr::FrameDestroy);
1869 NumBytes = 0;
1872 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
1873 DeallocateBefore, TII, MachineInstr::FrameDestroy);
1875 emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
1876 DeallocateAfter, TII, MachineInstr::FrameDestroy);
1880 if (!hasFP(MF)) {
1881 bool RedZone = canUseRedZone(MF);
1882 // If this was a redzone leaf function, we don't need to restore the
1883 // stack pointer (but we may need to pop stack args for fastcc).
1884 if (RedZone && AfterCSRPopSize == 0)
1885 return;
1887 bool NoCalleeSaveRestore = PrologueSaveSize == 0;
1888 int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
1889 if (NoCalleeSaveRestore)
1890 StackRestoreBytes += AfterCSRPopSize;
1892 // If we were able to combine the local stack pop with the argument pop,
1893 // then we're done.
1894 bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
1896 // If we're done after this, make sure to help the load store optimizer.
1897 if (Done)
1898 adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
1900 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1901 StackOffset::getFixed(StackRestoreBytes), TII,
1902 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1903 if (Done) {
1904 if (HasWinCFI) {
1905 BuildMI(MBB, MBB.getFirstTerminator(), DL,
1906 TII->get(AArch64::SEH_EpilogEnd))
1907 .setMIFlag(MachineInstr::FrameDestroy);
1909 return;
1912 NumBytes = 0;
1915 // Restore the original stack pointer.
1916 // FIXME: Rather than doing the math here, we should instead just use
1917 // non-post-indexed loads for the restores if we aren't actually going to
1918 // be able to save any instructions.
1919 if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
1920 emitFrameOffset(
1921 MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
1922 StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
1923 TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
1924 } else if (NumBytes)
1925 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
1926 StackOffset::getFixed(NumBytes), TII,
1927 MachineInstr::FrameDestroy, false, NeedsWinCFI);
1929 // This must be placed after the callee-save restore code because that code
1930 // assumes the SP is at the same location as it was after the callee-save save
1931 // code in the prologue.
1932 if (AfterCSRPopSize) {
1933 assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
1934 "interrupt may have clobbered");
1935 // Find an insertion point for the first ldp so that it goes before the
1936 // shadow call stack epilog instruction. This ensures that the restore of
1937 // lr from x18 is placed after the restore from sp.
1938 auto FirstSPPopI = MBB.getFirstTerminator();
1939 while (FirstSPPopI != Begin) {
1940 auto Prev = std::prev(FirstSPPopI);
1941 if (Prev->getOpcode() != AArch64::LDRXpre ||
1942 Prev->getOperand(0).getReg() == AArch64::SP)
1943 break;
1944 FirstSPPopI = Prev;
1947 adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
1949 emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
1950 StackOffset::getFixed(AfterCSRPopSize), TII,
1951 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
1953 if (HasWinCFI)
1954 BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
1955 .setMIFlag(MachineInstr::FrameDestroy);
1958 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
1959 /// debug info. It's the same as what we use for resolving the code-gen
1960 /// references for now. FIXME: This can go wrong when references are
1961 /// SP-relative and simple call frames aren't used.
1962 StackOffset
1963 AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
1964 Register &FrameReg) const {
1965 return resolveFrameIndexReference(
1966 MF, FI, FrameReg,
1967 /*PreferFP=*/
1968 MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress),
1969 /*ForSimm=*/false);
1972 StackOffset
1973 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
1974 int FI) const {
1975 return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
1978 static StackOffset getFPOffset(const MachineFunction &MF,
1979 int64_t ObjectOffset) {
1980 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
1981 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1982 bool IsWin64 =
1983 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
1984 unsigned FixedObject =
1985 getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
1986 int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
1987 int64_t FPAdjust =
1988 CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
1989 return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
1992 static StackOffset getStackOffset(const MachineFunction &MF,
1993 int64_t ObjectOffset) {
1994 const auto &MFI = MF.getFrameInfo();
1995 return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
1998 // TODO: This function currently does not work for scalable vectors.
1999 int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
2000 int FI) const {
2001 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2002 MF.getSubtarget().getRegisterInfo());
2003 int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
2004 return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
2005 ? getFPOffset(MF, ObjectOffset).getFixed()
2006 : getStackOffset(MF, ObjectOffset).getFixed();
2009 StackOffset AArch64FrameLowering::resolveFrameIndexReference(
2010 const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
2011 bool ForSimm) const {
2012 const auto &MFI = MF.getFrameInfo();
2013 int64_t ObjectOffset = MFI.getObjectOffset(FI);
2014 bool isFixed = MFI.isFixedObjectIndex(FI);
2015 bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
2016 return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
2017 PreferFP, ForSimm);
2020 StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
2021 const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
2022 Register &FrameReg, bool PreferFP, bool ForSimm) const {
2023 const auto &MFI = MF.getFrameInfo();
2024 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2025 MF.getSubtarget().getRegisterInfo());
2026 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2027 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2029 int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
2030 int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
2031 bool isCSR =
2032 !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
2034 const StackOffset &SVEStackSize = getSVEStackSize(MF);
2036 // Use frame pointer to reference fixed objects. Use it for locals if
2037 // there are VLAs or a dynamically realigned SP (and thus the SP isn't
2038 // reliable as a base). Make sure useFPForScavengingIndex() does the
2039 // right thing for the emergency spill slot.
2040 bool UseFP = false;
2041 if (AFI->hasStackFrame() && !isSVE) {
2042 // We shouldn't prefer using the FP when there is an SVE area
2043 // in between the FP and the non-SVE locals/spills.
2044 PreferFP &= !SVEStackSize;
2046 // Note: Keeping the following as multiple 'if' statements rather than
2047 // merging to a single expression for readability.
2049 // Argument access should always use the FP.
2050 if (isFixed) {
2051 UseFP = hasFP(MF);
2052 } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
2053 // References to the CSR area must use FP if we're re-aligning the stack
2054 // since the dynamically-sized alignment padding is between the SP/BP and
2055 // the CSR area.
2056 assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
2057 UseFP = true;
2058 } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
2059 // If the FPOffset is negative and we're producing a signed immediate, we
2060 // have to keep in mind that the available offset range for negative
2061 // offsets is smaller than for positive ones. If an offset is available
2062 // via the FP and the SP, use whichever is closest.
2063 bool FPOffsetFits = !ForSimm || FPOffset >= -256;
2064 PreferFP |= Offset > -FPOffset;
2066 if (MFI.hasVarSizedObjects()) {
2067 // If we have variable sized objects, we can use either FP or BP, as the
2068 // SP offset is unknown. We can use the base pointer if we have one and
2069 // FP is not preferred. If not, we're stuck with using FP.
2070 bool CanUseBP = RegInfo->hasBasePointer(MF);
2071 if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
2072 UseFP = PreferFP;
2073 else if (!CanUseBP) // Can't use BP. Forced to use FP.
2074 UseFP = true;
2075 // else we can use BP and FP, but the offset from FP won't fit.
2076 // That will make us scavenge registers which we can probably avoid by
2077 // using BP. If it won't fit for BP either, we'll scavenge anyway.
2078 } else if (FPOffset >= 0) {
2079 // Use SP or FP, whichever gives us the best chance of the offset
2080 // being in range for direct access. If the FPOffset is positive,
2081 // that'll always be best, as the SP will be even further away.
2082 UseFP = true;
2083 } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
2084 // Funclets access the locals contained in the parent's stack frame
2085 // via the frame pointer, so we have to use the FP in the parent
2086 // function.
2087 (void) Subtarget;
2088 assert(
2089 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
2090 "Funclets should only be present on Win64");
2091 UseFP = true;
2092 } else {
2093 // We have the choice between FP and (SP or BP).
2094 if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
2095 UseFP = true;
2100 assert(
2101 ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
2102 "In the presence of dynamic stack pointer realignment, "
2103 "non-argument/CSR objects cannot be accessed through the frame pointer");
2105 if (isSVE) {
2106 StackOffset FPOffset =
2107 StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
2108 StackOffset SPOffset =
2109 SVEStackSize +
2110 StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
2111 ObjectOffset);
2112 // Always use the FP for SVE spills if available and beneficial.
2113 if (hasFP(MF) && (SPOffset.getFixed() ||
2114 FPOffset.getScalable() < SPOffset.getScalable() ||
2115 RegInfo->hasStackRealignment(MF))) {
2116 FrameReg = RegInfo->getFrameRegister(MF);
2117 return FPOffset;
2120 FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
2121 : (unsigned)AArch64::SP;
2122 return SPOffset;
2125 StackOffset ScalableOffset = {};
2126 if (UseFP && !(isFixed || isCSR))
2127 ScalableOffset = -SVEStackSize;
2128 if (!UseFP && (isFixed || isCSR))
2129 ScalableOffset = SVEStackSize;
2131 if (UseFP) {
2132 FrameReg = RegInfo->getFrameRegister(MF);
2133 return StackOffset::getFixed(FPOffset) + ScalableOffset;
2136 // Use the base pointer if we have one.
2137 if (RegInfo->hasBasePointer(MF))
2138 FrameReg = RegInfo->getBaseRegister();
2139 else {
2140 assert(!MFI.hasVarSizedObjects() &&
2141 "Can't use SP when we have var sized objects.");
2142 FrameReg = AArch64::SP;
2143 // If we're using the red zone for this function, the SP won't actually
2144 // be adjusted, so the offsets will be negative. They're also all
2145 // within range of the signed 9-bit immediate instructions.
2146 if (canUseRedZone(MF))
2147 Offset -= AFI->getLocalStackSize();
2150 return StackOffset::getFixed(Offset) + ScalableOffset;
2153 static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
2154 // Do not set a kill flag on values that are also marked as live-in. This
2155   // happens with the @llvm.returnaddress intrinsic and with arguments passed in
2156 // callee saved registers.
2157 // Omitting the kill flags is conservatively correct even if the live-in
2158 // is not used after all.
2159 bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
2160 return getKillRegState(!IsLiveIn);
2163 static bool produceCompactUnwindFrame(MachineFunction &MF) {
2164 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2165 AttributeList Attrs = MF.getFunction().getAttributes();
2166 return Subtarget.isTargetMachO() &&
2167 !(Subtarget.getTargetLowering()->supportSwiftError() &&
2168 Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
2169 MF.getFunction().getCallingConv() != CallingConv::SwiftTail;
2172 static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
2173 bool NeedsWinCFI, bool IsFirst) {
2174 // If we are generating register pairs for a Windows function that requires
2175 // EH support, then pair consecutive registers only. There are no unwind
2176   // opcodes for saves/restores of non-consecutive register pairs.
2177   // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
2178 // save_lrpair.
2179 // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
2181 if (Reg2 == AArch64::FP)
2182 return true;
2183 if (!NeedsWinCFI)
2184 return false;
2185 if (Reg2 == Reg1 + 1)
2186 return false;
2187 // If pairing a GPR with LR, the pair can be described by the save_lrpair
2188 // opcode. If this is the first register pair, it would end up with a
2189 // predecrement, but there's no save_lrpair_x opcode, so we can only do this
2190   // if LR is paired with something other than the first register.
2191 // The save_lrpair opcode requires the first register to be an odd one.
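  // For example, a non-first (x21, lr) pair can be described by save_lrpair,
  // whereas (x20, lr) cannot and is kept unpaired.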
2192 if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
2193 (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
2194 return false;
2195 return true;
2198 /// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
2199 /// WindowsCFI requires that only consecutive registers can be paired.
2200 /// LR and FP need to be allocated together when the frame needs to save
2201 /// the frame-record. This means any other register pairing with LR is invalid.
2202 static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
2203 bool UsesWinAAPCS, bool NeedsWinCFI,
2204 bool NeedsFrameRecord, bool IsFirst) {
2205 if (UsesWinAAPCS)
2206 return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst);
2208 // If we need to store the frame record, don't pair any register
2209 // with LR other than FP.
2210 if (NeedsFrameRecord)
2211 return Reg2 == AArch64::LR;
2213 return false;
2216 namespace {
2218 struct RegPairInfo {
2219 unsigned Reg1 = AArch64::NoRegister;
2220 unsigned Reg2 = AArch64::NoRegister;
2221 int FrameIdx;
2222 int Offset;
2223 enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
2225 RegPairInfo() = default;
2227 bool isPaired() const { return Reg2 != AArch64::NoRegister; }
2229 unsigned getScale() const {
2230 switch (Type) {
2231 case PPR:
2232 return 2;
2233 case GPR:
2234 case FPR64:
2235 return 8;
2236 case ZPR:
2237 case FPR128:
2238 return 16;
2240 llvm_unreachable("Unsupported type");
2243 bool isScalable() const { return Type == PPR || Type == ZPR; }
2246 } // end anonymous namespace
2248 static void computeCalleeSaveRegisterPairs(
2249 MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
2250 const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
2251 bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
2253 if (CSI.empty())
2254 return;
2256 bool IsWindows = isTargetWindows(MF);
2257 bool NeedsWinCFI = needsWinCFI(MF);
2258 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2259 MachineFrameInfo &MFI = MF.getFrameInfo();
2260 CallingConv::ID CC = MF.getFunction().getCallingConv();
2261 unsigned Count = CSI.size();
2262 (void)CC;
2263 // MachO's compact unwind format relies on all registers being stored in
2264 // pairs.
2265 assert((!produceCompactUnwindFrame(MF) ||
2266 CC == CallingConv::PreserveMost ||
2267 (Count & 1) == 0) &&
2268 "Odd number of callee-saved regs to spill!");
2269 int ByteOffset = AFI->getCalleeSavedStackSize();
2270 int StackFillDir = -1;
2271 int RegInc = 1;
2272 unsigned FirstReg = 0;
2273 if (NeedsWinCFI) {
2274 // For WinCFI, fill the stack from the bottom up.
2275 ByteOffset = 0;
2276 StackFillDir = 1;
2277 // As the CSI array is reversed to match PrologEpilogInserter, iterate
2278 // backwards, to pair up registers starting from lower numbered registers.
2279 RegInc = -1;
2280 FirstReg = Count - 1;
2282 int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
2283 bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
2285 // When iterating backwards, the loop condition relies on unsigned wraparound.
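  // (With RegInc == -1, decrementing i past zero wraps it to a value >= Count,
  // which terminates the loop.)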
2286 for (unsigned i = FirstReg; i < Count; i += RegInc) {
2287 RegPairInfo RPI;
2288 RPI.Reg1 = CSI[i].getReg();
2290 if (AArch64::GPR64RegClass.contains(RPI.Reg1))
2291 RPI.Type = RegPairInfo::GPR;
2292 else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
2293 RPI.Type = RegPairInfo::FPR64;
2294 else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
2295 RPI.Type = RegPairInfo::FPR128;
2296 else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
2297 RPI.Type = RegPairInfo::ZPR;
2298 else if (AArch64::PPRRegClass.contains(RPI.Reg1))
2299 RPI.Type = RegPairInfo::PPR;
2300 else
2301 llvm_unreachable("Unsupported register class.");
2303 // Add the next reg to the pair if it is in the same register class.
2304 if (unsigned(i + RegInc) < Count) {
2305 unsigned NextReg = CSI[i + RegInc].getReg();
2306 bool IsFirst = i == FirstReg;
2307 switch (RPI.Type) {
2308 case RegPairInfo::GPR:
2309 if (AArch64::GPR64RegClass.contains(NextReg) &&
2310 !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
2311 NeedsWinCFI, NeedsFrameRecord, IsFirst))
2312 RPI.Reg2 = NextReg;
2313 break;
2314 case RegPairInfo::FPR64:
2315 if (AArch64::FPR64RegClass.contains(NextReg) &&
2316 !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
2317 IsFirst))
2318 RPI.Reg2 = NextReg;
2319 break;
2320 case RegPairInfo::FPR128:
2321 if (AArch64::FPR128RegClass.contains(NextReg))
2322 RPI.Reg2 = NextReg;
2323 break;
2324 case RegPairInfo::PPR:
2325 case RegPairInfo::ZPR:
2326 break;
2330 // If either of the registers to be saved is the lr register, it means that
2331 // we also need to save lr in the shadow call stack.
2332 if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
2333 MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
2334 if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
2335 report_fatal_error("Must reserve x18 to use shadow call stack");
2336 NeedShadowCallStackProlog = true;
2339 // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
2340 // list to come in sorted by frame index so that we can issue the store
2341 // pair instructions directly. Assert if we see anything otherwise.
2343 // The order of the registers in the list is controlled by
2344 // getCalleeSavedRegs(), so they will always be in-order, as well.
2345 assert((!RPI.isPaired() ||
2346 (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
2347 "Out of order callee saved regs!");
2349 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
2350 RPI.Reg1 == AArch64::LR) &&
2351 "FrameRecord must be allocated together with LR");
2353 // Windows AAPCS has FP and LR reversed.
2354 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
2355 RPI.Reg2 == AArch64::LR) &&
2356 "FrameRecord must be allocated together with LR");
2358 // MachO's compact unwind format relies on all registers being stored in
2359 // adjacent register pairs.
2360 assert((!produceCompactUnwindFrame(MF) ||
2361 CC == CallingConv::PreserveMost ||
2362 (RPI.isPaired() &&
2363 ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
2364 RPI.Reg1 + 1 == RPI.Reg2))) &&
2365 "Callee-save registers not saved as adjacent register pair!");
2367 RPI.FrameIdx = CSI[i].getFrameIdx();
2368 if (NeedsWinCFI &&
2369 RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
2370 RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
2372 int Scale = RPI.getScale();
2374 int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2375 assert(OffsetPre % Scale == 0);
2377 if (RPI.isScalable())
2378 ScalableByteOffset += StackFillDir * Scale;
2379 else
2380 ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
2382 // Swift's async context is directly before FP, so allocate an extra
2383 // 8 bytes for it.
2384 if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2385 RPI.Reg2 == AArch64::FP)
2386 ByteOffset += StackFillDir * 8;
2388 assert(!(RPI.isScalable() && RPI.isPaired()) &&
2389 "Paired spill/fill instructions don't exist for SVE vectors");
2391 // Round up size of non-pair to pair size if we need to pad the
2392 // callee-save area to ensure 16-byte alignment.
2393 if (NeedGapToAlignStack && !NeedsWinCFI &&
2394 !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
2395 !RPI.isPaired() && ByteOffset % 16 != 0) {
2396 ByteOffset += 8 * StackFillDir;
2397 assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
2398 // A stack frame with a gap looks like this, bottom up:
2399 // d9, d8. x21, gap, x20, x19.
2400 // Set extra alignment on the x21 object to create the gap above it.
2401 MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
2402 NeedGapToAlignStack = false;
2405 int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
2406 assert(OffsetPost % Scale == 0);
2407 // If filling top down (default), we want the offset after incrementing it.
2408     // If filling bottom up (WinCFI), we need the original offset.
2409 int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
2411 // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
2412 // Swift context can directly precede FP.
2413 if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
2414 RPI.Reg2 == AArch64::FP)
2415 Offset += 8;
2416 RPI.Offset = Offset / Scale;
2418 assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
2419 (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
2420 "Offset out of bounds for LDP/STP immediate");
2422 // Save the offset to frame record so that the FP register can point to the
2423 // innermost frame record (spilled FP and LR registers).
2424 if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR &&
2425 RPI.Reg2 == AArch64::FP) ||
2426 (IsWindows && RPI.Reg1 == AArch64::FP &&
2427 RPI.Reg2 == AArch64::LR)))
2428 AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
2430 RegPairs.push_back(RPI);
2431 if (RPI.isPaired())
2432 i += RegInc;
2434 if (NeedsWinCFI) {
2435 // If we need an alignment gap in the stack, align the topmost stack
2436 // object. A stack frame with a gap looks like this, bottom up:
2437 // x19, d8. d9, gap.
2438 // Set extra alignment on the topmost stack object (the first element in
2439 // CSI, which goes top down), to create the gap above it.
2440 if (AFI->hasCalleeSaveStackFreeSpace())
2441 MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
2442 // We iterated bottom up over the registers; flip RegPairs back to top
2443 // down order.
2444 std::reverse(RegPairs.begin(), RegPairs.end());
2448 bool AArch64FrameLowering::spillCalleeSavedRegisters(
2449 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2450 ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2451 MachineFunction &MF = *MBB.getParent();
2452 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2453 bool NeedsWinCFI = needsWinCFI(MF);
2454 DebugLoc DL;
2455 SmallVector<RegPairInfo, 8> RegPairs;
2457 bool NeedShadowCallStackProlog = false;
2458 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2459 NeedShadowCallStackProlog, hasFP(MF));
2460 const MachineRegisterInfo &MRI = MF.getRegInfo();
2462 if (NeedShadowCallStackProlog) {
2463 // Shadow call stack prolog: str x30, [x18], #8
2464 BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
2465 .addReg(AArch64::X18, RegState::Define)
2466 .addReg(AArch64::LR)
2467 .addReg(AArch64::X18)
2468 .addImm(8)
2469 .setMIFlag(MachineInstr::FrameSetup);
2471 if (NeedsWinCFI)
2472 BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
2473 .setMIFlag(MachineInstr::FrameSetup);
2475 if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) {
2476 // Emit a CFI instruction that causes 8 to be subtracted from the value of
2477 // x18 when unwinding past this frame.
2478 static const char CFIInst[] = {
2479 dwarf::DW_CFA_val_expression,
2480 18, // register
2481 2, // length
2482 static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
2483 static_cast<char>(-8) & 0x7f, // addend (sleb128)
2485 unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
2486 nullptr, StringRef(CFIInst, sizeof(CFIInst))));
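      // ((-8) & 0x7f == 0x78 is the single-byte SLEB128 encoding of -8, so the
      // escaped expression evaluates to x18 - 8.)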
2487 BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION))
2488 .addCFIIndex(CFIIndex)
2489 .setMIFlag(MachineInstr::FrameSetup);
2492 // This instruction also makes x18 live-in to the entry block.
2493 MBB.addLiveIn(AArch64::X18);
2496 if (homogeneousPrologEpilog(MF)) {
2497 auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
2498 .setMIFlag(MachineInstr::FrameSetup);
2500 for (auto &RPI : RegPairs) {
2501 MIB.addReg(RPI.Reg1);
2502 MIB.addReg(RPI.Reg2);
2504 // Update register live in.
2505 if (!MRI.isReserved(RPI.Reg1))
2506 MBB.addLiveIn(RPI.Reg1);
2507 if (!MRI.isReserved(RPI.Reg2))
2508 MBB.addLiveIn(RPI.Reg2);
2510 return true;
2512 for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
2513 ++RPII) {
2514 RegPairInfo RPI = *RPII;
2515 unsigned Reg1 = RPI.Reg1;
2516 unsigned Reg2 = RPI.Reg2;
2517 unsigned StrOpc;
2519 // Issue sequence of spills for cs regs. The first spill may be converted
2520 // to a pre-decrement store later by emitPrologue if the callee-save stack
2521 // area allocation can't be combined with the local stack area allocation.
2522 // For example:
2523 // stp x22, x21, [sp, #0] // addImm(+0)
2524 // stp x20, x19, [sp, #16] // addImm(+2)
2525 // stp fp, lr, [sp, #32] // addImm(+4)
2526 // Rationale: This sequence saves uop updates compared to a sequence of
2527 // pre-increment spills like stp xi,xj,[sp,#-16]!
2528 // Note: Similar rationale and sequence for restores in epilog.
2529 unsigned Size;
2530 Align Alignment;
2531 switch (RPI.Type) {
2532 case RegPairInfo::GPR:
2533 StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
2534 Size = 8;
2535 Alignment = Align(8);
2536 break;
2537 case RegPairInfo::FPR64:
2538 StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
2539 Size = 8;
2540 Alignment = Align(8);
2541 break;
2542 case RegPairInfo::FPR128:
2543 StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
2544 Size = 16;
2545 Alignment = Align(16);
2546 break;
2547 case RegPairInfo::ZPR:
2548 StrOpc = AArch64::STR_ZXI;
2549 Size = 16;
2550 Alignment = Align(16);
2551 break;
2552 case RegPairInfo::PPR:
2553 StrOpc = AArch64::STR_PXI;
2554 Size = 2;
2555 Alignment = Align(2);
2556 break;
2558 LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
2559 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2560 dbgs() << ") -> fi#(" << RPI.FrameIdx;
2561 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2562 dbgs() << ")\n");
2564 assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
2565 "Windows unwdinding requires a consecutive (FP,LR) pair");
2566 // Windows unwind codes require consecutive registers if registers are
2567 // paired. Make the switch here, so that the code below will save (x,x+1)
2568 // and not (x+1,x).
2569 unsigned FrameIdxReg1 = RPI.FrameIdx;
2570 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2571 if (NeedsWinCFI && RPI.isPaired()) {
2572 std::swap(Reg1, Reg2);
2573 std::swap(FrameIdxReg1, FrameIdxReg2);
2575 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
2576 if (!MRI.isReserved(Reg1))
2577 MBB.addLiveIn(Reg1);
2578 if (RPI.isPaired()) {
2579 if (!MRI.isReserved(Reg2))
2580 MBB.addLiveIn(Reg2);
2581 MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
2582 MIB.addMemOperand(MF.getMachineMemOperand(
2583 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2584 MachineMemOperand::MOStore, Size, Alignment));
2586 MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
2587 .addReg(AArch64::SP)
2588 .addImm(RPI.Offset) // [sp, #offset*scale],
2589 // where factor*scale is implicit
2590 .setMIFlag(MachineInstr::FrameSetup);
2591 MIB.addMemOperand(MF.getMachineMemOperand(
2592 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2593 MachineMemOperand::MOStore, Size, Alignment));
2594 if (NeedsWinCFI)
2595 InsertSEH(MIB, TII, MachineInstr::FrameSetup);
2597 // Update the StackIDs of the SVE stack slots.
2598 MachineFrameInfo &MFI = MF.getFrameInfo();
2599 if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
2600 MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
2603 return true;
2606 bool AArch64FrameLowering::restoreCalleeSavedRegisters(
2607 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2608 MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2609 MachineFunction &MF = *MBB.getParent();
2610 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
2611 DebugLoc DL;
2612 SmallVector<RegPairInfo, 8> RegPairs;
2613 bool NeedsWinCFI = needsWinCFI(MF);
2615 if (MI != MBB.end())
2616 DL = MI->getDebugLoc();
2618 bool NeedShadowCallStackProlog = false;
2619 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
2620 NeedShadowCallStackProlog, hasFP(MF));
2622 auto EmitMI = [&](const RegPairInfo &RPI) {
2623 unsigned Reg1 = RPI.Reg1;
2624 unsigned Reg2 = RPI.Reg2;
2626 // Issue sequence of restores for cs regs. The last restore may be converted
2627 // to a post-increment load later by emitEpilogue if the callee-save stack
2628 // area allocation can't be combined with the local stack area allocation.
2629 // For example:
2630 // ldp fp, lr, [sp, #32] // addImm(+4)
2631 // ldp x20, x19, [sp, #16] // addImm(+2)
2632 // ldp x22, x21, [sp, #0] // addImm(+0)
2633 // Note: see comment in spillCalleeSavedRegisters()
2634 unsigned LdrOpc;
2635 unsigned Size;
2636 Align Alignment;
2637 switch (RPI.Type) {
2638 case RegPairInfo::GPR:
2639 LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
2640 Size = 8;
2641 Alignment = Align(8);
2642 break;
2643 case RegPairInfo::FPR64:
2644 LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
2645 Size = 8;
2646 Alignment = Align(8);
2647 break;
2648 case RegPairInfo::FPR128:
2649 LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
2650 Size = 16;
2651 Alignment = Align(16);
2652 break;
2653 case RegPairInfo::ZPR:
2654 LdrOpc = AArch64::LDR_ZXI;
2655 Size = 16;
2656 Alignment = Align(16);
2657 break;
2658 case RegPairInfo::PPR:
2659 LdrOpc = AArch64::LDR_PXI;
2660 Size = 2;
2661 Alignment = Align(2);
2662 break;
2664 LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
2665 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
2666 dbgs() << ") -> fi#(" << RPI.FrameIdx;
2667 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
2668 dbgs() << ")\n");
2670 // Windows unwind codes require consecutive registers if registers are
2671     // paired. Make the switch here, so that the code below will restore (x,x+1)
2672 // and not (x+1,x).
2673 unsigned FrameIdxReg1 = RPI.FrameIdx;
2674 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
2675 if (NeedsWinCFI && RPI.isPaired()) {
2676 std::swap(Reg1, Reg2);
2677 std::swap(FrameIdxReg1, FrameIdxReg2);
2679 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
2680 if (RPI.isPaired()) {
2681 MIB.addReg(Reg2, getDefRegState(true));
2682 MIB.addMemOperand(MF.getMachineMemOperand(
2683 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
2684 MachineMemOperand::MOLoad, Size, Alignment));
2686 MIB.addReg(Reg1, getDefRegState(true))
2687 .addReg(AArch64::SP)
2688 .addImm(RPI.Offset) // [sp, #offset*scale]
2689 // where factor*scale is implicit
2690 .setMIFlag(MachineInstr::FrameDestroy);
2691 MIB.addMemOperand(MF.getMachineMemOperand(
2692 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
2693 MachineMemOperand::MOLoad, Size, Alignment));
2694 if (NeedsWinCFI)
2695 InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
2698 // SVE objects are always restored in reverse order.
2699 for (const RegPairInfo &RPI : reverse(RegPairs))
2700 if (RPI.isScalable())
2701 EmitMI(RPI);
2703 if (ReverseCSRRestoreSeq) {
2704 for (const RegPairInfo &RPI : reverse(RegPairs))
2705 if (!RPI.isScalable())
2706 EmitMI(RPI);
2707 } else if (homogeneousPrologEpilog(MF, &MBB)) {
2708 auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
2709 .setMIFlag(MachineInstr::FrameDestroy);
2710 for (auto &RPI : RegPairs) {
2711 MIB.addReg(RPI.Reg1, RegState::Define);
2712 MIB.addReg(RPI.Reg2, RegState::Define);
2714 return true;
2715 } else
2716 for (const RegPairInfo &RPI : RegPairs)
2717 if (!RPI.isScalable())
2718 EmitMI(RPI);
2720 if (NeedShadowCallStackProlog) {
2721 // Shadow call stack epilog: ldr x30, [x18, #-8]!
2722 BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
2723 .addReg(AArch64::X18, RegState::Define)
2724 .addReg(AArch64::LR, RegState::Define)
2725 .addReg(AArch64::X18)
2726 .addImm(-8)
2727 .setMIFlag(MachineInstr::FrameDestroy);
2730 return true;
2733 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
2734 BitVector &SavedRegs,
2735 RegScavenger *RS) const {
2736 // All calls are tail calls in GHC calling conv, and functions have no
2737 // prologue/epilogue.
2738 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
2739 return;
2741 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
2742 const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
2743 MF.getSubtarget().getRegisterInfo());
2744 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2745 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2746 unsigned UnspilledCSGPR = AArch64::NoRegister;
2747 unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
2749 MachineFrameInfo &MFI = MF.getFrameInfo();
2750 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
2752 unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
2753 ? RegInfo->getBaseRegister()
2754 : (unsigned)AArch64::NoRegister;
2756 unsigned ExtraCSSpill = 0;
2757 // Figure out which callee-saved registers to save/restore.
2758 for (unsigned i = 0; CSRegs[i]; ++i) {
2759 const unsigned Reg = CSRegs[i];
2761 // Add the base pointer register to SavedRegs if it is callee-save.
2762 if (Reg == BasePointerReg)
2763 SavedRegs.set(Reg);
2765 bool RegUsed = SavedRegs.test(Reg);
2766 unsigned PairedReg = AArch64::NoRegister;
2767 if (AArch64::GPR64RegClass.contains(Reg) ||
2768 AArch64::FPR64RegClass.contains(Reg) ||
2769 AArch64::FPR128RegClass.contains(Reg))
2770 PairedReg = CSRegs[i ^ 1];
2772 if (!RegUsed) {
2773 if (AArch64::GPR64RegClass.contains(Reg) &&
2774 !RegInfo->isReservedReg(MF, Reg)) {
2775 UnspilledCSGPR = Reg;
2776 UnspilledCSGPRPaired = PairedReg;
2778 continue;
2781 // MachO's compact unwind format relies on all registers being stored in
2782 // pairs.
2783 // FIXME: the usual format is actually better if unwinding isn't needed.
2784 if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
2785 !SavedRegs.test(PairedReg)) {
2786 SavedRegs.set(PairedReg);
2787 if (AArch64::GPR64RegClass.contains(PairedReg) &&
2788 !RegInfo->isReservedReg(MF, PairedReg))
2789 ExtraCSSpill = PairedReg;
2793 if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
2794 !Subtarget.isTargetWindows()) {
2795 // For Windows calling convention on a non-windows OS, where X18 is treated
2796 // as reserved, back up X18 when entering non-windows code (marked with the
2797 // Windows calling convention) and restore when returning regardless of
2798 // whether the individual function uses it - it might call other functions
2799 // that clobber it.
2800 SavedRegs.set(AArch64::X18);
2803 // Calculates the callee saved stack size.
2804 unsigned CSStackSize = 0;
2805 unsigned SVECSStackSize = 0;
2806 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2807 const MachineRegisterInfo &MRI = MF.getRegInfo();
2808 for (unsigned Reg : SavedRegs.set_bits()) {
2809 auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
2810 if (AArch64::PPRRegClass.contains(Reg) ||
2811 AArch64::ZPRRegClass.contains(Reg))
2812 SVECSStackSize += RegSize;
2813 else
2814 CSStackSize += RegSize;
2817 // Save number of saved regs, so we can easily update CSStackSize later.
2818 unsigned NumSavedRegs = SavedRegs.count();
2820 // The frame record needs to be created by saving the appropriate registers
2821 uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
2822 if (hasFP(MF) ||
2823 windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
2824 SavedRegs.set(AArch64::FP);
2825 SavedRegs.set(AArch64::LR);
2828 LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
2829 for (unsigned Reg
2830 : SavedRegs.set_bits()) dbgs()
2831 << ' ' << printReg(Reg, RegInfo);
2832 dbgs() << "\n";);
2834 // If any callee-saved registers are used, the frame cannot be eliminated.
2835 int64_t SVEStackSize =
2836 alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
2837 bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
2839 // The CSR spill slots have not been allocated yet, so estimateStackSize
2840 // won't include them.
2841 unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
2843 // Conservatively always assume BigStack when there are SVE spills.
2844 bool BigStack = SVEStackSize ||
2845 (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
2846 if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
2847 AFI->setHasStackFrame(true);
2849 // Estimate if we might need to scavenge a register at some point in order
2850 // to materialize a stack offset. If so, either spill one additional
2851 // callee-saved register or reserve a special spill slot to facilitate
2852 // register scavenging. If we already spilled an extra callee-saved register
2853 // above to keep the number of spills even, we don't need to do anything else
2854 // here.
2855 if (BigStack) {
2856 if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
2857 LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
2858 << " to get a scratch register.\n");
2859 SavedRegs.set(UnspilledCSGPR);
2860 // MachO's compact unwind format relies on all registers being stored in
2861 // pairs, so if we need to spill one extra for BigStack, then we need to
2862 // store the pair.
2863 if (producePairRegisters(MF))
2864 SavedRegs.set(UnspilledCSGPRPaired);
2865 ExtraCSSpill = UnspilledCSGPR;
2868 // If we didn't find an extra callee-saved register to spill, create
2869 // an emergency spill slot.
2870 if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
2871 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2872 const TargetRegisterClass &RC = AArch64::GPR64RegClass;
2873 unsigned Size = TRI->getSpillSize(RC);
2874 Align Alignment = TRI->getSpillAlign(RC);
2875 int FI = MFI.CreateStackObject(Size, Alignment, false);
2876 RS->addScavengingFrameIndex(FI);
2877 LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
2878 << " as the emergency spill slot.\n");
2882   // Add the size of the additional 64-bit GPR saves added above.
2883 CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
2885 // A Swift asynchronous context extends the frame record with a pointer
2886 // directly before FP.
2887 if (hasFP(MF) && AFI->hasSwiftAsyncContext())
2888 CSStackSize += 8;
2890   uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
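  // Worked example (hypothetical layout): saving x19-x21 plus FP/LR gives
  // CSStackSize == 40, which rounds up to AlignedCSStackSize == 48; the spare
  // 8 bytes are what setCalleeSaveStackHasFreeSpace() records below.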
2891 LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
2892 << EstimatedStackSize + AlignedCSStackSize
2893 << " bytes.\n");
2895 assert((!MFI.isCalleeSavedInfoValid() ||
2896 AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
2897 "Should not invalidate callee saved info");
2899 // Round up to register pair alignment to avoid additional SP adjustment
2900 // instructions.
2901 AFI->setCalleeSavedStackSize(AlignedCSStackSize);
2902 AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
2903 AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
2906 bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
2907 MachineFunction &MF, const TargetRegisterInfo *RegInfo,
2908 std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
2909 unsigned &MaxCSFrameIndex) const {
2910 bool NeedsWinCFI = needsWinCFI(MF);
2911 // To match the canonical windows frame layout, reverse the list of
2912 // callee saved registers to get them laid out by PrologEpilogInserter
2913 // in the right order. (PrologEpilogInserter allocates stack objects top
2914 // down. Windows canonical prologs store higher numbered registers at
2915 // the top, thus have the CSI array start from the highest registers.)
2916 if (NeedsWinCFI)
2917 std::reverse(CSI.begin(), CSI.end());
2919 if (CSI.empty())
2920 return true; // Early exit if no callee saved registers are modified!
2922 // Now that we know which registers need to be saved and restored, allocate
2923 // stack slots for them.
2924 MachineFrameInfo &MFI = MF.getFrameInfo();
2925 auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2926 for (auto &CS : CSI) {
2927 Register Reg = CS.getReg();
2928 const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
2930 unsigned Size = RegInfo->getSpillSize(*RC);
2931 Align Alignment(RegInfo->getSpillAlign(*RC));
2932 int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
2933 CS.setFrameIdx(FrameIdx);
2935 if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
2936 if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
2938 // Grab 8 bytes below FP for the extended asynchronous frame info.
2939 if (hasFP(MF) && AFI->hasSwiftAsyncContext() && Reg == AArch64::FP) {
2940 FrameIdx = MFI.CreateStackObject(8, Alignment, true);
2941 AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
2942 if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
2943 if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
2946 return true;
2949 bool AArch64FrameLowering::enableStackSlotScavenging(
2950 const MachineFunction &MF) const {
2951 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
2952 return AFI->hasCalleeSaveStackFreeSpace();
2955 /// Returns true if there are any SVE callee saves.
2956 static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
2957 int &Min, int &Max) {
2958 Min = std::numeric_limits<int>::max();
2959 Max = std::numeric_limits<int>::min();
2961 if (!MFI.isCalleeSavedInfoValid())
2962 return false;
2964 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
2965 for (auto &CS : CSI) {
2966 if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
2967 AArch64::PPRRegClass.contains(CS.getReg())) {
2968 assert((Max == std::numeric_limits<int>::min() ||
2969 Max + 1 == CS.getFrameIdx()) &&
2970 "SVE CalleeSaves are not consecutive");
2972 Min = std::min(Min, CS.getFrameIdx());
2973 Max = std::max(Max, CS.getFrameIdx());
2976 return Min != std::numeric_limits<int>::max();
2979 // Process all the SVE stack objects and determine offsets for each
2980 // object. If AssignOffsets is true, the offsets get assigned.
2981 // Fills in the first and last callee-saved frame indices into
2982 // Min/MaxCSFrameIndex, respectively.
2983 // Returns the size of the stack.
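// For example (illustrative): with two Z-register callee-save slots of 16
// scalable bytes each and no other SVE objects, the slots are assigned
// offsets -16 and -32 and the function returns 32; all of these values are
// implicitly scaled by the hardware vector length.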
2984 static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
2985 int &MinCSFrameIndex,
2986 int &MaxCSFrameIndex,
2987 bool AssignOffsets) {
2988 #ifndef NDEBUG
2989 // First process all fixed stack objects.
2990 for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
2991 assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
2992 "SVE vectors should never be passed on the stack by value, only by "
2993 "reference.");
2994 #endif
2996 auto Assign = [&MFI](int FI, int64_t Offset) {
2997 LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
2998 MFI.setObjectOffset(FI, Offset);
3001 int64_t Offset = 0;
3003 // Then process all callee saved slots.
3004 if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
3005 // Assign offsets to the callee save slots.
3006 for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
3007 Offset += MFI.getObjectSize(I);
3008 Offset = alignTo(Offset, MFI.getObjectAlign(I));
3009 if (AssignOffsets)
3010 Assign(I, -Offset);
3014   // Ensure that the callee-save area is aligned to 16 bytes.
3015 Offset = alignTo(Offset, Align(16U));
3017   // Create a buffer of SVE objects to allocate.
3018 SmallVector<int, 8> ObjectsToAllocate;
3019 for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
3020 unsigned StackID = MFI.getStackID(I);
3021 if (StackID != TargetStackID::ScalableVector)
3022 continue;
3023 if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
3024 continue;
3025 if (MFI.isDeadObjectIndex(I))
3026 continue;
3028 ObjectsToAllocate.push_back(I);
3031 // Allocate all SVE locals and spills
3032 for (unsigned FI : ObjectsToAllocate) {
3033 Align Alignment = MFI.getObjectAlign(FI);
3034 // FIXME: Given that the length of SVE vectors is not necessarily a power of
3035 // two, we'd need to align every object dynamically at runtime if the
3036 // alignment is larger than 16. This is not yet supported.
3037 if (Alignment > Align(16))
3038 report_fatal_error(
3039 "Alignment of scalable vectors > 16 bytes is not yet supported");
3041 Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
3042 if (AssignOffsets)
3043 Assign(FI, -Offset);
3046 return Offset;
3049 int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
3050 MachineFrameInfo &MFI) const {
3051 int MinCSFrameIndex, MaxCSFrameIndex;
3052 return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
3055 int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
3056 MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
3057 return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
3058 true);
3061 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
3062 MachineFunction &MF, RegScavenger *RS) const {
3063 MachineFrameInfo &MFI = MF.getFrameInfo();
3065 assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
3066 "Upwards growing stack unsupported");
3068 int MinCSFrameIndex, MaxCSFrameIndex;
3069 int64_t SVEStackSize =
3070 assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
3072 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3073 AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
3074 AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
3076 // If this function isn't doing Win64-style C++ EH, we don't need to do
3077 // anything.
3078 if (!MF.hasEHFunclets())
3079 return;
3080 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
3081 WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
3083 MachineBasicBlock &MBB = MF.front();
3084 auto MBBI = MBB.begin();
3085 while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
3086 ++MBBI;
3088 // Create an UnwindHelp object.
3090   // The UnwindHelp object is allocated at the start of the fixed object area.
3090 int64_t FixedObject =
3091 getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
3092 int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
3093 /*SPOffset*/ -FixedObject,
3094 /*IsImmutable=*/false);
3095 EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
3097 // We need to store -2 into the UnwindHelp object at the start of the
3098 // function.
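  // The sequence built below is roughly (with the scratch register chosen by
  // the scavenger and the frame index resolved later):
  //   mov  xN, #-2
  //   stur xN, [<UnwindHelp slot>]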
3099 DebugLoc DL;
3100 RS->enterBasicBlockEnd(MBB);
3101 RS->backward(std::prev(MBBI));
3102 unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
3103 assert(DstReg && "There must be a free register after frame setup");
3104 BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
3105 BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
3106 .addReg(DstReg, getKillRegState(true))
3107 .addFrameIndex(UnwindHelpFI)
3108 .addImm(0);
3111 namespace {
3112 struct TagStoreInstr {
3113 MachineInstr *MI;
3114 int64_t Offset, Size;
3115 explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
3116 : MI(MI), Offset(Offset), Size(Size) {}
3119 class TagStoreEdit {
3120 MachineFunction *MF;
3121 MachineBasicBlock *MBB;
3122 MachineRegisterInfo *MRI;
3123 // Tag store instructions that are being replaced.
3124 SmallVector<TagStoreInstr, 8> TagStores;
3125 // Combined memref arguments of the above instructions.
3126 SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
3128 // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
3129 // FrameRegOffset + Size) with the address tag of SP.
3130 Register FrameReg;
3131 StackOffset FrameRegOffset;
3132 int64_t Size;
3133 // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
3134 Optional<int64_t> FrameRegUpdate;
3135 // MIFlags for any FrameReg updating instructions.
3136 unsigned FrameRegUpdateFlags;
3138 // Use zeroing instruction variants.
3139 bool ZeroData;
3140 DebugLoc DL;
3142 void emitUnrolled(MachineBasicBlock::iterator InsertI);
3143 void emitLoop(MachineBasicBlock::iterator InsertI);
3145 public:
3146 TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
3147 : MBB(MBB), ZeroData(ZeroData) {
3148 MF = MBB->getParent();
3149 MRI = &MF->getRegInfo();
3151   // Add an instruction to be replaced. Instructions must be added in
3152   // ascending order of Offset and must be adjacent.
3153 void addInstruction(TagStoreInstr I) {
3154 assert((TagStores.empty() ||
3155 TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
3156 "Non-adjacent tag store instructions.");
3157 TagStores.push_back(I);
3159 void clear() { TagStores.clear(); }
3160 // Emit equivalent code at the given location, and erase the current set of
3161 // instructions. May skip if the replacement is not profitable. May invalidate
3162 // the input iterator and replace it with a valid one.
3163 void emitCode(MachineBasicBlock::iterator &InsertI,
3164 const AArch64FrameLowering *TFI, bool IsLast);
3167 void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
3168 const AArch64InstrInfo *TII =
3169 MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3171 const int64_t kMinOffset = -256 * 16;
3172 const int64_t kMaxOffset = 255 * 16;
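  // These bounds match the signed, 16-byte-scaled immediate of the STG/ST2G
  // offset forms, i.e. [-4096, 4080] bytes from the base register; the /16
  // scaling is applied when the immediate operand is added below.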
3174 Register BaseReg = FrameReg;
3175 int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
3176 if (BaseRegOffsetBytes < kMinOffset ||
3177 BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
3178 Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3179 emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
3180 StackOffset::getFixed(BaseRegOffsetBytes), TII);
3181 BaseReg = ScratchReg;
3182 BaseRegOffsetBytes = 0;
3185 MachineInstr *LastI = nullptr;
3186 while (Size) {
3187 int64_t InstrSize = (Size > 16) ? 32 : 16;
3188 unsigned Opcode =
3189 InstrSize == 16
3190 ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
3191 : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
3192 MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
3193 .addReg(AArch64::SP)
3194 .addReg(BaseReg)
3195 .addImm(BaseRegOffsetBytes / 16)
3196 .setMemRefs(CombinedMemRefs);
3197 // A store to [BaseReg, #0] should go last for an opportunity to fold the
3198 // final SP adjustment in the epilogue.
3199 if (BaseRegOffsetBytes == 0)
3200 LastI = I;
3201 BaseRegOffsetBytes += InstrSize;
3202 Size -= InstrSize;
3205 if (LastI)
3206 MBB->splice(InsertI, MBB, LastI);
3209 void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
3210 const AArch64InstrInfo *TII =
3211 MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
3213 Register BaseReg = FrameRegUpdate
3214 ? FrameReg
3215 : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3216 Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
3218 emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
3220 int64_t LoopSize = Size;
3221   // If the loop size is not a multiple of 32, split off one 16-byte store at
3222   // the end so the BaseReg update can be folded into it.
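  // For example (illustrative): Size = 48 with a pending base register update
  // gives LoopSize = 32 for the STGloop, and the remaining 16 bytes are
  // covered by the post-indexed store emitted further down, which also
  // performs the writeback.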
3223 if (FrameRegUpdate && *FrameRegUpdate)
3224 LoopSize -= LoopSize % 32;
3225 MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
3226 TII->get(ZeroData ? AArch64::STZGloop_wback
3227 : AArch64::STGloop_wback))
3228 .addDef(SizeReg)
3229 .addDef(BaseReg)
3230 .addImm(LoopSize)
3231 .addReg(BaseReg)
3232 .setMemRefs(CombinedMemRefs);
3233 if (FrameRegUpdate)
3234 LoopI->setFlags(FrameRegUpdateFlags);
3236 int64_t ExtraBaseRegUpdate =
3237 FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
3238 if (LoopSize < Size) {
3239 assert(FrameRegUpdate);
3240 assert(Size - LoopSize == 16);
3241 // Tag 16 more bytes at BaseReg and update BaseReg.
3242 BuildMI(*MBB, InsertI, DL,
3243 TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
3244 .addDef(BaseReg)
3245 .addReg(BaseReg)
3246 .addReg(BaseReg)
3247 .addImm(1 + ExtraBaseRegUpdate / 16)
3248 .setMemRefs(CombinedMemRefs)
3249 .setMIFlags(FrameRegUpdateFlags);
3250 } else if (ExtraBaseRegUpdate) {
3251 // Update BaseReg.
3252 BuildMI(
3253 *MBB, InsertI, DL,
3254 TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
3255 .addDef(BaseReg)
3256 .addReg(BaseReg)
3257 .addImm(std::abs(ExtraBaseRegUpdate))
3258 .addImm(0)
3259 .setMIFlags(FrameRegUpdateFlags);
3263 // Check if *II is a register update that can be merged into the STGloop
3264 // that ends at (Reg + Size). On success, *TotalOffset is set to the offset
3265 // applied to Reg by the update instruction.
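// For example (illustrative): an "add sp, sp, #80" directly after a loop that
// tags 64 bytes is mergeable, because the leftover adjustment of 16 bytes is
// 16-byte aligned and fits the unshifted ADD/SUB immediate range.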
3266 bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
3267 int64_t Size, int64_t *TotalOffset) {
3268 MachineInstr &MI = *II;
3269 if ((MI.getOpcode() == AArch64::ADDXri ||
3270 MI.getOpcode() == AArch64::SUBXri) &&
3271 MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
3272 unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
3273 int64_t Offset = MI.getOperand(2).getImm() << Shift;
3274 if (MI.getOpcode() == AArch64::SUBXri)
3275 Offset = -Offset;
3276 int64_t AbsPostOffset = std::abs(Offset - Size);
3277 const int64_t kMaxOffset =
3278 0xFFF; // Max encoding for unshifted ADDXri / SUBXri
3279 if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
3280 *TotalOffset = Offset;
3281 return true;
3284 return false;
3287 void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
3288 SmallVectorImpl<MachineMemOperand *> &MemRefs) {
3289 MemRefs.clear();
3290 for (auto &TS : TSE) {
3291 MachineInstr *MI = TS.MI;
3292 // An instruction without memory operands may access anything. Be
3293 // conservative and return an empty list.
3294 if (MI->memoperands_empty()) {
3295 MemRefs.clear();
3296 return;
3298 MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
3302 void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
3303 const AArch64FrameLowering *TFI, bool IsLast) {
3304 if (TagStores.empty())
3305 return;
3306 TagStoreInstr &FirstTagStore = TagStores[0];
3307 TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
3308 Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
3309 DL = TagStores[0].MI->getDebugLoc();
3311 Register Reg;
3312 FrameRegOffset = TFI->resolveFrameOffsetReference(
3313 *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
3314 /*PreferFP=*/false, /*ForSimm=*/true);
3315 FrameReg = Reg;
3316 FrameRegUpdate = None;
3318 mergeMemRefs(TagStores, CombinedMemRefs);
3320 LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
3321 for (const auto &Instr
3322 : TagStores) { dbgs() << " " << *Instr.MI; });
3324 // Size threshold where a loop becomes shorter than a linear sequence of
3325 // tagging instructions.
3326 const int kSetTagLoopThreshold = 176;
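  // At 176 bytes the unrolled form already takes six tag-store instructions
  // (five ST2G plus one STG), which is presumably where the loop expansion
  // becomes at least as compact.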
3327 if (Size < kSetTagLoopThreshold) {
3328 if (TagStores.size() < 2)
3329 return;
3330 emitUnrolled(InsertI);
3331 } else {
3332 MachineInstr *UpdateInstr = nullptr;
3333 int64_t TotalOffset;
3334 if (IsLast) {
3335 // See if we can merge base register update into the STGloop.
3336       // This is done in AArch64LoadStoreOptimizer for "normal" stores, but
3337       // STGloop is too unusual for that pass to handle; it realistically only
3338       // occurs in the function epilogue, and it is expanded before that pass
3339       // runs anyway.
3340 if (InsertI != MBB->end() &&
3341 canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
3342 &TotalOffset)) {
3343 UpdateInstr = &*InsertI++;
3344 LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
3345 << *UpdateInstr);
3349 if (!UpdateInstr && TagStores.size() < 2)
3350 return;
3352 if (UpdateInstr) {
3353 FrameRegUpdate = TotalOffset;
3354 FrameRegUpdateFlags = UpdateInstr->getFlags();
3356 emitLoop(InsertI);
3357 if (UpdateInstr)
3358 UpdateInstr->eraseFromParent();
3361 for (auto &TS : TagStores)
3362 TS.MI->eraseFromParent();
3365 bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
3366 int64_t &Size, bool &ZeroData) {
3367 MachineFunction &MF = *MI.getParent()->getParent();
3368 const MachineFrameInfo &MFI = MF.getFrameInfo();
3370 unsigned Opcode = MI.getOpcode();
3371 ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
3372 Opcode == AArch64::STZ2GOffset);
3374 if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
3375 if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
3376 return false;
3377 if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
3378 return false;
3379 Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
3380 Size = MI.getOperand(2).getImm();
3381 return true;
3384 if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
3385 Size = 16;
3386 else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
3387 Size = 32;
3388 else
3389 return false;
3391 if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
3392 return false;
3394 Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
3395 16 * MI.getOperand(2).getImm();
3396 return true;
3399 // Detect a run of memory tagging instructions for adjacent stack frame slots,
3400 // and replace them with a shorter instruction sequence:
3401 // * replace STG + STG with ST2G
3402 // * replace STGloop + STGloop with STGloop
3403 // This code needs to run when stack slot offsets are already known, but before
3404 // FrameIndex operands in STG instructions are eliminated.
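// For example (illustrative assembly, after frame indices are resolved), the
// adjacent pair
//   stg  sp, [sp, #16]
//   stg  sp, [sp, #32]
// can be emitted as the single instruction
//   st2g sp, [sp, #16]
// covering the same 32 bytes.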
3405 MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
3406 const AArch64FrameLowering *TFI,
3407 RegScavenger *RS) {
3408 bool FirstZeroData;
3409 int64_t Size, Offset;
3410 MachineInstr &MI = *II;
3411 MachineBasicBlock *MBB = MI.getParent();
3412 MachineBasicBlock::iterator NextI = ++II;
3413 if (&MI == &MBB->instr_back())
3414 return II;
3415 if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
3416 return II;
3418 SmallVector<TagStoreInstr, 4> Instrs;
3419 Instrs.emplace_back(&MI, Offset, Size);
3421 constexpr int kScanLimit = 10;
3422 int Count = 0;
3423 for (MachineBasicBlock::iterator E = MBB->end();
3424 NextI != E && Count < kScanLimit; ++NextI) {
3425 MachineInstr &MI = *NextI;
3426 bool ZeroData;
3427 int64_t Size, Offset;
3428 // Collect instructions that update memory tags with a FrameIndex operand
3429 // and (when applicable) constant size, and whose output registers are dead
3430 // (the latter is almost always the case in practice). Since these
3431 // instructions effectively have no inputs or outputs, we are free to skip
3432 // any non-aliasing instructions in between without tracking used registers.
3433 if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
3434 if (ZeroData != FirstZeroData)
3435 break;
3436 Instrs.emplace_back(&MI, Offset, Size);
3437 continue;
3440 // Only count non-transient, non-tagging instructions toward the scan
3441 // limit.
3442 if (!MI.isTransient())
3443 ++Count;
3445 // Just in case, stop before the epilogue code starts.
3446 if (MI.getFlag(MachineInstr::FrameSetup) ||
3447 MI.getFlag(MachineInstr::FrameDestroy))
3448 break;
3450 // Reject anything that may alias the collected instructions.
3451 if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
3452 break;
3455 // New code will be inserted after the last tagging instruction we've found.
3456 MachineBasicBlock::iterator InsertI = Instrs.back().MI;
3457 InsertI++;
3459 llvm::stable_sort(Instrs,
3460 [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
3461 return Left.Offset < Right.Offset;
3464 // Make sure that we don't have any overlapping stores.
3465 int64_t CurOffset = Instrs[0].Offset;
3466 for (auto &Instr : Instrs) {
3467 if (CurOffset > Instr.Offset)
3468 return NextI;
3469 CurOffset = Instr.Offset + Instr.Size;
3472 // Find contiguous runs of tagged memory and emit shorter instruction
3473   // sequences for them when possible.
3474 TagStoreEdit TSE(MBB, FirstZeroData);
3475 Optional<int64_t> EndOffset;
3476 for (auto &Instr : Instrs) {
3477 if (EndOffset && *EndOffset != Instr.Offset) {
3478 // Found a gap.
3479 TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
3480 TSE.clear();
3483 TSE.addInstruction(Instr);
3484 EndOffset = Instr.Offset + Instr.Size;
3487 TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
3489 return InsertI;
3491 } // namespace
3493 void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
3494 MachineFunction &MF, RegScavenger *RS = nullptr) const {
3495 if (StackTaggingMergeSetTag)
3496 for (auto &BB : MF)
3497 for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
3498 II = tryMergeAdjacentSTG(II, this, RS);
3501 /// For Win64 AArch64 EH, the offset to the UnwindHelp object is from the SP
3502 /// before the update. This is easily retrieved as it is exactly the offset
3503 /// that is set in processFunctionBeforeFrameFinalized.
3504 StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
3505 const MachineFunction &MF, int FI, Register &FrameReg,
3506 bool IgnoreSPUpdates) const {
3507 const MachineFrameInfo &MFI = MF.getFrameInfo();
3508 if (IgnoreSPUpdates) {
3509 LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
3510 << MFI.getObjectOffset(FI) << "\n");
3511 FrameReg = AArch64::SP;
3512 return StackOffset::getFixed(MFI.getObjectOffset(FI));
3515 return getFrameIndexReference(MF, FI, FrameReg);
3518 /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
3519 /// the parent's frame pointer.
3520 unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
3521 const MachineFunction &MF) const {
3522 return 0;
3525 /// Funclets only need to account for space for the callee saved registers,
3526 /// as the locals are accounted for in the parent's stack frame.
3527 unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
3528 const MachineFunction &MF) const {
3529 // This is the size of the pushed CSRs.
3530 unsigned CSSize =
3531 MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
3532 // This is the amount of stack a funclet needs to allocate.
3533 return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
3534 getStackAlign());
3537 namespace {
3538 struct FrameObject {
3539 bool IsValid = false;
3540 // Index of the object in MFI.
3541 int ObjectIndex = 0;
3542 // Group ID this object belongs to.
3543 int GroupIndex = -1;
3544 // This object should be placed first (closest to SP).
3545 bool ObjectFirst = false;
3546 // This object's group (which always contains the object with
3547 // ObjectFirst==true) should be placed first.
3548 bool GroupFirst = false;
3551 class GroupBuilder {
3552 SmallVector<int, 8> CurrentMembers;
3553 int NextGroupIndex = 0;
3554 std::vector<FrameObject> &Objects;
3556 public:
3557 GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
3558 void AddMember(int Index) { CurrentMembers.push_back(Index); }
3559 void EndCurrentGroup() {
3560 if (CurrentMembers.size() > 1) {
3561 // Create a new group with the current member list. This might remove them
3562       // from their pre-existing groups. That's OK; dealing with overlapping
3563 // groups is too hard and unlikely to make a difference.
3564 LLVM_DEBUG(dbgs() << "group:");
3565 for (int Index : CurrentMembers) {
3566 Objects[Index].GroupIndex = NextGroupIndex;
3567 LLVM_DEBUG(dbgs() << " " << Index);
3569 LLVM_DEBUG(dbgs() << "\n");
3570 NextGroupIndex++;
3572 CurrentMembers.clear();
3576 bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
3577 // Objects at a lower index are closer to FP; objects at a higher index are
3578 // closer to SP.
3580 // For consistency in our comparison, all invalid objects are placed
3581 // at the end. This also allows us to stop walking when we hit the
3582 // first invalid item after it's all sorted.
3584 // The "first" object goes first (closest to SP), followed by the members of
3585 // the "first" group.
3587 // The rest are sorted by the group index to keep the groups together.
3588 // Higher numbered groups are more likely to be around longer (i.e. untagged
3589 // in the function epilogue and not at some earlier point). Place them closer
3590 // to SP.
3592 // If all else equal, sort by the object index to keep the objects in the
3593 // original order.
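// For instance, every invalid object compares greater than every valid one,
// so the invalid entries cluster at the end and orderFrameObjects can stop
// copying at the first one; objects that tie on all other fields fall back to
// ObjectIndex and keep their original relative order.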
3594 return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
3595 A.ObjectIndex) <
3596 std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
3597 B.ObjectIndex);
3599 } // namespace
3601 void AArch64FrameLowering::orderFrameObjects(
3602 const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
3603 if (!OrderFrameObjects || ObjectsToAllocate.empty())
3604 return;
3606 const MachineFrameInfo &MFI = MF.getFrameInfo();
3607 std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
3608 for (auto &Obj : ObjectsToAllocate) {
3609 FrameObjects[Obj].IsValid = true;
3610 FrameObjects[Obj].ObjectIndex = Obj;
3613 // Identify stack slots that are tagged at the same time.
3614 GroupBuilder GB(FrameObjects);
3615 for (auto &MBB : MF) {
3616 for (auto &MI : MBB) {
3617 if (MI.isDebugInstr())
3618 continue;
3619 int OpIndex;
3620 switch (MI.getOpcode()) {
3621 case AArch64::STGloop:
3622 case AArch64::STZGloop:
3623 OpIndex = 3;
3624 break;
3625 case AArch64::STGOffset:
3626 case AArch64::STZGOffset:
3627 case AArch64::ST2GOffset:
3628 case AArch64::STZ2GOffset:
3629 OpIndex = 1;
3630 break;
3631 default:
3632 OpIndex = -1;
3635 int TaggedFI = -1;
3636 if (OpIndex >= 0) {
3637 const MachineOperand &MO = MI.getOperand(OpIndex);
3638 if (MO.isFI()) {
3639 int FI = MO.getIndex();
3640 if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
3641 FrameObjects[FI].IsValid)
3642 TaggedFI = FI;
3646 // If this is a stack tagging instruction for a slot that is not part of a
3647 // group yet, either start a new group or add it to the current one.
3648 if (TaggedFI >= 0)
3649 GB.AddMember(TaggedFI);
3650 else
3651 GB.EndCurrentGroup();
3653 // Groups should never span multiple basic blocks.
3654 GB.EndCurrentGroup();
3657 // If the function's tagged base pointer is pinned to a stack slot, we want to
3658 // put that slot first when possible. This will likely place it at SP + 0,
3659 // and save one instruction when generating the base pointer because IRG does
3660 // not allow an immediate offset.
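// For instance (hypothetical indices): if the tagged base pointer is pinned
// to frame index 2 and FI#2 belongs to group 0, FI#2 gets ObjectFirst and
// GroupFirst set, and every other member of group 0 gets GroupFirst, so the
// whole group stays together next to the pinned slot.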
3661 const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
3662 Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
3663 if (TBPI) {
3664 FrameObjects[*TBPI].ObjectFirst = true;
3665 FrameObjects[*TBPI].GroupFirst = true;
3666 int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
3667 if (FirstGroupIndex >= 0)
3668 for (FrameObject &Object : FrameObjects)
3669 if (Object.GroupIndex == FirstGroupIndex)
3670 Object.GroupFirst = true;
3673 llvm::stable_sort(FrameObjects, FrameObjectCompare);
3675 int i = 0;
3676 for (auto &Obj : FrameObjects) {
3677 // All invalid items are sorted at the end, so it's safe to stop.
3678 if (!Obj.IsValid)
3679 break;
3680 ObjectsToAllocate[i++] = Obj.ObjectIndex;
3683 LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj
3684 : FrameObjects) {
3685 if (!Obj.IsValid)
3686 break;
3687 dbgs() << " " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
3688 if (Obj.ObjectFirst)
3689 dbgs() << ", first";
3690 if (Obj.GroupFirst)
3691 dbgs() << ", group-first";
3692 dbgs() << "\n";