1 //===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the X86 implementation of the TargetFrameLowering class.
11 //===----------------------------------------------------------------------===//
13 #include "X86FrameLowering.h"
14 #include "MCTargetDesc/X86MCTargetDesc.h"
15 #include "X86InstrBuilder.h"
16 #include "X86InstrInfo.h"
17 #include "X86MachineFunctionInfo.h"
18 #include "X86Subtarget.h"
19 #include "X86TargetMachine.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/LivePhysRegs.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/CodeGen/MachineFunction.h"
24 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/CodeGen/MachineModuleInfo.h"
26 #include "llvm/CodeGen/MachineRegisterInfo.h"
27 #include "llvm/CodeGen/WinEHFuncInfo.h"
28 #include "llvm/IR/DataLayout.h"
29 #include "llvm/IR/EHPersonalities.h"
30 #include "llvm/IR/Function.h"
31 #include "llvm/IR/Module.h"
32 #include "llvm/MC/MCAsmInfo.h"
33 #include "llvm/MC/MCObjectFileInfo.h"
34 #include "llvm/MC/MCSymbol.h"
35 #include "llvm/Support/LEB128.h"
36 #include "llvm/Target/TargetOptions.h"
37 #include <cstdlib>
39 #define DEBUG_TYPE "x86-fl"
41 STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue");
42 STATISTIC(NumFrameExtraProbe,
43 "Number of extra stack probes generated in prologue");
44 STATISTIC(NumFunctionUsingPush2Pop2, "Number of functions using push2/pop2");
46 using namespace llvm;
48 X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
49 MaybeAlign StackAlignOverride)
50 : TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(),
51 STI.is64Bit() ? -8 : -4),
52 STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {
53 // Cache a bunch of frame-related predicates for this subtarget.
54 SlotSize = TRI->getSlotSize();
55 Is64Bit = STI.is64Bit();
56 IsLP64 = STI.isTarget64BitLP64();
57 // Standard x86_64 and NaCl use 64-bit frame/stack pointers; x32 uses 32-bit.
58 Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
59 StackPtr = TRI->getStackRegister();
62 bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
63 return !MF.getFrameInfo().hasVarSizedObjects() &&
64 !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences() &&
65 !MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall();
68 /// canSimplifyCallFramePseudos - If there is a reserved call frame, the
69 /// call frame pseudos can be simplified. Having an FP, as in the default
70 /// implementation, is not sufficient here since we can't always use it.
71 /// Use a more nuanced condition.
72 bool X86FrameLowering::canSimplifyCallFramePseudos(
73 const MachineFunction &MF) const {
74 return hasReservedCallFrame(MF) ||
75 MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
76 (hasFP(MF) && !TRI->hasStackRealignment(MF)) ||
77 TRI->hasBasePointer(MF);
80 // needsFrameIndexResolution - Do we need to perform FI resolution for
81 // this function. Normally, this is required only when the function
82 // has any stack objects. However, FI resolution actually has another job,
83 // not apparent from the title - it resolves callframesetup/destroy
84 // that were not simplified earlier.
85 // So, this is required for x86 functions that have push sequences even
86 // when there are no stack objects.
87 bool X86FrameLowering::needsFrameIndexResolution(
88 const MachineFunction &MF) const {
89 return MF.getFrameInfo().hasStackObjects() ||
90 MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
93 /// hasFPImpl - Return true if the specified function should have a dedicated
94 /// frame pointer register. This is true if the function has variable sized
95 /// allocas or if frame pointer elimination is disabled.
96 bool X86FrameLowering::hasFPImpl(const MachineFunction &MF) const {
97 const MachineFrameInfo &MFI = MF.getFrameInfo();
98 return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
99 TRI->hasStackRealignment(MF) || MFI.hasVarSizedObjects() ||
100 MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() ||
101 MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
102 MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
103 MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||
104 MFI.hasStackMap() || MFI.hasPatchPoint() ||
105 (isWin64Prologue(MF) && MFI.hasCopyImplyingStackAdjustment()));
108 static unsigned getSUBriOpcode(bool IsLP64) {
109 return IsLP64 ? X86::SUB64ri32 : X86::SUB32ri;
112 static unsigned getADDriOpcode(bool IsLP64) {
113 return IsLP64 ? X86::ADD64ri32 : X86::ADD32ri;
116 static unsigned getSUBrrOpcode(bool IsLP64) {
117 return IsLP64 ? X86::SUB64rr : X86::SUB32rr;
120 static unsigned getADDrrOpcode(bool IsLP64) {
121 return IsLP64 ? X86::ADD64rr : X86::ADD32rr;
124 static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
125 return IsLP64 ? X86::AND64ri32 : X86::AND32ri;
128 static unsigned getLEArOpcode(bool IsLP64) {
129 return IsLP64 ? X86::LEA64r : X86::LEA32r;
132 static unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm) {
133 if (Use64BitReg) {
134 if (isUInt<32>(Imm))
135 return X86::MOV32ri64;
136 if (isInt<32>(Imm))
137 return X86::MOV64ri32;
138 return X86::MOV64ri;
140 return X86::MOV32ri;
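// Illustrative note (editor's sketch, not part of the original source): on a
// 64-bit target, an immediate such as 0xFFFFFFFF fits isUInt<32> and selects
// MOV32ri64 (implicit zero-extension), a small negative value such as -8 fits
// isInt<32> and selects MOV64ri32 (sign-extension), and anything wider falls
// back to the full 64-bit immediate form MOV64ri (movabsq).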
143 // The Push-Pop Acceleration (PPX) hint is used to indicate that the POP reads the
144 // value written by the PUSH from the stack. The processor tracks these marked
145 // instructions internally and fast-forwards register data between matching PUSH
146 // and POP instructions, without going through memory or through the training
147 // loop of the Fast Store Forwarding Predictor (FSFP). Instead, a more efficient
148 // memory-renaming optimization can be used.
150 // The PPX hint is purely a performance hint. Instructions with this hint have
151 // the same functional semantics as those without. PPX hints set by the
152 // compiler that violate the balancing rule may turn off the PPX optimization,
153 // but they will not affect program semantics.
155 // Hence, PPX is used for balanced spill/reloads (Exceptions and setjmp/longjmp
156 // are not considered).
158 // PUSH2 and POP2 are instructions for (respectively) pushing/popping 2
159 // GPRs at a time to/from the stack.
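// Illustrative example (editor's sketch, assuming RBP and RBX are spilled as
// callee-saved registers on a PPX-capable 64-bit target): the balanced
// prologue/epilogue pair would look roughly like
//   push2p %rbp, %rbx      ; two pushes in one instruction, PPX hint set
//   ...
//   pop2p  %rbx, %rbp      ; matching balanced pops
// while a single balanced spill/reload would use "pushp %rbx" / "popp %rbx".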
160 static unsigned getPUSHOpcode(const X86Subtarget &ST) {
161 return ST.is64Bit() ? (ST.hasPPX() ? X86::PUSHP64r : X86::PUSH64r)
162 : X86::PUSH32r;
164 static unsigned getPOPOpcode(const X86Subtarget &ST) {
165 return ST.is64Bit() ? (ST.hasPPX() ? X86::POPP64r : X86::POP64r)
166 : X86::POP32r;
168 static unsigned getPUSH2Opcode(const X86Subtarget &ST) {
169 return ST.hasPPX() ? X86::PUSH2P : X86::PUSH2;
171 static unsigned getPOP2Opcode(const X86Subtarget &ST) {
172 return ST.hasPPX() ? X86::POP2P : X86::POP2;
175 static bool isEAXLiveIn(MachineBasicBlock &MBB) {
176 for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {
177 unsigned Reg = RegMask.PhysReg;
179 if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||
180 Reg == X86::AH || Reg == X86::AL)
181 return true;
184 return false;
187 /// Check if the flags need to be preserved before the terminators.
188 /// This would be the case, if the eflags is live-in of the region
189 /// composed by the terminators or live-out of that region, without
190 /// being defined by a terminator.
191 static bool
192 flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
193 for (const MachineInstr &MI : MBB.terminators()) {
194 bool BreakNext = false;
195 for (const MachineOperand &MO : MI.operands()) {
196 if (!MO.isReg())
197 continue;
198 Register Reg = MO.getReg();
199 if (Reg != X86::EFLAGS)
200 continue;
202 // This terminator needs an eflags value that is not defined
203 // by another, earlier terminator:
204 // EFLAGS is live-in of the region composed by the terminators.
205 if (!MO.isDef())
206 return true;
207 // This terminator defines the eflags, i.e., we don't need to preserve it.
208 // However, we still need to check that this specific terminator does not
209 // read a live-in value.
210 BreakNext = true;
212 // We found a definition of the eflags, no need to preserve them.
213 if (BreakNext)
214 return false;
217 // None of the terminators use or define the eflags.
218 // Check if they are live-out, that would imply we need to preserve them.
219 for (const MachineBasicBlock *Succ : MBB.successors())
220 if (Succ->isLiveIn(X86::EFLAGS))
221 return true;
223 return false;
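// Illustrative cases (editor's sketch): a block whose terminators are
//   JCC_1 %bb.2, 4, implicit $eflags ; JMP_1 %bb.3
// reads EFLAGS before any terminator defines it, so this returns true; a block
// ending in a plain "JMP_1 %bb.2" whose successors do not list $eflags as a
// live-in returns false.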
226 /// emitSPUpdate - Emit a series of instructions to increment / decrement the
227 /// stack pointer by a constant value.
228 void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
229 MachineBasicBlock::iterator &MBBI,
230 const DebugLoc &DL, int64_t NumBytes,
231 bool InEpilogue) const {
232 bool isSub = NumBytes < 0;
233 uint64_t Offset = isSub ? -NumBytes : NumBytes;
234 MachineInstr::MIFlag Flag =
235 isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;
237 uint64_t Chunk = (1LL << 31) - 1;
239 MachineFunction &MF = *MBB.getParent();
240 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
241 const X86TargetLowering &TLI = *STI.getTargetLowering();
242 const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);
244 // It's ok to not take into account large chunks when probing, as the
245 // allocation is split into smaller chunks anyway.
246 if (EmitInlineStackProbe && !InEpilogue) {
248 // This pseudo-instruction is going to be expanded, potentially using a
249 // loop, by inlineStackProbe().
250 BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING)).addImm(Offset);
251 return;
252 } else if (Offset > Chunk) {
253 // Rather than emit a long series of instructions for large offsets,
254 // load the offset into a register and do one sub/add
255 unsigned Reg = 0;
256 unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
258 if (isSub && !isEAXLiveIn(MBB))
259 Reg = Rax;
260 else
261 Reg = TRI->findDeadCallerSavedReg(MBB, MBBI);
263 unsigned AddSubRROpc =
264 isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
265 if (Reg) {
266 BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Reg)
267 .addImm(Offset)
268 .setMIFlag(Flag);
269 MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
270 .addReg(StackPtr)
271 .addReg(Reg);
272 MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
273 return;
274 } else if (Offset > 8 * Chunk) {
275 // If we would need more than 8 add or sub instructions (a >16GB stack
276 // frame), it's worth spilling RAX to materialize this immediate.
277 // pushq %rax
278 // movabsq +-$Offset+-SlotSize, %rax
279 // addq %rsp, %rax
280 // xchg %rax, (%rsp)
281 // movq (%rsp), %rsp
282 assert(Is64Bit && "can't have 32-bit 16GB stack frame");
283 BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
284 .addReg(Rax, RegState::Kill)
285 .setMIFlag(Flag);
286 // Subtract is not commutative, so negate the offset and always use add.
287 // Subtract 8 less and add 8 more to account for the PUSH we just did.
288 if (isSub)
289 Offset = -(Offset - SlotSize);
290 else
291 Offset = Offset + SlotSize;
292 BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Rax)
293 .addImm(Offset)
294 .setMIFlag(Flag);
295 MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
296 .addReg(Rax)
297 .addReg(StackPtr);
298 MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
299 // Exchange the new SP in RAX with the top of the stack.
300 addRegOffset(
301 BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax),
302 StackPtr, false, 0);
303 // Load new SP from the top of the stack into RSP.
304 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr),
305 StackPtr, false, 0);
306 return;
310 while (Offset) {
311 uint64_t ThisVal = std::min(Offset, Chunk);
312 if (ThisVal == SlotSize) {
313 // Use push / pop for slot sized adjustments as a size optimization. We
314 // need to find a dead register when using pop.
315 unsigned Reg = isSub ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
316 : TRI->findDeadCallerSavedReg(MBB, MBBI);
317 if (Reg) {
318 unsigned Opc = isSub ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
319 : (Is64Bit ? X86::POP64r : X86::POP32r);
320 BuildMI(MBB, MBBI, DL, TII.get(Opc))
321 .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub))
322 .setMIFlag(Flag);
323 Offset -= ThisVal;
324 continue;
328 BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue)
329 .setMIFlag(Flag);
331 Offset -= ThisVal;
335 MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
336 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
337 const DebugLoc &DL, int64_t Offset, bool InEpilogue) const {
338 assert(Offset != 0 && "zero offset stack adjustment requested");
340 // On Atom, using LEA to adjust SP is preferred, but using it in the epilogue
341 // is tricky.
342 bool UseLEA;
343 if (!InEpilogue) {
344 // Check if inserting the prologue at the beginning
345 // of MBB would require to use LEA operations.
346 // We need to use LEA operations if EFLAGS is live in, because
347 // it means an instruction will read it before it gets defined.
348 UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS);
349 } else {
350 // If we can use LEA for SP but we shouldn't, check that none
351 // of the terminators uses the eflags. Otherwise we will insert
352 // an ADD that will redefine the eflags and break the condition.
353 // Alternatively, we could move the ADD, but this may not be possible
354 // and is an optimization anyway.
355 UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent());
356 if (UseLEA && !STI.useLeaForSP())
357 UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB);
358 // If that assert fires, it means we do not do the right thing
359 // in canUseAsEpilogue.
360 assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) &&
361 "We shouldn't have allowed this insertion point");
364 MachineInstrBuilder MI;
365 if (UseLEA) {
366 MI = addRegOffset(BuildMI(MBB, MBBI, DL,
367 TII.get(getLEArOpcode(Uses64BitFramePtr)),
368 StackPtr),
369 StackPtr, false, Offset);
370 } else {
371 bool IsSub = Offset < 0;
372 uint64_t AbsOffset = IsSub ? -Offset : Offset;
373 const unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr)
374 : getADDriOpcode(Uses64BitFramePtr);
375 MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
376 .addReg(StackPtr)
377 .addImm(AbsOffset);
378 MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
380 return MI;
383 int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
384 MachineBasicBlock::iterator &MBBI,
385 bool doMergeWithPrevious) const {
386 if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
387 (!doMergeWithPrevious && MBBI == MBB.end()))
388 return 0;
390 MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
392 PI = skipDebugInstructionsBackward(PI, MBB.begin());
393 // It is assumed that the ADD/SUB/LEA instruction is succeeded by one CFI
394 // instruction, and that there are no DBG_VALUE or other instructions between
395 // ADD/SUB/LEA and its corresponding CFI instruction.
396 /* TODO: Add support for the case where there are multiple CFI instructions
397 below the ADD/SUB/LEA, e.g.:
400 cfi_def_cfa_offset
401 cfi_offset
404 if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction())
405 PI = std::prev(PI);
407 unsigned Opc = PI->getOpcode();
408 int Offset = 0;
410 if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&
411 PI->getOperand(0).getReg() == StackPtr) {
412 assert(PI->getOperand(1).getReg() == StackPtr);
413 Offset = PI->getOperand(2).getImm();
414 } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
415 PI->getOperand(0).getReg() == StackPtr &&
416 PI->getOperand(1).getReg() == StackPtr &&
417 PI->getOperand(2).getImm() == 1 &&
418 PI->getOperand(3).getReg() == X86::NoRegister &&
419 PI->getOperand(5).getReg() == X86::NoRegister) {
420 // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
421 Offset = PI->getOperand(4).getImm();
422 } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) &&
423 PI->getOperand(0).getReg() == StackPtr) {
424 assert(PI->getOperand(1).getReg() == StackPtr);
425 Offset = -PI->getOperand(2).getImm();
426 } else
427 return 0;
429 PI = MBB.erase(PI);
430 if (PI != MBB.end() && PI->isCFIInstruction()) {
431 auto CIs = MBB.getParent()->getFrameInstructions();
432 MCCFIInstruction CI = CIs[PI->getOperand(0).getCFIIndex()];
433 if (CI.getOperation() == MCCFIInstruction::OpDefCfaOffset ||
434 CI.getOperation() == MCCFIInstruction::OpAdjustCfaOffset)
435 PI = MBB.erase(PI);
437 if (!doMergeWithPrevious)
438 MBBI = skipDebugInstructionsForward(PI, MBB.end());
440 return Offset;
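// Illustrative example (editor's sketch, hypothetical prologue): with
// doMergeWithPrevious == false and MBBI pointing just past the CSR pushes, a
// following "$rsp = SUB64ri32 $rsp, 24" (plus its cfi_def_cfa_offset, if any)
// is erased and -24 is returned, letting the caller fold those 24 bytes into
// the main stack allocation it is about to emit.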
443 void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
444 MachineBasicBlock::iterator MBBI,
445 const DebugLoc &DL,
446 const MCCFIInstruction &CFIInst,
447 MachineInstr::MIFlag Flag) const {
448 MachineFunction &MF = *MBB.getParent();
449 unsigned CFIIndex = MF.addFrameInst(CFIInst);
451 if (CFIInst.getOperation() == MCCFIInstruction::OpAdjustCfaOffset)
452 MF.getInfo<X86MachineFunctionInfo>()->setHasCFIAdjustCfa(true);
454 BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
455 .addCFIIndex(CFIIndex)
456 .setMIFlag(Flag);
459 /// Emits Dwarf Info specifying offsets of callee saved registers and
460 /// frame pointer. This is called only when basic block sections are enabled.
461 void X86FrameLowering::emitCalleeSavedFrameMovesFullCFA(
462 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
463 MachineFunction &MF = *MBB.getParent();
464 if (!hasFP(MF)) {
465 emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
466 return;
468 const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
469 const Register FramePtr = TRI->getFrameRegister(MF);
470 const Register MachineFramePtr =
471 STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))
472 : FramePtr;
473 unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true);
474 // Offset = space for return address + size of the frame pointer itself.
475 int64_t Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4);
476 BuildCFI(MBB, MBBI, DebugLoc{},
477 MCCFIInstruction::createOffset(nullptr, DwarfReg, -Offset));
478 emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
481 void X86FrameLowering::emitCalleeSavedFrameMoves(
482 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
483 const DebugLoc &DL, bool IsPrologue) const {
484 MachineFunction &MF = *MBB.getParent();
485 MachineFrameInfo &MFI = MF.getFrameInfo();
486 const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo();
487 X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
489 // Add callee saved registers to move list.
490 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
492 // Calculate offsets.
493 for (const CalleeSavedInfo &I : CSI) {
494 int64_t Offset = MFI.getObjectOffset(I.getFrameIdx());
495 Register Reg = I.getReg();
496 unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
498 if (IsPrologue) {
499 if (X86FI->getStackPtrSaveMI()) {
500 // +2*SlotSize because the return address and ebp are at the bottom
501 // of the stack.
502 // | retaddr |
503 // | ebp |
504 // | |<--ebp
505 Offset += 2 * SlotSize;
506 SmallString<64> CfaExpr;
507 CfaExpr.push_back(dwarf::DW_CFA_expression);
508 uint8_t buffer[16];
509 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
510 CfaExpr.push_back(2);
511 Register FramePtr = TRI->getFrameRegister(MF);
512 const Register MachineFramePtr =
513 STI.isTarget64BitILP32()
514 ? Register(getX86SubSuperRegister(FramePtr, 64))
515 : FramePtr;
516 unsigned DwarfFramePtr = MRI->getDwarfRegNum(MachineFramePtr, true);
517 CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfFramePtr));
518 CfaExpr.append(buffer, buffer + encodeSLEB128(Offset, buffer));
519 BuildCFI(MBB, MBBI, DL,
520 MCCFIInstruction::createEscape(nullptr, CfaExpr.str()),
521 MachineInstr::FrameSetup);
522 } else {
523 BuildCFI(MBB, MBBI, DL,
524 MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
526 } else {
527 BuildCFI(MBB, MBBI, DL,
528 MCCFIInstruction::createRestore(nullptr, DwarfReg));
531 if (auto *MI = X86FI->getStackPtrSaveMI()) {
532 int FI = MI->getOperand(1).getIndex();
533 int64_t Offset = MFI.getObjectOffset(FI) + 2 * SlotSize;
534 SmallString<64> CfaExpr;
535 Register FramePtr = TRI->getFrameRegister(MF);
536 const Register MachineFramePtr =
537 STI.isTarget64BitILP32()
538 ? Register(getX86SubSuperRegister(FramePtr, 64))
539 : FramePtr;
540 unsigned DwarfFramePtr = MRI->getDwarfRegNum(MachineFramePtr, true);
541 CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfFramePtr));
542 uint8_t buffer[16];
543 CfaExpr.append(buffer, buffer + encodeSLEB128(Offset, buffer));
544 CfaExpr.push_back(dwarf::DW_OP_deref);
546 SmallString<64> DefCfaExpr;
547 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
548 DefCfaExpr.append(buffer, buffer + encodeSLEB128(CfaExpr.size(), buffer));
549 DefCfaExpr.append(CfaExpr.str());
550 // DW_CFA_def_cfa_expression: DW_OP_breg5 offset, DW_OP_deref
551 BuildCFI(MBB, MBBI, DL,
552 MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str()),
553 MachineInstr::FrameSetup);
557 void X86FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
558 MachineBasicBlock &MBB) const {
559 const MachineFunction &MF = *MBB.getParent();
561 // Insertion point.
562 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
564 // Fake a debug loc.
565 DebugLoc DL;
566 if (MBBI != MBB.end())
567 DL = MBBI->getDebugLoc();
569 // Zero out FP stack if referenced. Do this outside of the loop below so that
570 // it's done only once.
571 const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
572 for (MCRegister Reg : RegsToZero.set_bits()) {
573 if (!X86::RFP80RegClass.contains(Reg))
574 continue;
576 unsigned NumFPRegs = ST.is64Bit() ? 8 : 7;
577 for (unsigned i = 0; i != NumFPRegs; ++i)
578 BuildMI(MBB, MBBI, DL, TII.get(X86::LD_F0));
580 for (unsigned i = 0; i != NumFPRegs; ++i)
581 BuildMI(MBB, MBBI, DL, TII.get(X86::ST_FPrr)).addReg(X86::ST0);
582 break;
585 // For GPRs, we only care to clear out the 32-bit register.
586 BitVector GPRsToZero(TRI->getNumRegs());
587 for (MCRegister Reg : RegsToZero.set_bits())
588 if (TRI->isGeneralPurposeRegister(MF, Reg)) {
589 GPRsToZero.set(getX86SubSuperRegister(Reg, 32));
590 RegsToZero.reset(Reg);
593 // Zero out the GPRs first.
594 for (MCRegister Reg : GPRsToZero.set_bits())
595 TII.buildClearRegister(Reg, MBB, MBBI, DL);
597 // Zero out the remaining registers.
598 for (MCRegister Reg : RegsToZero.set_bits())
599 TII.buildClearRegister(Reg, MBB, MBBI, DL);
602 void X86FrameLowering::emitStackProbe(
603 MachineFunction &MF, MachineBasicBlock &MBB,
604 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog,
605 std::optional<MachineFunction::DebugInstrOperandPair> InstrNum) const {
606 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
607 if (STI.isTargetWindowsCoreCLR()) {
608 if (InProlog) {
609 BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING))
610 .addImm(0 /* no explicit stack size */);
611 } else {
612 emitStackProbeInline(MF, MBB, MBBI, DL, false);
614 } else {
615 emitStackProbeCall(MF, MBB, MBBI, DL, InProlog, InstrNum);
619 bool X86FrameLowering::stackProbeFunctionModifiesSP() const {
620 return STI.isOSWindows() && !STI.isTargetWin64();
623 void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
624 MachineBasicBlock &PrologMBB) const {
625 auto Where = llvm::find_if(PrologMBB, [](MachineInstr &MI) {
626 return MI.getOpcode() == X86::STACKALLOC_W_PROBING;
628 if (Where != PrologMBB.end()) {
629 DebugLoc DL = PrologMBB.findDebugLoc(Where);
630 emitStackProbeInline(MF, PrologMBB, Where, DL, true);
631 Where->eraseFromParent();
635 void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
636 MachineBasicBlock &MBB,
637 MachineBasicBlock::iterator MBBI,
638 const DebugLoc &DL,
639 bool InProlog) const {
640 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
641 if (STI.isTargetWindowsCoreCLR() && STI.is64Bit())
642 emitStackProbeInlineWindowsCoreCLR64(MF, MBB, MBBI, DL, InProlog);
643 else
644 emitStackProbeInlineGeneric(MF, MBB, MBBI, DL, InProlog);
647 void X86FrameLowering::emitStackProbeInlineGeneric(
648 MachineFunction &MF, MachineBasicBlock &MBB,
649 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
650 MachineInstr &AllocWithProbe = *MBBI;
651 uint64_t Offset = AllocWithProbe.getOperand(0).getImm();
653 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
654 const X86TargetLowering &TLI = *STI.getTargetLowering();
655 assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) &&
656 "different expansion expected for CoreCLR 64 bit");
658 const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
659 uint64_t ProbeChunk = StackProbeSize * 8;
661 uint64_t MaxAlign =
662 TRI->hasStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0;
664 // Synthesize a loop or unroll it, depending on the number of iterations.
665 // BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bytes are left
666 // between the unaligned rsp and current rsp.
667 if (Offset > ProbeChunk) {
668 emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset,
669 MaxAlign % StackProbeSize);
670 } else {
671 emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset,
672 MaxAlign % StackProbeSize);
676 void X86FrameLowering::emitStackProbeInlineGenericBlock(
677 MachineFunction &MF, MachineBasicBlock &MBB,
678 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
679 uint64_t AlignOffset) const {
681 const bool NeedsDwarfCFI = needsDwarfCFI(MF);
682 const bool HasFP = hasFP(MF);
683 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
684 const X86TargetLowering &TLI = *STI.getTargetLowering();
685 const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
686 const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
688 uint64_t CurrentOffset = 0;
690 assert(AlignOffset < StackProbeSize);
692 // If the offset is so small it fits within a page, there's nothing to do.
693 if (StackProbeSize < Offset + AlignOffset) {
695 uint64_t StackAdjustment = StackProbeSize - AlignOffset;
696 BuildStackAdjustment(MBB, MBBI, DL, -StackAdjustment, /*InEpilogue=*/false)
697 .setMIFlag(MachineInstr::FrameSetup);
698 if (!HasFP && NeedsDwarfCFI) {
699 BuildCFI(
700 MBB, MBBI, DL,
701 MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
704 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
705 .setMIFlag(MachineInstr::FrameSetup),
706 StackPtr, false, 0)
707 .addImm(0)
708 .setMIFlag(MachineInstr::FrameSetup);
709 NumFrameExtraProbe++;
710 CurrentOffset = StackProbeSize - AlignOffset;
713 // For the next N - 1 pages, just probe. I tried to take advantage of
714 // natural probes, but it implies much more logic and there were very few
715 // interesting natural probes to interleave.
716 while (CurrentOffset + StackProbeSize < Offset) {
717 BuildStackAdjustment(MBB, MBBI, DL, -StackProbeSize, /*InEpilogue=*/false)
718 .setMIFlag(MachineInstr::FrameSetup);
720 if (!HasFP && NeedsDwarfCFI) {
721 BuildCFI(
722 MBB, MBBI, DL,
723 MCCFIInstruction::createAdjustCfaOffset(nullptr, StackProbeSize));
725 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
726 .setMIFlag(MachineInstr::FrameSetup),
727 StackPtr, false, 0)
728 .addImm(0)
729 .setMIFlag(MachineInstr::FrameSetup);
730 NumFrameExtraProbe++;
731 CurrentOffset += StackProbeSize;
734 // No need to probe the tail, it is smaller than a page.
735 uint64_t ChunkSize = Offset - CurrentOffset;
736 if (ChunkSize == SlotSize) {
737 // Use push for slot sized adjustments as a size optimization,
738 // like emitSPUpdate does when not probing.
739 unsigned Reg = Is64Bit ? X86::RAX : X86::EAX;
740 unsigned Opc = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
741 BuildMI(MBB, MBBI, DL, TII.get(Opc))
742 .addReg(Reg, RegState::Undef)
743 .setMIFlag(MachineInstr::FrameSetup);
744 } else {
745 BuildStackAdjustment(MBB, MBBI, DL, -ChunkSize, /*InEpilogue=*/false)
746 .setMIFlag(MachineInstr::FrameSetup);
748 // No need to adjust Dwarf CFA offset here, the last position of the stack has
749 // been defined.
752 void X86FrameLowering::emitStackProbeInlineGenericLoop(
753 MachineFunction &MF, MachineBasicBlock &MBB,
754 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
755 uint64_t AlignOffset) const {
756 assert(Offset && "null offset");
758 assert(MBB.computeRegisterLiveness(TRI, X86::EFLAGS, MBBI) !=
759 MachineBasicBlock::LQR_Live &&
760 "Inline stack probe loop will clobber live EFLAGS.");
762 const bool NeedsDwarfCFI = needsDwarfCFI(MF);
763 const bool HasFP = hasFP(MF);
764 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
765 const X86TargetLowering &TLI = *STI.getTargetLowering();
766 const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
767 const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
769 if (AlignOffset) {
770 if (AlignOffset < StackProbeSize) {
771 // Perform a first smaller allocation followed by a probe.
772 BuildStackAdjustment(MBB, MBBI, DL, -AlignOffset, /*InEpilogue=*/false)
773 .setMIFlag(MachineInstr::FrameSetup);
775 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
776 .setMIFlag(MachineInstr::FrameSetup),
777 StackPtr, false, 0)
778 .addImm(0)
779 .setMIFlag(MachineInstr::FrameSetup);
780 NumFrameExtraProbe++;
781 Offset -= AlignOffset;
785 // Synthesize a loop
786 NumFrameLoopProbe++;
787 const BasicBlock *LLVM_BB = MBB.getBasicBlock();
789 MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB);
790 MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB);
792 MachineFunction::iterator MBBIter = ++MBB.getIterator();
793 MF.insert(MBBIter, testMBB);
794 MF.insert(MBBIter, tailMBB);
796 Register FinalStackProbed = Uses64BitFramePtr ? X86::R11
797 : Is64Bit ? X86::R11D
798 : X86::EAX;
800 BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
801 .addReg(StackPtr)
802 .setMIFlag(MachineInstr::FrameSetup);
804 // save loop bound
806 const unsigned BoundOffset = alignDown(Offset, StackProbeSize);
807 const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);
808 BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed)
809 .addReg(FinalStackProbed)
810 .addImm(BoundOffset)
811 .setMIFlag(MachineInstr::FrameSetup);
813 // while in the loop, use loop-invariant reg for CFI,
814 // instead of the stack pointer, which changes during the loop
815 if (!HasFP && NeedsDwarfCFI) {
816 // x32 uses the same DWARF register numbers as x86-64,
818 // so there isn't a register number for r11d; we must use r11 instead
818 const Register DwarfFinalStackProbed =
819 STI.isTarget64BitILP32()
820 ? Register(getX86SubSuperRegister(FinalStackProbed, 64))
821 : FinalStackProbed;
823 BuildCFI(MBB, MBBI, DL,
824 MCCFIInstruction::createDefCfaRegister(
825 nullptr, TRI->getDwarfRegNum(DwarfFinalStackProbed, true)));
826 BuildCFI(MBB, MBBI, DL,
827 MCCFIInstruction::createAdjustCfaOffset(nullptr, BoundOffset));
831 // allocate a page
832 BuildStackAdjustment(*testMBB, testMBB->end(), DL, -StackProbeSize,
833 /*InEpilogue=*/false)
834 .setMIFlag(MachineInstr::FrameSetup);
836 // touch the page
837 addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc))
838 .setMIFlag(MachineInstr::FrameSetup),
839 StackPtr, false, 0)
840 .addImm(0)
841 .setMIFlag(MachineInstr::FrameSetup);
843 // cmp with stack pointer bound
844 BuildMI(testMBB, DL, TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
845 .addReg(StackPtr)
846 .addReg(FinalStackProbed)
847 .setMIFlag(MachineInstr::FrameSetup);
849 // jump
850 BuildMI(testMBB, DL, TII.get(X86::JCC_1))
851 .addMBB(testMBB)
852 .addImm(X86::COND_NE)
853 .setMIFlag(MachineInstr::FrameSetup);
854 testMBB->addSuccessor(testMBB);
855 testMBB->addSuccessor(tailMBB);
857 // BB management
858 tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end());
859 tailMBB->transferSuccessorsAndUpdatePHIs(&MBB);
860 MBB.addSuccessor(testMBB);
862 // handle tail
863 const uint64_t TailOffset = Offset % StackProbeSize;
864 MachineBasicBlock::iterator TailMBBIter = tailMBB->begin();
865 if (TailOffset) {
866 BuildStackAdjustment(*tailMBB, TailMBBIter, DL, -TailOffset,
867 /*InEpilogue=*/false)
868 .setMIFlag(MachineInstr::FrameSetup);
871 // after the loop, switch back to stack pointer for CFI
872 if (!HasFP && NeedsDwarfCFI) {
873 // x32 uses the same DWARF register numbers as x86-64,
874 // so there isn't a register number for esp; we must use rsp instead
875 const Register DwarfStackPtr =
876 STI.isTarget64BitILP32()
877 ? Register(getX86SubSuperRegister(StackPtr, 64))
878 : Register(StackPtr);
880 BuildCFI(*tailMBB, TailMBBIter, DL,
881 MCCFIInstruction::createDefCfaRegister(
882 nullptr, TRI->getDwarfRegNum(DwarfStackPtr, true)));
885 // Update Live In information
886 fullyRecomputeLiveIns({tailMBB, testMBB});
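// Rough shape of the loop emitted above (editor's sketch, assuming a 64-bit
// target, a 4 KiB probe size, and r11 as the loop-invariant bound register):
//   movq  %rsp, %r11
//   subq  $BoundOffset, %r11     ; BoundOffset = alignDown(Offset, 4096)
// .LtestMBB:
//   subq  $4096, %rsp
//   movl  $0, (%rsp)             ; touch the page
//   cmpq  %r11, %rsp
//   jne   .LtestMBB
//   subq  $TailOffset, %rsp      ; TailOffset = Offset % 4096, if non-zero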
889 void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(
890 MachineFunction &MF, MachineBasicBlock &MBB,
891 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
892 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
893 assert(STI.is64Bit() && "different expansion needed for 32 bit");
894 assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
895 const TargetInstrInfo &TII = *STI.getInstrInfo();
896 const BasicBlock *LLVM_BB = MBB.getBasicBlock();
898 assert(MBB.computeRegisterLiveness(TRI, X86::EFLAGS, MBBI) !=
899 MachineBasicBlock::LQR_Live &&
900 "Inline stack probe loop will clobber live EFLAGS.");
902 // RAX contains the number of bytes of desired stack adjustment.
903 // The handling here assumes this value has already been updated so as to
904 // maintain stack alignment.
906 // We need to exit with RSP modified by this amount and execute suitable
907 // page touches to notify the OS that we're growing the stack responsibly.
908 // All stack probing must be done without modifying RSP.
910 // MBB:
911 // SizeReg = RAX;
912 // ZeroReg = 0
913 // CopyReg = RSP
914 // Flags, TestReg = CopyReg - SizeReg
915 // FinalReg = !Flags.Ovf ? TestReg : ZeroReg
916 // LimitReg = gs magic thread env access
917 // if FinalReg >= LimitReg goto ContinueMBB
918 // RoundBB:
919 // RoundReg = page address of FinalReg
920 // LoopMBB:
921 // LoopReg = PHI(LimitReg,ProbeReg)
922 // ProbeReg = LoopReg - PageSize
923 // [ProbeReg] = 0
924 // if (ProbeReg > RoundReg) goto LoopMBB
925 // ContinueMBB:
926 // RSP = RSP - RAX
927 // [rest of original MBB]
929 // Set up the new basic blocks
930 MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);
931 MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
932 MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);
934 MachineFunction::iterator MBBIter = std::next(MBB.getIterator());
935 MF.insert(MBBIter, RoundMBB);
936 MF.insert(MBBIter, LoopMBB);
937 MF.insert(MBBIter, ContinueMBB);
939 // Split MBB and move the tail portion down to ContinueMBB.
940 MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
941 ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
942 ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
944 // Some useful constants
945 const int64_t ThreadEnvironmentStackLimit = 0x10;
946 const int64_t PageSize = 0x1000;
947 const int64_t PageMask = ~(PageSize - 1);
949 // Registers we need. For the normal case we use virtual
950 // registers. For the prolog expansion we use RAX, RCX and RDX.
951 MachineRegisterInfo &MRI = MF.getRegInfo();
952 const TargetRegisterClass *RegClass = &X86::GR64RegClass;
953 const Register
954 SizeReg = InProlog ? X86::RAX : MRI.createVirtualRegister(RegClass),
955 ZeroReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
956 CopyReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
957 TestReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
958 FinalReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
959 RoundedReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
960 LimitReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
961 JoinReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
962 ProbeReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass);
964 // SP-relative offsets where we can save RCX and RDX.
965 int64_t RCXShadowSlot = 0;
966 int64_t RDXShadowSlot = 0;
968 // If inlining in the prolog, save RCX and RDX.
969 if (InProlog) {
970 // Compute the offsets. We need to account for things already
971 // pushed onto the stack at this point: return address, frame
972 // pointer (if used), and callee saves.
973 X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
974 const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
975 const bool HasFP = hasFP(MF);
977 // Check if we need to spill RCX and/or RDX.
978 // Here we assume that no earlier prologue instruction changes RCX and/or
979 // RDX, so checking the block live-ins is enough.
980 const bool IsRCXLiveIn = MBB.isLiveIn(X86::RCX);
981 const bool IsRDXLiveIn = MBB.isLiveIn(X86::RDX);
982 int64_t InitSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
983 // Assign the initial slot to both registers, then change RDX's slot if both
984 // need to be spilled.
985 if (IsRCXLiveIn)
986 RCXShadowSlot = InitSlot;
987 if (IsRDXLiveIn)
988 RDXShadowSlot = InitSlot;
989 if (IsRDXLiveIn && IsRCXLiveIn)
990 RDXShadowSlot += 8;
991 // Emit the saves if needed.
992 if (IsRCXLiveIn)
993 addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
994 RCXShadowSlot)
995 .addReg(X86::RCX);
996 if (IsRDXLiveIn)
997 addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
998 RDXShadowSlot)
999 .addReg(X86::RDX);
1000 } else {
1001 // Not in the prolog. Copy RAX to a virtual reg.
1002 BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
1005 // Add code to MBB to check for overflow and set the new target stack pointer
1006 // to zero if so.
1007 BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
1008 .addReg(ZeroReg, RegState::Undef)
1009 .addReg(ZeroReg, RegState::Undef);
1010 BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
1011 BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
1012 .addReg(CopyReg)
1013 .addReg(SizeReg);
1014 BuildMI(&MBB, DL, TII.get(X86::CMOV64rr), FinalReg)
1015 .addReg(TestReg)
1016 .addReg(ZeroReg)
1017 .addImm(X86::COND_B);
1019 // FinalReg now holds final stack pointer value, or zero if
1020 // allocation would overflow. Compare against the current stack
1021 // limit from the thread environment block. Note this limit is the
1022 // lowest touched page on the stack, not the point at which the OS
1023 // will cause an overflow exception, so this is just an optimization
1024 // to avoid unnecessarily touching pages that are below the current
1025 // SP but already committed to the stack by the OS.
1026 BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
1027 .addReg(0)
1028 .addImm(1)
1029 .addReg(0)
1030 .addImm(ThreadEnvironmentStackLimit)
1031 .addReg(X86::GS);
1032 BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
1033 // Jump if the desired stack pointer is at or above the stack limit.
1034 BuildMI(&MBB, DL, TII.get(X86::JCC_1))
1035 .addMBB(ContinueMBB)
1036 .addImm(X86::COND_AE);
1038 // Add code to roundMBB to round the final stack pointer to a page boundary.
1039 if (InProlog)
1040 RoundMBB->addLiveIn(FinalReg);
1041 BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)
1042 .addReg(FinalReg)
1043 .addImm(PageMask);
1044 BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);
1046 // LimitReg now holds the current stack limit and RoundedReg the page-rounded
1047 // final RSP value. Add code to loopMBB to decrement LimitReg page-by-page
1048 // and probe until we reach RoundedReg.
1049 if (!InProlog) {
1050 BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)
1051 .addReg(LimitReg)
1052 .addMBB(RoundMBB)
1053 .addReg(ProbeReg)
1054 .addMBB(LoopMBB);
1057 if (InProlog)
1058 LoopMBB->addLiveIn(JoinReg);
1059 addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,
1060 false, -PageSize);
1062 // Probe by storing a byte onto the stack.
1063 BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))
1064 .addReg(ProbeReg)
1065 .addImm(1)
1066 .addReg(0)
1067 .addImm(0)
1068 .addReg(0)
1069 .addImm(0);
1071 if (InProlog)
1072 LoopMBB->addLiveIn(RoundedReg);
1073 BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
1074 .addReg(RoundedReg)
1075 .addReg(ProbeReg);
1076 BuildMI(LoopMBB, DL, TII.get(X86::JCC_1))
1077 .addMBB(LoopMBB)
1078 .addImm(X86::COND_NE);
1080 MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
1082 // If in prolog, restore RDX and RCX.
1083 if (InProlog) {
1084 if (RCXShadowSlot) // It means we spilled RCX in the prologue.
1085 addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL,
1086 TII.get(X86::MOV64rm), X86::RCX),
1087 X86::RSP, false, RCXShadowSlot);
1088 if (RDXShadowSlot) // It means we spilled RDX in the prologue.
1089 addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL,
1090 TII.get(X86::MOV64rm), X86::RDX),
1091 X86::RSP, false, RDXShadowSlot);
1094 // Now that the probing is done, add code to continueMBB to update
1095 // the stack pointer for real.
1096 BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
1097 .addReg(X86::RSP)
1098 .addReg(SizeReg);
1100 // Add the control flow edges we need.
1101 MBB.addSuccessor(ContinueMBB);
1102 MBB.addSuccessor(RoundMBB);
1103 RoundMBB->addSuccessor(LoopMBB);
1104 LoopMBB->addSuccessor(ContinueMBB);
1105 LoopMBB->addSuccessor(LoopMBB);
1107 if (InProlog) {
1108 LivePhysRegs LiveRegs;
1109 computeAndAddLiveIns(LiveRegs, *ContinueMBB);
1112 // Mark all the instructions added to the prolog as frame setup.
1113 if (InProlog) {
1114 for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
1115 BeforeMBBI->setFlag(MachineInstr::FrameSetup);
1117 for (MachineInstr &MI : *RoundMBB) {
1118 MI.setFlag(MachineInstr::FrameSetup);
1120 for (MachineInstr &MI : *LoopMBB) {
1121 MI.setFlag(MachineInstr::FrameSetup);
1123 for (MachineInstr &MI :
1124 llvm::make_range(ContinueMBB->begin(), ContinueMBBI)) {
1125 MI.setFlag(MachineInstr::FrameSetup);
1130 void X86FrameLowering::emitStackProbeCall(
1131 MachineFunction &MF, MachineBasicBlock &MBB,
1132 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog,
1133 std::optional<MachineFunction::DebugInstrOperandPair> InstrNum) const {
1134 bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
1136 // FIXME: Add indirect thunk support and remove this.
1137 if (Is64Bit && IsLargeCodeModel && STI.useIndirectThunkCalls())
1138 report_fatal_error("Emitting stack probe calls on 64-bit with the large "
1139 "code model and indirect thunks not yet implemented.");
1141 assert(MBB.computeRegisterLiveness(TRI, X86::EFLAGS, MBBI) !=
1142 MachineBasicBlock::LQR_Live &&
1143 "Stack probe calls will clobber live EFLAGS.");
1145 unsigned CallOp;
1146 if (Is64Bit)
1147 CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
1148 else
1149 CallOp = X86::CALLpcrel32;
1151 StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF);
1153 MachineInstrBuilder CI;
1154 MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
1156 // All current stack probes take AX and SP as input, clobber flags, and
1157 // preserve all registers. x86_64 probes leave RSP unmodified.
1158 if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
1159 // For the large code model, we have to call through a register. Use R11,
1160 // as it is scratch in all supported calling conventions.
1161 BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
1162 .addExternalSymbol(MF.createExternalSymbolName(Symbol));
1163 CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
1164 } else {
1165 CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp))
1166 .addExternalSymbol(MF.createExternalSymbolName(Symbol));
1169 unsigned AX = Uses64BitFramePtr ? X86::RAX : X86::EAX;
1170 unsigned SP = Uses64BitFramePtr ? X86::RSP : X86::ESP;
1171 CI.addReg(AX, RegState::Implicit)
1172 .addReg(SP, RegState::Implicit)
1173 .addReg(AX, RegState::Define | RegState::Implicit)
1174 .addReg(SP, RegState::Define | RegState::Implicit)
1175 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
1177 MachineInstr *ModInst = CI;
1178 if (STI.isTargetWin64() || !STI.isOSWindows()) {
1179 // MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.
1180 // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
1181 // themselves. They also do not clobber %rax, so we can reuse it when
1182 // adjusting %rsp.
1183 // All other platforms do not specify a particular ABI for the stack probe
1184 // function, so we arbitrarily define it to not adjust %esp/%rsp itself.
1185 ModInst =
1186 BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Uses64BitFramePtr)), SP)
1187 .addReg(SP)
1188 .addReg(AX);
1191 // DebugInfo variable locations -- if there's an instruction number for the
1192 // allocation (i.e., DYN_ALLOC_*), substitute it for the instruction that
1193 // modifies SP.
1194 if (InstrNum) {
1195 if (STI.isTargetWin64() || !STI.isOSWindows()) {
1196 // Label destination operand of the subtract.
1197 MF.makeDebugValueSubstitution(*InstrNum,
1198 {ModInst->getDebugInstrNum(), 0});
1199 } else {
1200 // Label the call. The operand number is the penultimate operand, zero
1201 // based.
1202 unsigned SPDefOperand = ModInst->getNumOperands() - 2;
1203 MF.makeDebugValueSubstitution(
1204 *InstrNum, {ModInst->getDebugInstrNum(), SPDefOperand});
1208 if (InProlog) {
1209 // Apply the frame setup flag to all inserted instrs.
1210 for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
1211 ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
1215 static unsigned calculateSetFPREG(uint64_t SPAdjust) {
1216 // Win64 ABI has a less restrictive limitation of 240; 128 works equally well
1217 // and might require smaller successive adjustments.
1218 const uint64_t Win64MaxSEHOffset = 128;
1219 uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset);
1220 // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode.
1221 return SEHFrameOffset & -16;
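// Worked example (editor's sketch): SPAdjust = 40 gives min(40, 128) = 40 and
// 40 & -16 = 32; SPAdjust = 300 clamps to 128, which is already 16-byte
// aligned, so 128 is returned.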
1224 // If we're forcing a stack realignment we can't rely on just the frame
1225 // info, we need to know the ABI stack alignment as well in case we
1226 // have a call out. Otherwise just make sure we have some alignment - we'll
1227 // go with the minimum SlotSize.
1228 uint64_t
1229 X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
1230 const MachineFrameInfo &MFI = MF.getFrameInfo();
1231 Align MaxAlign = MFI.getMaxAlign(); // Desired stack alignment.
1232 Align StackAlign = getStackAlign();
1233 bool HasRealign = MF.getFunction().hasFnAttribute("stackrealign");
1234 if (HasRealign) {
1235 if (MFI.hasCalls())
1236 MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
1237 else if (MaxAlign < SlotSize)
1238 MaxAlign = Align(SlotSize);
1241 if (!Is64Bit && MF.getFunction().getCallingConv() == CallingConv::X86_INTR) {
1242 if (HasRealign)
1243 MaxAlign = (MaxAlign > 16) ? MaxAlign : Align(16);
1244 else
1245 MaxAlign = Align(16);
1247 return MaxAlign.value();
1250 void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
1251 MachineBasicBlock::iterator MBBI,
1252 const DebugLoc &DL, unsigned Reg,
1253 uint64_t MaxAlign) const {
1254 uint64_t Val = -MaxAlign;
1255 unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);
1257 MachineFunction &MF = *MBB.getParent();
1258 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
1259 const X86TargetLowering &TLI = *STI.getTargetLowering();
1260 const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
1261 const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);
1263 // We want to make sure that (in the worst case) fewer than StackProbeSize bytes
1264 // are left unprobed after the AND. This assumption is used in
1265 // emitStackProbeInlineGeneric.
1266 if (Reg == StackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize) {
1268 NumFrameLoopProbe++;
1269 MachineBasicBlock *entryMBB =
1270 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
1271 MachineBasicBlock *headMBB =
1272 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
1273 MachineBasicBlock *bodyMBB =
1274 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
1275 MachineBasicBlock *footMBB =
1276 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
1278 MachineFunction::iterator MBBIter = MBB.getIterator();
1279 MF.insert(MBBIter, entryMBB);
1280 MF.insert(MBBIter, headMBB);
1281 MF.insert(MBBIter, bodyMBB);
1282 MF.insert(MBBIter, footMBB);
1283 const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
1284 Register FinalStackProbed = Uses64BitFramePtr ? X86::R11
1285 : Is64Bit ? X86::R11D
1286 : X86::EAX;
1288 // Setup entry block
1291 entryMBB->splice(entryMBB->end(), &MBB, MBB.begin(), MBBI);
1292 BuildMI(entryMBB, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
1293 .addReg(StackPtr)
1294 .setMIFlag(MachineInstr::FrameSetup);
1295 MachineInstr *MI =
1296 BuildMI(entryMBB, DL, TII.get(AndOp), FinalStackProbed)
1297 .addReg(FinalStackProbed)
1298 .addImm(Val)
1299 .setMIFlag(MachineInstr::FrameSetup);
1301 // The EFLAGS implicit def is dead.
1302 MI->getOperand(3).setIsDead();
1304 BuildMI(entryMBB, DL,
1305 TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
1306 .addReg(FinalStackProbed)
1307 .addReg(StackPtr)
1308 .setMIFlag(MachineInstr::FrameSetup);
1309 BuildMI(entryMBB, DL, TII.get(X86::JCC_1))
1310 .addMBB(&MBB)
1311 .addImm(X86::COND_E)
1312 .setMIFlag(MachineInstr::FrameSetup);
1313 entryMBB->addSuccessor(headMBB);
1314 entryMBB->addSuccessor(&MBB);
1317 // Loop entry block
1320 const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);
1321 BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr)
1322 .addReg(StackPtr)
1323 .addImm(StackProbeSize)
1324 .setMIFlag(MachineInstr::FrameSetup);
1326 BuildMI(headMBB, DL,
1327 TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
1328 .addReg(StackPtr)
1329 .addReg(FinalStackProbed)
1330 .setMIFlag(MachineInstr::FrameSetup);
1332 // jump to the footer if StackPtr < FinalStackProbed
1333 BuildMI(headMBB, DL, TII.get(X86::JCC_1))
1334 .addMBB(footMBB)
1335 .addImm(X86::COND_B)
1336 .setMIFlag(MachineInstr::FrameSetup);
1338 headMBB->addSuccessor(bodyMBB);
1339 headMBB->addSuccessor(footMBB);
1342 // setup loop body
1344 addRegOffset(BuildMI(bodyMBB, DL, TII.get(MovMIOpc))
1345 .setMIFlag(MachineInstr::FrameSetup),
1346 StackPtr, false, 0)
1347 .addImm(0)
1348 .setMIFlag(MachineInstr::FrameSetup);
1350 const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);
1351 BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr)
1352 .addReg(StackPtr)
1353 .addImm(StackProbeSize)
1354 .setMIFlag(MachineInstr::FrameSetup);
1356 // cmp with stack pointer bound
1357 BuildMI(bodyMBB, DL,
1358 TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
1359 .addReg(FinalStackProbed)
1360 .addReg(StackPtr)
1361 .setMIFlag(MachineInstr::FrameSetup);
1363 // jump back while FinalStackProbed < StackPtr
1364 BuildMI(bodyMBB, DL, TII.get(X86::JCC_1))
1365 .addMBB(bodyMBB)
1366 .addImm(X86::COND_B)
1367 .setMIFlag(MachineInstr::FrameSetup);
1368 bodyMBB->addSuccessor(bodyMBB);
1369 bodyMBB->addSuccessor(footMBB);
1372 // setup loop footer
1374 BuildMI(footMBB, DL, TII.get(TargetOpcode::COPY), StackPtr)
1375 .addReg(FinalStackProbed)
1376 .setMIFlag(MachineInstr::FrameSetup);
1377 addRegOffset(BuildMI(footMBB, DL, TII.get(MovMIOpc))
1378 .setMIFlag(MachineInstr::FrameSetup),
1379 StackPtr, false, 0)
1380 .addImm(0)
1381 .setMIFlag(MachineInstr::FrameSetup);
1382 footMBB->addSuccessor(&MBB);
1385 fullyRecomputeLiveIns({footMBB, bodyMBB, headMBB, &MBB});
1387 } else {
1388 MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
1389 .addReg(Reg)
1390 .addImm(Val)
1391 .setMIFlag(MachineInstr::FrameSetup);
1393 // The EFLAGS implicit def is dead.
1394 MI->getOperand(3).setIsDead();
1398 bool X86FrameLowering::has128ByteRedZone(const MachineFunction &MF) const {
1399 // x86-64 (non-Win64) has a 128-byte red zone which is guaranteed not to be
1400 // clobbered by any interrupt handler.
1401 assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
1402 "MF used frame lowering for wrong subtarget");
1403 const Function &Fn = MF.getFunction();
1404 const bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());
1405 return Is64Bit && !IsWin64CC && !Fn.hasFnAttribute(Attribute::NoRedZone);
1408 /// Return true if we need to use the restricted Windows x64 prologue and
1409 /// epilogue code patterns that can be described with WinCFI (.seh_*
1410 /// directives).
1411 bool X86FrameLowering::isWin64Prologue(const MachineFunction &MF) const {
1412 return MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
1415 bool X86FrameLowering::needsDwarfCFI(const MachineFunction &MF) const {
1416 return !isWin64Prologue(MF) && MF.needsFrameMoves();
1419 /// Return true if an opcode is part of the REP group of instructions
1420 static bool isOpcodeRep(unsigned Opcode) {
1421 switch (Opcode) {
1422 case X86::REPNE_PREFIX:
1423 case X86::REP_MOVSB_32:
1424 case X86::REP_MOVSB_64:
1425 case X86::REP_MOVSD_32:
1426 case X86::REP_MOVSD_64:
1427 case X86::REP_MOVSQ_32:
1428 case X86::REP_MOVSQ_64:
1429 case X86::REP_MOVSW_32:
1430 case X86::REP_MOVSW_64:
1431 case X86::REP_PREFIX:
1432 case X86::REP_STOSB_32:
1433 case X86::REP_STOSB_64:
1434 case X86::REP_STOSD_32:
1435 case X86::REP_STOSD_64:
1436 case X86::REP_STOSQ_32:
1437 case X86::REP_STOSQ_64:
1438 case X86::REP_STOSW_32:
1439 case X86::REP_STOSW_64:
1440 return true;
1441 default:
1442 break;
1444 return false;
1447 /// emitPrologue - Push callee-saved registers onto the stack, which
1448 /// automatically adjusts the stack pointer. Adjust the stack pointer to allocate
1449 /// space for local variables. Also emit labels used by the exception handler to
1450 /// generate the exception handling frames.
1453 Here's a gist of what gets emitted:
1455 ; Establish frame pointer, if needed
1456 [if needs FP]
1457 push %rbp
1458 .cfi_def_cfa_offset 16
1459 .cfi_offset %rbp, -16
1460 .seh_pushreg %rbp
1461 mov %rsp, %rbp
1462 .cfi_def_cfa_register %rbp
1464 ; Spill general-purpose registers
1465 [for all callee-saved GPRs]
1466 pushq %<reg>
1467 [if not needs FP]
1468 .cfi_def_cfa_offset (offset from RETADDR)
1469 .seh_pushreg %<reg>
1471 ; If the required stack alignment > default stack alignment
1472 ; rsp needs to be re-aligned. This creates a "re-alignment gap"
1473 ; of unknown size in the stack frame.
1474 [if stack needs re-alignment]
1475 and $MASK, %rsp
1477 ; Allocate space for locals
1478 [if target is Windows and allocated space > 4096 bytes]
1479 ; Windows needs special care for allocations larger
1480 ; than one page.
1481 mov $NNN, %rax
1482 call ___chkstk_ms/___chkstk
1483 sub %rax, %rsp
1484 [else]
1485 sub $NNN, %rsp
1487 [if needs FP]
1488 .seh_stackalloc (size of XMM spill slots)
1489 .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
1490 [else]
1491 .seh_stackalloc NNN
1493 ; Spill XMMs
1494 ; Note that while only the Windows 64 ABI specifies XMMs as callee-preserved,
1495 ; they may get spilled on any platform, if the current function
1496 ; calls @llvm.eh.unwind.init
1497 [if needs FP]
1498 [for all callee-saved XMM registers]
1499 movaps %<xmm reg>, -MMM(%rbp)
1500 [for all callee-saved XMM registers]
1501 .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
1502 ; i.e. the offset relative to (%rbp - SEHFrameOffset)
1503 [else]
1504 [for all callee-saved XMM registers]
1505 movaps %<xmm reg>, KKK(%rsp)
1506 [for all callee-saved XMM registers]
1507 .seh_savexmm %<xmm reg>, KKK
1509 .seh_endprologue
1511 [if needs base pointer]
1512 mov %rsp, %rbx
1513 [if needs to restore base pointer]
1514 mov %rsp, -MMM(%rbp)
1516 ; Emit CFI info
1517 [if needs FP]
1518 [for all callee-saved registers]
1519 .cfi_offset %<reg>, (offset from %rbp)
1520 [else]
1521 .cfi_def_cfa_offset (offset from RETADDR)
1522 [for all callee-saved registers]
1523 .cfi_offset %<reg>, (offset from %rsp)
1525 Notes:
1526 - .seh directives are emitted only for Windows 64 ABI
1527 - .cv_fpo directives are emitted on win32 when emitting CodeView
1528 - .cfi directives are emitted for all other ABIs
1529 - for 32-bit code, substitute %e?? registers for %r??
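For illustration only (hypothetical function, not taken from real output): a
small non-leaf x86-64 function with a frame pointer and 40 bytes of locals
would typically come out as

  push %rbp
  .cfi_def_cfa_offset 16
  .cfi_offset %rbp, -16
  mov %rsp, %rbp
  .cfi_def_cfa_register %rbp
  sub $40, %rsp

following the steps above.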
1532 void X86FrameLowering::emitPrologue(MachineFunction &MF,
1533 MachineBasicBlock &MBB) const {
1534 assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
1535 "MF used frame lowering for wrong subtarget");
1536 MachineBasicBlock::iterator MBBI = MBB.begin();
1537 MachineFrameInfo &MFI = MF.getFrameInfo();
1538 const Function &Fn = MF.getFunction();
1539 X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
1540 uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
1541 uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.
1542 bool IsFunclet = MBB.isEHFuncletEntry();
1543 EHPersonality Personality = EHPersonality::Unknown;
1544 if (Fn.hasPersonalityFn())
1545 Personality = classifyEHPersonality(Fn.getPersonalityFn());
1546 bool FnHasClrFunclet =
1547 MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
1548 bool IsClrFunclet = IsFunclet && FnHasClrFunclet;
1549 bool HasFP = hasFP(MF);
1550 bool IsWin64Prologue = isWin64Prologue(MF);
1551 bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry();
1552 // FIXME: Emit FPO data for EH funclets.
1553 bool NeedsWinFPO = !IsFunclet && STI.isTargetWin32() &&
1554 MF.getFunction().getParent()->getCodeViewFlag();
1555 bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO;
1556 bool NeedsDwarfCFI = needsDwarfCFI(MF);
1557 Register FramePtr = TRI->getFrameRegister(MF);
1558 const Register MachineFramePtr =
1559 STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))
1560 : FramePtr;
1561 Register BasePtr = TRI->getBaseRegister();
1562 bool HasWinCFI = false;
1564 // Debug location must be unknown since the first debug location is used
1565 // to determine the end of the prologue.
1566 DebugLoc DL;
1567 Register ArgBaseReg;
1569 // Emit extra prolog for argument stack slot reference.
1570 if (auto *MI = X86FI->getStackPtrSaveMI()) {
1571 // MI is the LEA instruction created in X86ArgumentStackSlotPass.
1572 // Create an extra prolog for stack realignment.
1573 ArgBaseReg = MI->getOperand(0).getReg();
1574 // leal 4(%esp), %basereg
1575 // .cfi_def_cfa %basereg, 0
1576 // andl $-128, %esp
1577 // pushl -4(%basereg)
1578 BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::LEA64r : X86::LEA32r),
1579 ArgBaseReg)
1580 .addUse(StackPtr)
1581 .addImm(1)
1582 .addUse(X86::NoRegister)
1583 .addImm(SlotSize)
1584 .addUse(X86::NoRegister)
1585 .setMIFlag(MachineInstr::FrameSetup);
1586 if (NeedsDwarfCFI) {
1587 // .cfi_def_cfa %basereg, 0
1588 unsigned DwarfStackPtr = TRI->getDwarfRegNum(ArgBaseReg, true);
1589 BuildCFI(MBB, MBBI, DL,
1590 MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, 0),
1591 MachineInstr::FrameSetup);
1593 BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);
1594 int64_t Offset = -(int64_t)SlotSize;
1595 BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm))
1596 .addReg(ArgBaseReg)
1597 .addImm(1)
1598 .addReg(X86::NoRegister)
1599 .addImm(Offset)
1600 .addReg(X86::NoRegister)
1601 .setMIFlag(MachineInstr::FrameSetup);
1604 // Space reserved for stack-based arguments when making an (ABI-guaranteed)
1605 // tail call.
1606 unsigned TailCallArgReserveSize = -X86FI->getTCReturnAddrDelta();
1607 if (TailCallArgReserveSize && IsWin64Prologue)
1608 report_fatal_error("Can't handle guaranteed tail call under win64 yet");
1610 const bool EmitStackProbeCall =
1611 STI.getTargetLowering()->hasStackProbeSymbol(MF);
1612 unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF);
1614 if (HasFP && X86FI->hasSwiftAsyncContext()) {
1615 switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
1616 case SwiftAsyncFramePointerMode::DeploymentBased:
1617 if (STI.swiftAsyncContextIsDynamicallySet()) {
1618 // The special symbol below is absolute and has a *value* suitable to be
1619 // combined with the frame pointer directly.
1620 BuildMI(MBB, MBBI, DL, TII.get(X86::OR64rm), MachineFramePtr)
1621 .addUse(MachineFramePtr)
1622 .addUse(X86::RIP)
1623 .addImm(1)
1624 .addUse(X86::NoRegister)
1625 .addExternalSymbol("swift_async_extendedFramePointerFlags",
1626 X86II::MO_GOTPCREL)
1627 .addUse(X86::NoRegister);
1628 break;
1630 [[fallthrough]];
1632 case SwiftAsyncFramePointerMode::Always:
1633 assert(
1634 !IsWin64Prologue &&
1635 "win64 prologue does not set the bit 60 in the saved frame pointer");
1636 BuildMI(MBB, MBBI, DL, TII.get(X86::BTS64ri8), MachineFramePtr)
1637 .addUse(MachineFramePtr)
1638 .addImm(60)
1639 .setMIFlag(MachineInstr::FrameSetup);
1640 break;
1642 case SwiftAsyncFramePointerMode::Never:
1643 break;
1647 // Re-align the stack on 64-bit if the x86-interrupt calling convention is
1648 // used and an error code was pushed, since the x86-64 ABI requires a 16-byte
1649 // stack alignment.
1650 if (Fn.getCallingConv() == CallingConv::X86_INTR && Is64Bit &&
1651 Fn.arg_size() == 2) {
1652 StackSize += 8;
1653 MFI.setStackSize(StackSize);
1655 // Update the stack pointer by pushing a register. This is the instruction
1656 // that would end up being emitted by a call to `emitSPUpdate`.
1657 // Hard-coding the update to a push avoids emitting a second
1658 // `STACKALLOC_W_PROBING` instruction in the save block: We know that stack
1659 // probing isn't needed anyways for an 8-byte update.
1660 // Pushing a register leaves us in a similar situation to a regular
1661 // function call where we know that the address at (rsp-8) is writeable.
1662 // That way we avoid any off-by-ones with stack probing for additional
1663 // stack pointer updates later on.
1664 BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
1665 .addReg(X86::RAX, RegState::Undef)
1666 .setMIFlag(MachineInstr::FrameSetup);
1669 // If this is x86-64 and the Red Zone is not disabled, if we are a leaf
1670 // function, and use up to 128 bytes of stack space, don't have a frame
1671 // pointer, calls, or dynamic alloca then we do not need to adjust the
1672 // stack pointer (we fit in the Red Zone). We also check that we don't
1673 // push and pop from the stack.
1674 if (has128ByteRedZone(MF) && !TRI->hasStackRealignment(MF) &&
1675 !MFI.hasVarSizedObjects() && // No dynamic alloca.
1676 !MFI.adjustsStack() && // No calls.
1677 !EmitStackProbeCall && // No stack probes.
1678 !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
1679 !MF.shouldSplitStack()) { // Regular stack
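// Worked example (hypothetical numbers): a leaf function with
// StackSize == 40, no callee-saved spills and no frame pointer keeps
// MinSize == 0, so the max() below shrinks StackSize to 0 and the locals
// live entirely in the 128-byte red zone below RSP.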
1680 uint64_t MinSize =
1681 X86FI->getCalleeSavedFrameSize() - X86FI->getTCReturnAddrDelta();
1682 if (HasFP)
1683 MinSize += SlotSize;
1684 X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
1685 StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
1686 MFI.setStackSize(StackSize);
1689 // Insert stack pointer adjustment for later moving of return addr. Only
1690 // applies to tail call optimized functions where the callee argument stack
1691 // size is bigger than the caller's.
1692 if (TailCallArgReserveSize != 0) {
1693 BuildStackAdjustment(MBB, MBBI, DL, -(int)TailCallArgReserveSize,
1694 /*InEpilogue=*/false)
1695 .setMIFlag(MachineInstr::FrameSetup);
1698 // Mapping for machine moves:
1700 // DST: VirtualFP AND
1701 // SRC: VirtualFP => DW_CFA_def_cfa_offset
1702 // ELSE => DW_CFA_def_cfa
1704 // SRC: VirtualFP AND
1705 // DST: Register => DW_CFA_def_cfa_register
1707 // ELSE
1708 // OFFSET < 0 => DW_CFA_offset_extended_sf
1709 // REG < 64 => DW_CFA_offset + Reg
1710 // ELSE => DW_CFA_offset_extended
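// For example, a move with DST == VirtualFP and SRC == VirtualFP at offset
// 16 maps to DW_CFA_def_cfa_offset 16, while a move of VirtualFP into RBP
// maps to DW_CFA_def_cfa_register %rbp.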
1712 uint64_t NumBytes = 0;
1713 int stackGrowth = -SlotSize;
1715 // Find the funclet establisher parameter
1716 Register Establisher = X86::NoRegister;
1717 if (IsClrFunclet)
1718 Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX;
1719 else if (IsFunclet)
1720 Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX;
1722 if (IsWin64Prologue && IsFunclet && !IsClrFunclet) {
1723 // Immediately spill establisher into the home slot.
1724 // The runtime cares about this.
1725 // MOV64mr %rdx, 16(%rsp)
1726 unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
1727 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16)
1728 .addReg(Establisher)
1729 .setMIFlag(MachineInstr::FrameSetup);
1730 MBB.addLiveIn(Establisher);
1733 if (HasFP) {
1734 assert(MF.getRegInfo().isReserved(MachineFramePtr) && "FP reserved");
1736 // Calculate required stack adjustment.
1737 uint64_t FrameSize = StackSize - SlotSize;
1738 NumBytes =
1739 FrameSize - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);
1741 // Callee-saved registers are pushed on stack before the stack is realigned.
1742 if (TRI->hasStackRealignment(MF) && !IsWin64Prologue)
1743 NumBytes = alignTo(NumBytes, MaxAlign);
1745 // Save EBP/RBP into the appropriate stack slot.
1746 BuildMI(MBB, MBBI, DL,
1747 TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>())))
1748 .addReg(MachineFramePtr, RegState::Kill)
1749 .setMIFlag(MachineInstr::FrameSetup);
1751 if (NeedsDwarfCFI && !ArgBaseReg.isValid()) {
1752 // Mark the place where EBP/RBP was saved.
1753 // Define the current CFA rule to use the provided offset.
1754 assert(StackSize);
1755 BuildCFI(MBB, MBBI, DL,
1756 MCCFIInstruction::cfiDefCfaOffset(
1757 nullptr, -2 * stackGrowth + (int)TailCallArgReserveSize),
1758 MachineInstr::FrameSetup);
1760 // Change the rule for the FramePtr to be an "offset" rule.
1761 unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
1762 BuildCFI(MBB, MBBI, DL,
1763 MCCFIInstruction::createOffset(nullptr, DwarfFramePtr,
1764 2 * stackGrowth -
1765 (int)TailCallArgReserveSize),
1766 MachineInstr::FrameSetup);
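// With SlotSize == 8 (stackGrowth == -8) and no tail-call reserve, the two
// directives above come out as ".cfi_def_cfa_offset 16" and
// ".cfi_offset %rbp, -16", matching the prologue gist.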
1769 if (NeedsWinCFI) {
1770 HasWinCFI = true;
1771 BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
1772 .addImm(FramePtr)
1773 .setMIFlag(MachineInstr::FrameSetup);
1776 if (!IsFunclet) {
1777 if (X86FI->hasSwiftAsyncContext()) {
1778 assert(!IsWin64Prologue &&
1779 "win64 prologue does not store async context right below rbp");
1780 const auto &Attrs = MF.getFunction().getAttributes();
1782 // Before we update the live frame pointer we have to ensure there's a
1783 // valid (or null) asynchronous context in its slot just before FP in
1784 // the frame record, so store it now.
1785 if (Attrs.hasAttrSomewhere(Attribute::SwiftAsync)) {
1786 // We have an initial context in r14, store it just before the frame
1787 // pointer.
1788 MBB.addLiveIn(X86::R14);
1789 BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
1790 .addReg(X86::R14)
1791 .setMIFlag(MachineInstr::FrameSetup);
1792 } else {
1793 // No initial context, store null so that there's no pointer that
1794 // could be misused.
1795 BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64i32))
1796 .addImm(0)
1797 .setMIFlag(MachineInstr::FrameSetup);
1800 if (NeedsWinCFI) {
1801 HasWinCFI = true;
1802 BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
1803 .addImm(X86::R14)
1804 .setMIFlag(MachineInstr::FrameSetup);
1807 BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr)
1808 .addUse(X86::RSP)
1809 .addImm(1)
1810 .addUse(X86::NoRegister)
1811 .addImm(8)
1812 .addUse(X86::NoRegister)
1813 .setMIFlag(MachineInstr::FrameSetup);
1814 BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64ri32), X86::RSP)
1815 .addUse(X86::RSP)
1816 .addImm(8)
1817 .setMIFlag(MachineInstr::FrameSetup);
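// At this point the frame record is [rbp] = saved rbp and
// [rbp - 8] = async context (or null); the extra 8-byte SUB keeps the stack
// 16-byte aligned, mirroring the second slot reserved in
// assignCalleeSavedSpillSlots.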
1820 if (!IsWin64Prologue && !IsFunclet) {
1821 // Update EBP with the new base value.
1822 if (!X86FI->hasSwiftAsyncContext())
1823 BuildMI(MBB, MBBI, DL,
1824 TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
1825 FramePtr)
1826 .addReg(StackPtr)
1827 .setMIFlag(MachineInstr::FrameSetup);
1829 if (NeedsDwarfCFI) {
1830 if (ArgBaseReg.isValid()) {
1831 SmallString<64> CfaExpr;
1832 CfaExpr.push_back(dwarf::DW_CFA_expression);
1833 uint8_t buffer[16];
1834 unsigned DwarfReg = TRI->getDwarfRegNum(MachineFramePtr, true);
1835 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
1836 CfaExpr.push_back(2);
1837 CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
1838 CfaExpr.push_back(0);
1839 // DW_CFA_expression: reg5 DW_OP_breg5 +0
1840 BuildCFI(MBB, MBBI, DL,
1841 MCCFIInstruction::createEscape(nullptr, CfaExpr.str()),
1842 MachineInstr::FrameSetup);
1843 } else {
1844 // Mark effective beginning of when frame pointer becomes valid.
1845 // Define the current CFA to use the EBP/RBP register.
1846 unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
1847 BuildCFI(
1848 MBB, MBBI, DL,
1849 MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr),
1850 MachineInstr::FrameSetup);
1854 if (NeedsWinFPO) {
1855 // .cv_fpo_setframe $FramePtr
1856 HasWinCFI = true;
1857 BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
1858 .addImm(FramePtr)
1859 .addImm(0)
1860 .setMIFlag(MachineInstr::FrameSetup);
1864 } else {
1865 assert(!IsFunclet && "funclets without FPs not yet implemented");
1866 NumBytes =
1867 StackSize - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);
1870 // Update the offset adjustment, which is mainly used by codeview to translate
1871 // from ESP to VFRAME relative local variable offsets.
1872 if (!IsFunclet) {
1873 if (HasFP && TRI->hasStackRealignment(MF))
1874 MFI.setOffsetAdjustment(-NumBytes);
1875 else
1876 MFI.setOffsetAdjustment(-StackSize);
1879 // For EH funclets, only allocate enough space for outgoing calls. Save the
1880 // NumBytes value that we would've used for the parent frame.
1881 unsigned ParentFrameNumBytes = NumBytes;
1882 if (IsFunclet)
1883 NumBytes = getWinEHFuncletFrameSize(MF);
1885 // Skip the callee-saved push instructions.
1886 bool PushedRegs = false;
1887 int StackOffset = 2 * stackGrowth;
1888 MachineBasicBlock::const_iterator LastCSPush = MBBI;
1889 auto IsCSPush = [&](const MachineBasicBlock::iterator &MBBI) {
1890 if (MBBI == MBB.end() || !MBBI->getFlag(MachineInstr::FrameSetup))
1891 return false;
1892 unsigned Opc = MBBI->getOpcode();
1893 return Opc == X86::PUSH32r || Opc == X86::PUSH64r || Opc == X86::PUSHP64r ||
1894 Opc == X86::PUSH2 || Opc == X86::PUSH2P;
1897 while (IsCSPush(MBBI)) {
1898 PushedRegs = true;
1899 Register Reg = MBBI->getOperand(0).getReg();
1900 LastCSPush = MBBI;
1901 ++MBBI;
1902 unsigned Opc = LastCSPush->getOpcode();
1904 if (!HasFP && NeedsDwarfCFI) {
1905 // Mark callee-saved push instruction.
1906 // Define the current CFA rule to use the provided offset.
1907 assert(StackSize);
1908 // Compared to push, push2 introduces more stack offset (one more
1909 // register).
1910 if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
1911 StackOffset += stackGrowth;
1912 BuildCFI(MBB, MBBI, DL,
1913 MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset),
1914 MachineInstr::FrameSetup);
1915 StackOffset += stackGrowth;
1918 if (NeedsWinCFI) {
1919 HasWinCFI = true;
1920 BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
1921 .addImm(Reg)
1922 .setMIFlag(MachineInstr::FrameSetup);
1923 if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
1924 BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
1925 .addImm(LastCSPush->getOperand(1).getReg())
1926 .setMIFlag(MachineInstr::FrameSetup);
1930 // Realign stack after we pushed callee-saved registers (so that we'll be
1931 // able to calculate their offsets from the frame pointer).
1932 // Don't do this for Win64, it needs to realign the stack after the prologue.
1933 if (!IsWin64Prologue && !IsFunclet && TRI->hasStackRealignment(MF) &&
1934 !ArgBaseReg.isValid()) {
1935 assert(HasFP && "There should be a frame pointer if stack is realigned.");
1936 BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);
1938 if (NeedsWinCFI) {
1939 HasWinCFI = true;
1940 BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlign))
1941 .addImm(MaxAlign)
1942 .setMIFlag(MachineInstr::FrameSetup);
1946 // If there is a SUB32ri of ESP immediately before this instruction, merge
1947 // the two. This can be the case when tail call elimination is enabled and
1948 // the callee has more arguments than the caller.
1949 NumBytes -= mergeSPUpdates(MBB, MBBI, true);
1951 // Adjust stack pointer: ESP -= numbytes.
1953 // Windows and cygwin/mingw require a prologue helper routine when allocating
1954 // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw
1955 // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the
1956 // stack and adjust the stack pointer in one go. The 64-bit version of
1957 // __chkstk is only responsible for probing the stack. The 64-bit prologue is
1958 // responsible for adjusting the stack pointer. Touching the stack at 4K
1959 // increments is necessary to ensure that the guard pages used by the OS
1960 // virtual memory manager are allocated in correct sequence.
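// (See the prologue gist above: "mov $NNN, %rax; call ___chkstk_ms/___chkstk;
// sub %rax, %rsp" for the 64-bit case.)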
1961 uint64_t AlignedNumBytes = NumBytes;
1962 if (IsWin64Prologue && !IsFunclet && TRI->hasStackRealignment(MF))
1963 AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
1964 if (AlignedNumBytes >= StackProbeSize && EmitStackProbeCall) {
1965 assert(!X86FI->getUsesRedZone() &&
1966 "The Red Zone is not accounted for in stack probes");
1968 // Check whether EAX is livein for this block.
1969 bool isEAXAlive = isEAXLiveIn(MBB);
1971 if (isEAXAlive) {
1972 if (Is64Bit) {
1973 // Save RAX
1974 BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
1975 .addReg(X86::RAX, RegState::Kill)
1976 .setMIFlag(MachineInstr::FrameSetup);
1977 } else {
1978 // Save EAX
1979 BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
1980 .addReg(X86::EAX, RegState::Kill)
1981 .setMIFlag(MachineInstr::FrameSetup);
1985 if (Is64Bit) {
1986 // Handle the 64-bit Windows ABI case where we need to call __chkstk.
1987 // Function prologue is responsible for adjusting the stack pointer.
1988 int64_t Alloc = isEAXAlive ? NumBytes - 8 : NumBytes;
1989 BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Alloc)), X86::RAX)
1990 .addImm(Alloc)
1991 .setMIFlag(MachineInstr::FrameSetup);
1992 } else {
1993 // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
1994 // We'll also use 4 already allocated bytes for EAX.
1995 BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
1996 .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
1997 .setMIFlag(MachineInstr::FrameSetup);
2000 // Call __chkstk, __chkstk_ms, or __alloca.
2001 emitStackProbe(MF, MBB, MBBI, DL, true);
2003 if (isEAXAlive) {
2004 // Restore RAX/EAX
2005 MachineInstr *MI;
2006 if (Is64Bit)
2007 MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV64rm), X86::RAX),
2008 StackPtr, false, NumBytes - 8);
2009 else
2010 MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
2011 StackPtr, false, NumBytes - 4);
2012 MI->setFlag(MachineInstr::FrameSetup);
2013 MBB.insert(MBBI, MI);
2015 } else if (NumBytes) {
2016 emitSPUpdate(MBB, MBBI, DL, -(int64_t)NumBytes, /*InEpilogue=*/false);
2019 if (NeedsWinCFI && NumBytes) {
2020 HasWinCFI = true;
2021 BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
2022 .addImm(NumBytes)
2023 .setMIFlag(MachineInstr::FrameSetup);
2026 int SEHFrameOffset = 0;
2027 unsigned SPOrEstablisher;
2028 if (IsFunclet) {
2029 if (IsClrFunclet) {
2030 // The establisher parameter passed to a CLR funclet is actually a pointer
2031 // to the (mostly empty) frame of its nearest enclosing funclet; we have
2032 // to find the root function establisher frame by loading the PSPSym from
2033 // the intermediate frame.
2034 unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
2035 MachinePointerInfo NoInfo;
2036 MBB.addLiveIn(Establisher);
2037 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher),
2038 Establisher, false, PSPSlotOffset)
2039 .addMemOperand(MF.getMachineMemOperand(
2040 NoInfo, MachineMemOperand::MOLoad, SlotSize, Align(SlotSize)));
2042 // Save the root establisher back into the current funclet's (mostly
2043 // empty) frame, in case a sub-funclet or the GC needs it.
2044 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,
2045 false, PSPSlotOffset)
2046 .addReg(Establisher)
2047 .addMemOperand(MF.getMachineMemOperand(
2048 NoInfo,
2049 MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
2050 SlotSize, Align(SlotSize)));
2052 SPOrEstablisher = Establisher;
2053 } else {
2054 SPOrEstablisher = StackPtr;
2057 if (IsWin64Prologue && HasFP) {
2058 // Set RBP to a small fixed offset from RSP. In the funclet case, we base
2059 // this calculation on the incoming establisher, which holds the value of
2060 // RSP from the parent frame at the end of the prologue.
2061 SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes);
2062 if (SEHFrameOffset)
2063 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
2064 SPOrEstablisher, false, SEHFrameOffset);
2065 else
2066 BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr)
2067 .addReg(SPOrEstablisher);
2069 // If this is not a funclet, emit the CFI describing our frame pointer.
2070 if (NeedsWinCFI && !IsFunclet) {
2071 assert(!NeedsWinFPO && "this setframe incompatible with FPO data");
2072 HasWinCFI = true;
2073 BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
2074 .addImm(FramePtr)
2075 .addImm(SEHFrameOffset)
2076 .setMIFlag(MachineInstr::FrameSetup);
2077 if (isAsynchronousEHPersonality(Personality))
2078 MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset;
2080 } else if (IsFunclet && STI.is32Bit()) {
2081 // Reset EBP / ESI to something good for funclets.
2082 MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL);
2083 // If we're a catch funclet, we can be returned to via catchret. Save ESP
2084 // into the registration node so that the runtime will restore it for us.
2085 if (!MBB.isCleanupFuncletEntry()) {
2086 assert(Personality == EHPersonality::MSVC_CXX);
2087 Register FrameReg;
2088 int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;
2089 int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg).getFixed();
2090 // ESP is the first field, so no extra displacement is needed.
2091 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg,
2092 false, EHRegOffset)
2093 .addReg(X86::ESP);
2097 while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
2098 const MachineInstr &FrameInstr = *MBBI;
2099 ++MBBI;
2101 if (NeedsWinCFI) {
2102 int FI;
2103 if (Register Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
2104 if (X86::FR64RegClass.contains(Reg)) {
2105 int Offset;
2106 Register IgnoredFrameReg;
2107 if (IsWin64Prologue && IsFunclet)
2108 Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);
2109 else
2110 Offset =
2111 getFrameIndexReference(MF, FI, IgnoredFrameReg).getFixed() +
2112 SEHFrameOffset;
2114 HasWinCFI = true;
2115 assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
2116 BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
2117 .addImm(Reg)
2118 .addImm(Offset)
2119 .setMIFlag(MachineInstr::FrameSetup);
2125 if (NeedsWinCFI && HasWinCFI)
2126 BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
2127 .setMIFlag(MachineInstr::FrameSetup);
2129 if (FnHasClrFunclet && !IsFunclet) {
2130 // Save the so-called Initial-SP (i.e. the value of the stack pointer
2131 // immediately after the prolog) into the PSPSlot so that funclets
2132 // and the GC can recover it.
2133 unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
2134 auto PSPInfo = MachinePointerInfo::getFixedStack(
2135 MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx);
2136 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false,
2137 PSPSlotOffset)
2138 .addReg(StackPtr)
2139 .addMemOperand(MF.getMachineMemOperand(
2140 PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
2141 SlotSize, Align(SlotSize)));
2144 // Realign stack after we spilled callee-saved registers (so that we'll be
2145 // able to calculate their offsets from the frame pointer).
2146 // Win64 requires aligning the stack after the prologue.
2147 if (IsWin64Prologue && TRI->hasStackRealignment(MF)) {
2148 assert(HasFP && "There should be a frame pointer if stack is realigned.");
2149 BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign);
2152 // We already dealt with stack realignment and funclets above.
2153 if (IsFunclet && STI.is32Bit())
2154 return;
2156 // If we need a base pointer, set it up here. It's whatever the value
2157 // of the stack pointer is at this point. Any variable size objects
2158 // will be allocated after this, so we can still use the base pointer
2159 // to reference locals.
2160 if (TRI->hasBasePointer(MF)) {
2161 // Update the base pointer with the current stack pointer.
2162 unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
2163 BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
2164 .addReg(SPOrEstablisher)
2165 .setMIFlag(MachineInstr::FrameSetup);
2166 if (X86FI->getRestoreBasePointer()) {
2167 // Stash value of base pointer. Saving RSP instead of EBP shortens
2168 // dependence chain. Used by SjLj EH.
2169 unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
2170 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true,
2171 X86FI->getRestoreBasePointerOffset())
2172 .addReg(SPOrEstablisher)
2173 .setMIFlag(MachineInstr::FrameSetup);
2176 if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) {
2177 // Stash the value of the frame pointer relative to the base pointer for
2178 // Win32 EH. This supports Win32 EH, which does the inverse of the above:
2179 // it recovers the frame pointer from the base pointer rather than the
2180 // other way around.
2181 unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
2182 Register UsedReg;
2183 int Offset =
2184 getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
2185 .getFixed();
2186 assert(UsedReg == BasePtr);
2187 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset)
2188 .addReg(FramePtr)
2189 .setMIFlag(MachineInstr::FrameSetup);
2192 if (ArgBaseReg.isValid()) {
2193 // Save argument base pointer.
2194 auto *MI = X86FI->getStackPtrSaveMI();
2195 int FI = MI->getOperand(1).getIndex();
2196 unsigned MOVmr = Is64Bit ? X86::MOV64mr : X86::MOV32mr;
2197 // movl %basereg, offset(%ebp)
2198 addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), FI)
2199 .addReg(ArgBaseReg)
2200 .setMIFlag(MachineInstr::FrameSetup);
2203 if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
2204 // Mark end of stack pointer adjustment.
2205 if (!HasFP && NumBytes) {
2206 // Define the current CFA rule to use the provided offset.
2207 assert(StackSize);
2208 BuildCFI(
2209 MBB, MBBI, DL,
2210 MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth),
2211 MachineInstr::FrameSetup);
2214 // Emit DWARF info specifying the offsets of the callee-saved registers.
2215 emitCalleeSavedFrameMoves(MBB, MBBI, DL, true);
2218 // An X86 interrupt handling function cannot assume anything about the direction
2219 // flag (DF in the EFLAGS register). Clear this flag by emitting a "cld" instruction
2220 // in the prologue of each interrupt handler function.
2222 // Create "cld" instruction only in these cases:
2223 // 1. The interrupt handling function uses any of the "rep" instructions.
2224 // 2. The interrupt handling function calls another function.
2225 // 3. There are inline asm blocks, as we do not know what they do.
2227 // TODO: We should also emit cld if we detect the use of std, but as of now,
2228 // the compiler does not even emit that instruction or even define it, so in
2229 // practice, this would only happen with inline asm, which we cover anyway.
2230 if (Fn.getCallingConv() == CallingConv::X86_INTR) {
2231 bool NeedsCLD = false;
2233 for (const MachineBasicBlock &B : MF) {
2234 for (const MachineInstr &MI : B) {
2235 if (MI.isCall()) {
2236 NeedsCLD = true;
2237 break;
2240 if (isOpcodeRep(MI.getOpcode())) {
2241 NeedsCLD = true;
2242 break;
2245 if (MI.isInlineAsm()) {
2246 // TODO: Parse asm for rep instructions or call sites?
2247 // For now, let's play it safe and emit a cld instruction
2248 // just in case.
2249 NeedsCLD = true;
2250 break;
2255 if (NeedsCLD) {
2256 BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
2257 .setMIFlag(MachineInstr::FrameSetup);
2261 // At this point we know if the function has WinCFI or not.
2262 MF.setHasWinCFI(HasWinCFI);
2265 bool X86FrameLowering::canUseLEAForSPInEpilogue(
2266 const MachineFunction &MF) const {
2267 // We can't use LEA instructions for adjusting the stack pointer if we don't
2268 // have a frame pointer in the Win64 ABI. Only ADD instructions may be used
2269 // to deallocate the stack.
2270 // This means that we can use LEA for SP in two situations:
2271 // 1. We *aren't* using the Win64 ABI which means we are free to use LEA.
2272 // 2. We *have* a frame pointer which means we are permitted to use LEA.
2273 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF);
2276 static bool isFuncletReturnInstr(MachineInstr &MI) {
2277 switch (MI.getOpcode()) {
2278 case X86::CATCHRET:
2279 case X86::CLEANUPRET:
2280 return true;
2281 default:
2282 return false;
2284 llvm_unreachable("impossible");
2287 // CLR funclets use a special "Previous Stack Pointer Symbol" slot on the
2288 // stack. It holds a pointer to the bottom of the root function frame. The
2289 // establisher frame pointer passed to a nested funclet may point to the
2290 // (mostly empty) frame of its parent funclet, but it will need to find
2291 // the frame of the root function to access locals. To facilitate this,
2292 // every funclet copies the pointer to the bottom of the root function
2293 // frame into a PSPSym slot in its own (mostly empty) stack frame. Using the
2294 // same offset for the PSPSym in the root function frame that's used in the
2295 // funclets' frames allows each funclet to dynamically accept any ancestor
2296 // frame as its establisher argument (the runtime doesn't guarantee the
2297 // immediate parent for some reason lost to history), and also allows the GC,
2298 // which uses the PSPSym for some bookkeeping, to find it in any funclet's
2299 // frame with only a single offset reported for the entire method.
2300 unsigned
2301 X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
2302 const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();
2303 Register SPReg;
2304 int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg,
2305 /*IgnoreSPUpdates*/ true)
2306 .getFixed();
2307 assert(Offset >= 0 && SPReg == TRI->getStackRegister());
2308 return static_cast<unsigned>(Offset);
2311 unsigned
2312 X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
2313 const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
2314 // This is the size of the pushed CSRs.
2315 unsigned CSSize = X86FI->getCalleeSavedFrameSize();
2316 // This is the size of callee saved XMMs.
2317 const auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
2318 unsigned XMMSize =
2319 WinEHXMMSlotInfo.size() * TRI->getSpillSize(X86::VR128RegClass);
2320 // This is the amount of stack a funclet needs to allocate.
2321 unsigned UsedSize;
2322 EHPersonality Personality =
2323 classifyEHPersonality(MF.getFunction().getPersonalityFn());
2324 if (Personality == EHPersonality::CoreCLR) {
2325 // CLR funclets need to hold enough space to include the PSPSym, at the
2326 // same offset from the stack pointer (immediately after the prolog) as it
2327 // resides at in the main function.
2328 UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize;
2329 } else {
2330 // Other funclets just need enough stack for outgoing call arguments.
2331 UsedSize = MF.getFrameInfo().getMaxCallFrameSize();
2333 // RBP is not included in the callee saved register block. After pushing RBP,
2334 // everything is 16 byte aligned. Everything we allocate before an outgoing
2335 // call must also be 16 byte aligned.
2336 unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlign());
2337 // Subtract out the size of the callee saved registers. This is how much stack
2338 // each funclet will allocate.
2339 return FrameSizeMinusRBP + XMMSize - CSSize;
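// Worked example (hypothetical numbers): CSSize == 16, UsedSize == 32 and a
// 16-byte stack alignment give alignTo(48, 16) == 48, so each funclet
// allocates 48 - 16 == 32 bytes plus the XMM spill area.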
2342 static bool isTailCallOpcode(unsigned Opc) {
2343 return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||
2344 Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 ||
2345 Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64;
2348 void X86FrameLowering::emitEpilogue(MachineFunction &MF,
2349 MachineBasicBlock &MBB) const {
2350 const MachineFrameInfo &MFI = MF.getFrameInfo();
2351 X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
2352 MachineBasicBlock::iterator Terminator = MBB.getFirstTerminator();
2353 MachineBasicBlock::iterator MBBI = Terminator;
2354 DebugLoc DL;
2355 if (MBBI != MBB.end())
2356 DL = MBBI->getDebugLoc();
2357 // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
2358 const bool Is64BitILP32 = STI.isTarget64BitILP32();
2359 Register FramePtr = TRI->getFrameRegister(MF);
2360 Register MachineFramePtr =
2361 Is64BitILP32 ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;
2363 bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
2364 bool NeedsWin64CFI =
2365 IsWin64Prologue && MF.getFunction().needsUnwindTableEntry();
2366 bool IsFunclet = MBBI == MBB.end() ? false : isFuncletReturnInstr(*MBBI);
2368 // Get the number of bytes to allocate from the FrameInfo.
2369 uint64_t StackSize = MFI.getStackSize();
2370 uint64_t MaxAlign = calculateMaxStackAlign(MF);
2371 unsigned CSSize = X86FI->getCalleeSavedFrameSize();
2372 unsigned TailCallArgReserveSize = -X86FI->getTCReturnAddrDelta();
2373 bool HasFP = hasFP(MF);
2374 uint64_t NumBytes = 0;
2376 bool NeedsDwarfCFI = (!MF.getTarget().getTargetTriple().isOSDarwin() &&
2377 !MF.getTarget().getTargetTriple().isOSWindows()) &&
2378 MF.needsFrameMoves();
2380 Register ArgBaseReg;
2381 if (auto *MI = X86FI->getStackPtrSaveMI()) {
2382 unsigned Opc = X86::LEA32r;
2383 Register StackReg = X86::ESP;
2384 ArgBaseReg = MI->getOperand(0).getReg();
2385 if (STI.is64Bit()) {
2386 Opc = X86::LEA64r;
2387 StackReg = X86::RSP;
2389 // leal -4(%basereg), %esp
2390 // .cfi_def_cfa %esp, 4
2391 BuildMI(MBB, MBBI, DL, TII.get(Opc), StackReg)
2392 .addUse(ArgBaseReg)
2393 .addImm(1)
2394 .addUse(X86::NoRegister)
2395 .addImm(-(int64_t)SlotSize)
2396 .addUse(X86::NoRegister)
2397 .setMIFlag(MachineInstr::FrameDestroy);
2398 if (NeedsDwarfCFI) {
2399 unsigned DwarfStackPtr = TRI->getDwarfRegNum(StackReg, true);
2400 BuildCFI(MBB, MBBI, DL,
2401 MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize),
2402 MachineInstr::FrameDestroy);
2403 --MBBI;
2405 --MBBI;
2408 if (IsFunclet) {
2409 assert(HasFP && "EH funclets without FP not yet implemented");
2410 NumBytes = getWinEHFuncletFrameSize(MF);
2411 } else if (HasFP) {
2412 // Calculate required stack adjustment.
2413 uint64_t FrameSize = StackSize - SlotSize;
2414 NumBytes = FrameSize - CSSize - TailCallArgReserveSize;
2416 // Callee-saved registers were pushed on stack before the stack was
2417 // realigned.
2418 if (TRI->hasStackRealignment(MF) && !IsWin64Prologue)
2419 NumBytes = alignTo(FrameSize, MaxAlign);
2420 } else {
2421 NumBytes = StackSize - CSSize - TailCallArgReserveSize;
2423 uint64_t SEHStackAllocAmt = NumBytes;
2425 // AfterPop is the position to insert .cfi_restore.
2426 MachineBasicBlock::iterator AfterPop = MBBI;
2427 if (HasFP) {
2428 if (X86FI->hasSwiftAsyncContext()) {
2429 // Discard the context.
2430 int Offset = 16 + mergeSPUpdates(MBB, MBBI, true);
2431 emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
2433 // Pop EBP.
2434 BuildMI(MBB, MBBI, DL,
2435 TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())),
2436 MachineFramePtr)
2437 .setMIFlag(MachineInstr::FrameDestroy);
2439 // We need to reset FP to its untagged state on return. Bit 60 is currently
2440 // used to show the presence of an extended frame.
2441 if (X86FI->hasSwiftAsyncContext()) {
2442 BuildMI(MBB, MBBI, DL, TII.get(X86::BTR64ri8), MachineFramePtr)
2443 .addUse(MachineFramePtr)
2444 .addImm(60)
2445 .setMIFlag(MachineInstr::FrameDestroy);
2448 if (NeedsDwarfCFI) {
2449 if (!ArgBaseReg.isValid()) {
2450 unsigned DwarfStackPtr =
2451 TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
2452 BuildCFI(MBB, MBBI, DL,
2453 MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize),
2454 MachineInstr::FrameDestroy);
2456 if (!MBB.succ_empty() && !MBB.isReturnBlock()) {
2457 unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
2458 BuildCFI(MBB, AfterPop, DL,
2459 MCCFIInstruction::createRestore(nullptr, DwarfFramePtr),
2460 MachineInstr::FrameDestroy);
2461 --MBBI;
2462 --AfterPop;
2464 --MBBI;
2468 MachineBasicBlock::iterator FirstCSPop = MBBI;
2469 // Skip the callee-saved pop instructions.
2470 while (MBBI != MBB.begin()) {
2471 MachineBasicBlock::iterator PI = std::prev(MBBI);
2472 unsigned Opc = PI->getOpcode();
2474 if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
2475 if (!PI->getFlag(MachineInstr::FrameDestroy) ||
2476 (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::BTR64ri8 &&
2477 Opc != X86::ADD64ri32 && Opc != X86::POPP64r && Opc != X86::POP2 &&
2478 Opc != X86::POP2P && Opc != X86::LEA64r))
2479 break;
2480 FirstCSPop = PI;
2483 --MBBI;
2485 if (ArgBaseReg.isValid()) {
2486 // Restore argument base pointer.
2487 auto *MI = X86FI->getStackPtrSaveMI();
2488 int FI = MI->getOperand(1).getIndex();
2489 unsigned MOVrm = Is64Bit ? X86::MOV64rm : X86::MOV32rm;
2490 // movl offset(%ebp), %basereg
2491 addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(MOVrm), ArgBaseReg), FI)
2492 .setMIFlag(MachineInstr::FrameDestroy);
2494 MBBI = FirstCSPop;
2496 if (IsFunclet && Terminator->getOpcode() == X86::CATCHRET)
2497 emitCatchRetReturnValue(MBB, FirstCSPop, &*Terminator);
2499 if (MBBI != MBB.end())
2500 DL = MBBI->getDebugLoc();
2501 // If there is an ADD32ri or SUB32ri of ESP immediately before this
2502 // instruction, merge the two instructions.
2503 if (NumBytes || MFI.hasVarSizedObjects())
2504 NumBytes += mergeSPUpdates(MBB, MBBI, true);
2506 // If dynamic alloca is used, then reset esp to point to the last callee-saved
2507 // slot before popping them off! The same applies when the stack was
2508 // realigned. Don't do this if this was a funclet epilogue, since the funclets
2509 // will not do realignment or dynamic stack allocation.
2510 if (((TRI->hasStackRealignment(MF)) || MFI.hasVarSizedObjects()) &&
2511 !IsFunclet) {
2512 if (TRI->hasStackRealignment(MF))
2513 MBBI = FirstCSPop;
2514 unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);
2515 uint64_t LEAAmount =
2516 IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize;
2518 if (X86FI->hasSwiftAsyncContext())
2519 LEAAmount -= 16;
2521 // There are only two legal forms of epilogue:
2522 // - add SEHAllocationSize, %rsp
2523 // - lea SEHAllocationSize(%FramePtr), %rsp
2525 // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence.
2526 // However, we may use this sequence if we have a frame pointer because the
2527 // effects of the prologue can safely be undone.
2528 if (LEAAmount != 0) {
2529 unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
2530 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), FramePtr,
2531 false, LEAAmount);
2532 --MBBI;
2533 } else {
2534 unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
2535 BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr).addReg(FramePtr);
2536 --MBBI;
2538 } else if (NumBytes) {
2539 // Adjust stack pointer back: ESP += numbytes.
2540 emitSPUpdate(MBB, MBBI, DL, NumBytes, /*InEpilogue=*/true);
2541 if (!HasFP && NeedsDwarfCFI) {
2542 // Define the current CFA rule to use the provided offset.
2543 BuildCFI(MBB, MBBI, DL,
2544 MCCFIInstruction::cfiDefCfaOffset(
2545 nullptr, CSSize + TailCallArgReserveSize + SlotSize),
2546 MachineInstr::FrameDestroy);
2548 --MBBI;
2551 // Windows unwinder will not invoke function's exception handler if IP is
2552 // either in prologue or in epilogue. This behavior causes a problem when a
2553 // call immediately precedes an epilogue, because the return address points
2554 // into the epilogue. To cope with that, we insert an epilogue marker here,
2555 // then replace it with a 'nop' if it ends up immediately after a CALL in the
2556 // final emitted code.
2557 if (NeedsWin64CFI && MF.hasWinCFI())
2558 BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
2560 if (!HasFP && NeedsDwarfCFI) {
2561 MBBI = FirstCSPop;
2562 int64_t Offset = -(int64_t)CSSize - SlotSize;
2563 // Mark callee-saved pop instruction.
2564 // Define the current CFA rule to use the provided offset.
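// For example (hypothetical counts), with CSSize == 24 and SlotSize == 8,
// three successive pops produce .cfi_def_cfa_offset 24, then 16, then 8; a
// pop2 advances the offset by two slots at once.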
2565 while (MBBI != MBB.end()) {
2566 MachineBasicBlock::iterator PI = MBBI;
2567 unsigned Opc = PI->getOpcode();
2568 ++MBBI;
2569 if (Opc == X86::POP32r || Opc == X86::POP64r || Opc == X86::POPP64r ||
2570 Opc == X86::POP2 || Opc == X86::POP2P) {
2571 Offset += SlotSize;
2572 // Compared to pop, pop2 introduces more stack offset (one more
2573 // register).
2574 if (Opc == X86::POP2 || Opc == X86::POP2P)
2575 Offset += SlotSize;
2576 BuildCFI(MBB, MBBI, DL,
2577 MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset),
2578 MachineInstr::FrameDestroy);
2583 // Emit DWARF info specifying the restores of the callee-saved registers.
2584 // For epilogue with return inside or being other block without successor,
2585 // no need to generate .cfi_restore for callee-saved registers.
2586 if (NeedsDwarfCFI && !MBB.succ_empty())
2587 emitCalleeSavedFrameMoves(MBB, AfterPop, DL, false);
2589 if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
2590 // Add the return addr area delta back since we are not tail calling.
2591 int Offset = -1 * X86FI->getTCReturnAddrDelta();
2592 assert(Offset >= 0 && "TCDelta should never be positive");
2593 if (Offset) {
2594 // Check for possible merge with preceding ADD instruction.
2595 Offset += mergeSPUpdates(MBB, Terminator, true);
2596 emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
2600 // Emit tilerelease for AMX kernel.
2601 if (X86FI->getAMXProgModel() == AMXProgModelEnum::ManagedRA)
2602 BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
2605 StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
2606 int FI,
2607 Register &FrameReg) const {
2608 const MachineFrameInfo &MFI = MF.getFrameInfo();
2610 bool IsFixed = MFI.isFixedObjectIndex(FI);
2611 // We can't calculate offset from frame pointer if the stack is realigned,
2612 // so enforce usage of stack/base pointer. The base pointer is used when we
2613 // have dynamic allocas in addition to dynamic realignment.
2614 if (TRI->hasBasePointer(MF))
2615 FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getBaseRegister();
2616 else if (TRI->hasStackRealignment(MF))
2617 FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getStackRegister();
2618 else
2619 FrameReg = TRI->getFrameRegister(MF);
2621 // Offset will hold the offset from the stack pointer at function entry to the
2622 // object.
2623 // We need to factor in additional offsets applied during the prologue to the
2624 // frame, base, and stack pointer depending on which is used.
2625 int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
2626 const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
2627 unsigned CSSize = X86FI->getCalleeSavedFrameSize();
2628 uint64_t StackSize = MFI.getStackSize();
2629 bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
2630 int64_t FPDelta = 0;
2632 // In an x86 interrupt, remove the offset we added to account for the return
2633 // address from any stack object allocated in the caller's frame. Interrupts
2634 // do not have a standard return address. Fixed objects in the current frame,
2635 // such as SSE register spills, should not get this treatment.
2636 if (MF.getFunction().getCallingConv() == CallingConv::X86_INTR &&
2637 Offset >= 0) {
2638 Offset += getOffsetOfLocalArea();
2641 if (IsWin64Prologue) {
2642 assert(!MFI.hasCalls() || (StackSize % 16) == 8);
2644 // Calculate required stack adjustment.
2645 uint64_t FrameSize = StackSize - SlotSize;
2646 // If required, include space for extra hidden slot for stashing base
2647 // pointer.
2648 if (X86FI->getRestoreBasePointer())
2649 FrameSize += SlotSize;
2650 uint64_t NumBytes = FrameSize - CSSize;
2652 uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes);
2653 if (FI && FI == X86FI->getFAIndex())
2654 return StackOffset::getFixed(-SEHFrameOffset);
2656 // FPDelta is the offset from the "traditional" FP location of the old base
2657 // pointer followed by return address and the location required by the
2658 // restricted Win64 prologue.
2659 // Add FPDelta to all offsets below that go through the frame pointer.
2660 FPDelta = FrameSize - SEHFrameOffset;
2661 assert((!MFI.hasCalls() || (FPDelta % 16) == 0) &&
2662 "FPDelta isn't aligned per the Win64 ABI!");
2665 if (FrameReg == TRI->getFramePtr()) {
2666 // Skip saved EBP/RBP
2667 Offset += SlotSize;
2669 // Account for restricted Windows prologue.
2670 Offset += FPDelta;
2672 // Skip the RETADDR move area
2673 int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
2674 if (TailCallReturnAddrDelta < 0)
2675 Offset -= TailCallReturnAddrDelta;
2677 return StackOffset::getFixed(Offset);
2680 // FrameReg is either the stack pointer or a base pointer. But the base is
2681 // located at the end of the statically known StackSize so the distinction
2682 // doesn't really matter.
2683 if (TRI->hasStackRealignment(MF) || TRI->hasBasePointer(MF))
2684 assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
2685 return StackOffset::getFixed(Offset + StackSize);
2688 int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
2689 Register &FrameReg) const {
2690 const MachineFrameInfo &MFI = MF.getFrameInfo();
2691 const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
2692 const auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
2693 const auto it = WinEHXMMSlotInfo.find(FI);
2695 if (it == WinEHXMMSlotInfo.end())
2696 return getFrameIndexReference(MF, FI, FrameReg).getFixed();
2698 FrameReg = TRI->getStackRegister();
2699 return alignDown(MFI.getMaxCallFrameSize(), getStackAlign().value()) +
2700 it->second;
2703 StackOffset
2704 X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI,
2705 Register &FrameReg,
2706 int Adjustment) const {
2707 const MachineFrameInfo &MFI = MF.getFrameInfo();
2708 FrameReg = TRI->getStackRegister();
2709 return StackOffset::getFixed(MFI.getObjectOffset(FI) -
2710 getOffsetOfLocalArea() + Adjustment);
2713 StackOffset
2714 X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
2715 int FI, Register &FrameReg,
2716 bool IgnoreSPUpdates) const {
2718 const MachineFrameInfo &MFI = MF.getFrameInfo();
2719 // Does not include any dynamic realign.
2720 const uint64_t StackSize = MFI.getStackSize();
2721 // LLVM arranges the stack as follows:
2722 // ...
2723 // ARG2
2724 // ARG1
2725 // RETADDR
2726 // PUSH RBP <-- RBP points here
2727 // PUSH CSRs
2728 // ~~~~~~~ <-- possible stack realignment (non-win64)
2729 // ...
2730 // STACK OBJECTS
2731 // ... <-- RSP after prologue points here
2732 // ~~~~~~~ <-- possible stack realignment (win64)
2734 // if (hasVarSizedObjects()):
2735 // ... <-- "base pointer" (ESI/RBX) points here
2736 // DYNAMIC ALLOCAS
2737 // ... <-- RSP points here
2739 // Case 1: In the simple case of no stack realignment and no dynamic
2740 // allocas, both "fixed" stack objects (arguments and CSRs) are addressable
2741 // with fixed offsets from RSP.
2743 // Case 2: In the case of stack realignment with no dynamic allocas, fixed
2744 // stack objects are addressed with RBP and regular stack objects with RSP.
2746 // Case 3: In the case of dynamic allocas and stack realignment, RSP is used
2747 // to address stack arguments for outgoing calls and nothing else. The "base
2748 // pointer" points to local variables, and RBP points to fixed objects.
2750 // In cases 2 and 3, we can only answer for non-fixed stack objects, and the
2751 // answer we give is relative to the SP after the prologue, and not the
2752 // SP in the middle of the function.
2754 if (MFI.isFixedObjectIndex(FI) && TRI->hasStackRealignment(MF) &&
2755 !STI.isTargetWin64())
2756 return getFrameIndexReference(MF, FI, FrameReg);
2758 // If !hasReservedCallFrame the function might have SP adjustment in the
2759 // body. So, even though the offset is statically known, it depends on where
2760 // we are in the function.
2761 if (!IgnoreSPUpdates && !hasReservedCallFrame(MF))
2762 return getFrameIndexReference(MF, FI, FrameReg);
2764 // We don't handle tail calls, and shouldn't be seeing them either.
2765 assert(MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta() >= 0 &&
2766 "we don't handle this case!");
2768 // This is how the math works out:
2770 // %rsp grows (i.e. gets lower) left to right. Each box below is
2771 // one word (eight bytes). Obj0 is the stack slot we're trying to
2772 // get to.
2774 // ----------------------------------
2775 // | BP | Obj0 | Obj1 | ... | ObjN |
2776 // ----------------------------------
2777 // ^ ^ ^ ^
2778 // A B C E
2780 // A is the incoming stack pointer.
2781 // (B - A) is the local area offset (-8 for x86-64) [1]
2782 // (C - A) is the Offset returned by MFI.getObjectOffset for Obj0 [2]
2784 // |(E - B)| is the StackSize (absolute value, positive). For a
2785 // stack that grows down, this works out to be (B - E). [3]
2787 // E is also the value of %rsp after stack has been set up, and we
2788 // want (C - E) -- the value we can add to %rsp to get to Obj0. Now
2789 // (C - E) == (C - A) - (B - A) + (B - E)
2790 // { Using [1], [2] and [3] above }
2791 // == getObjectOffset - LocalAreaOffset + StackSize
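//
// Hypothetical numbers: getObjectOffset == -16, LocalAreaOffset == -8 and
// StackSize == 40 give -16 - (-8) + 40 == 32 as the offset from the
// post-prologue %rsp.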
2793 return getFrameIndexReferenceSP(MF, FI, FrameReg, StackSize);
2796 bool X86FrameLowering::assignCalleeSavedSpillSlots(
2797 MachineFunction &MF, const TargetRegisterInfo *TRI,
2798 std::vector<CalleeSavedInfo> &CSI) const {
2799 MachineFrameInfo &MFI = MF.getFrameInfo();
2800 X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
2802 unsigned CalleeSavedFrameSize = 0;
2803 unsigned XMMCalleeSavedFrameSize = 0;
2804 auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
2805 int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
2807 int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
2809 if (TailCallReturnAddrDelta < 0) {
2810 // create RETURNADDR area
2811 // arg
2812 // arg
2813 // RETADDR
2814 // { ...
2815 // RETADDR area
2816 // ...
2817 // }
2818 // [EBP]
2819 MFI.CreateFixedObject(-TailCallReturnAddrDelta,
2820 TailCallReturnAddrDelta - SlotSize, true);
2823 // Spill the BasePtr if it's used.
2824 if (this->TRI->hasBasePointer(MF)) {
2825 // Allocate a spill slot for EBP if we have a base pointer and EH funclets.
2826 if (MF.hasEHFunclets()) {
2827 int FI = MFI.CreateSpillStackObject(SlotSize, Align(SlotSize));
2828 X86FI->setHasSEHFramePtrSave(true);
2829 X86FI->setSEHFramePtrSaveIndex(FI);
2833 if (hasFP(MF)) {
2834 // emitPrologue always spills the frame register first.
2835 SpillSlotOffset -= SlotSize;
2836 MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
2838 // The async context lives directly before the frame pointer, and we
2839 // allocate a second slot to preserve stack alignment.
2840 if (X86FI->hasSwiftAsyncContext()) {
2841 SpillSlotOffset -= SlotSize;
2842 MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
2843 SpillSlotOffset -= SlotSize;
2846 // Since emitPrologue and emitEpilogue will handle spilling and restoring of
2847 // the frame register, we can delete it from the CSI list and not have to worry
2848 // about avoiding it later.
2849 Register FPReg = TRI->getFrameRegister(MF);
2850 for (unsigned i = 0; i < CSI.size(); ++i) {
2851 if (TRI->regsOverlap(CSI[i].getReg(), FPReg)) {
2852 CSI.erase(CSI.begin() + i);
2853 break;
2858 // Strategy:
2859 // 1. Use push2 when
2860 //    a) the number of CSRs > 1 if no padding is needed
2861 //    b) the number of CSRs > 2 if padding is needed
2862 // 2. When the number of CSR pushes is odd
2863 //    a. Start to use push2 from the 1st push if the stack is 16B aligned.
2864 //    b. Start to use push2 from the 2nd push if the stack is not 16B aligned.
2865 // 3. When the number of CSR pushes is even, start to use push2 from the 1st
2866 //    push and make the stack 16B aligned before the push
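// Illustration (hypothetical counts): with 3 GPR CSRs and a 16B-aligned
// spill offset no padding is needed, the first two pushes become one push2
// and the third stays a plain push; with 4 CSRs and a misaligned offset a
// padding slot is created first so all four can be paired.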
2867 unsigned NumRegsForPush2 = 0;
2868 if (STI.hasPush2Pop2()) {
2869 unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) {
2870 return X86::GR64RegClass.contains(I.getReg());
2872 bool NeedPadding = (SpillSlotOffset % 16 != 0) && (NumCSGPR % 2 == 0);
2873 bool UsePush2Pop2 = NeedPadding ? NumCSGPR > 2 : NumCSGPR > 1;
2874 X86FI->setPadForPush2Pop2(NeedPadding && UsePush2Pop2);
2875 NumRegsForPush2 = UsePush2Pop2 ? alignDown(NumCSGPR, 2) : 0;
2876 if (X86FI->padForPush2Pop2()) {
2877 SpillSlotOffset -= SlotSize;
2878 MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
2882 // Assign slots for GPRs. It increases frame size.
2883 for (CalleeSavedInfo &I : llvm::reverse(CSI)) {
2884 Register Reg = I.getReg();
2886 if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
2887 continue;
2889 // A CSR is a candidate for push2/pop2 when its slot offset is 16B aligned
2890 // or there is an odd number of registers already among the candidates.
2891 if (X86FI->getNumCandidatesForPush2Pop2() < NumRegsForPush2 &&
2892 (SpillSlotOffset % 16 == 0 ||
2893 X86FI->getNumCandidatesForPush2Pop2() % 2))
2894 X86FI->addCandidateForPush2Pop2(Reg);
2896 SpillSlotOffset -= SlotSize;
2897 CalleeSavedFrameSize += SlotSize;
2899 int SlotIndex = MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
2900 I.setFrameIdx(SlotIndex);
2903 // Adjust the offset of spill slot as we know the accurate callee saved frame
2904 // size.
2905 if (X86FI->getRestoreBasePointer()) {
2906 SpillSlotOffset -= SlotSize;
2907 CalleeSavedFrameSize += SlotSize;
2909 MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
2910 // TODO: saving the slot index is better?
2911 X86FI->setRestoreBasePointer(CalleeSavedFrameSize);
2913 assert(X86FI->getNumCandidatesForPush2Pop2() % 2 == 0 &&
2914 "Expect even candidates for push2/pop2");
2915 if (X86FI->getNumCandidatesForPush2Pop2())
2916 ++NumFunctionUsingPush2Pop2;
2917 X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
2918 MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize);
2920 // Assign slots for XMMs.
2921 for (CalleeSavedInfo &I : llvm::reverse(CSI)) {
2922 Register Reg = I.getReg();
2923 if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
2924 continue;
2926 // If this is k-register make sure we lookup via the largest legal type.
2927 MVT VT = MVT::Other;
2928 if (X86::VK16RegClass.contains(Reg))
2929 VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
2931 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
2932 unsigned Size = TRI->getSpillSize(*RC);
2933 Align Alignment = TRI->getSpillAlign(*RC);
2934 // ensure alignment
2935 assert(SpillSlotOffset < 0 && "SpillSlotOffset should always < 0 on X86");
2936 SpillSlotOffset = -alignTo(-SpillSlotOffset, Alignment);
2938 // spill into slot
2939 SpillSlotOffset -= Size;
2940 int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
2941 I.setFrameIdx(SlotIndex);
2942 MFI.ensureMaxAlignment(Alignment);
2944 // Save the start offset and size of XMM in stack frame for funclets.
2945 if (X86::VR128RegClass.contains(Reg)) {
2946 WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize;
2947 XMMCalleeSavedFrameSize += Size;
2951 return true;
2954 bool X86FrameLowering::spillCalleeSavedRegisters(
2955 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2956 ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2957 DebugLoc DL = MBB.findDebugLoc(MI);
2959 // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI
2960 // for us, and there are no XMM CSRs on Win32.
2961 if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows())
2962 return true;
2964 // Push GPRs. It increases frame size.
2965 const MachineFunction &MF = *MBB.getParent();
2966 const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
2967 if (X86FI->padForPush2Pop2())
2968 emitSPUpdate(MBB, MI, DL, -(int64_t)SlotSize, /*InEpilogue=*/false);
2970 // Update LiveIn of the basic block and decide whether we can add a kill flag
2971 // to the use.
2972 auto UpdateLiveInCheckCanKill = [&](Register Reg) {
2973 const MachineRegisterInfo &MRI = MF.getRegInfo();
2974 // Do not set a kill flag on values that are also marked as live-in. This
2975 // happens with the @llvm.returnaddress intrinsic and with arguments
2976 // passed in callee saved registers.
2977 // Omitting the kill flags is conservatively correct even if the live-in
2978 // is not used after all.
2979 if (MRI.isLiveIn(Reg))
2980 return false;
2981 MBB.addLiveIn(Reg);
2982 // Check if any subregister is live-in
2983 for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg)
2984 if (MRI.isLiveIn(*AReg))
2985 return false;
2986 return true;
2988 auto UpdateLiveInGetKillRegState = [&](Register Reg) {
2989 return getKillRegState(UpdateLiveInCheckCanKill(Reg));
2992 for (auto RI = CSI.rbegin(), RE = CSI.rend(); RI != RE; ++RI) {
2993 Register Reg = RI->getReg();
2994 if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
2995 continue;
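// push2/pop2 candidates were assigned adjacent spill slots, so when this
// register is a candidate the next CSI entry holds the second register of the
// pair (candidate counts are always even, see the assert in the spill-slot
// assignment above).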
2997 if (X86FI->isCandidateForPush2Pop2(Reg)) {
2998 Register Reg2 = (++RI)->getReg();
2999 BuildMI(MBB, MI, DL, TII.get(getPUSH2Opcode(STI)))
3000 .addReg(Reg, UpdateLiveInGetKillRegState(Reg))
3001 .addReg(Reg2, UpdateLiveInGetKillRegState(Reg2))
3002 .setMIFlag(MachineInstr::FrameSetup);
3003 } else {
3004 BuildMI(MBB, MI, DL, TII.get(getPUSHOpcode(STI)))
3005 .addReg(Reg, UpdateLiveInGetKillRegState(Reg))
3006 .setMIFlag(MachineInstr::FrameSetup);
3010 if (X86FI->getRestoreBasePointer()) {
3011 unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
3012 Register BaseReg = this->TRI->getBaseRegister();
3013 BuildMI(MBB, MI, DL, TII.get(Opc))
3014 .addReg(BaseReg, getKillRegState(true))
3015 .setMIFlag(MachineInstr::FrameSetup);
3018 // Spill the XMM regs explicitly: X86 has no push/pop for XMM registers,
3019 // so they are stored to slots in the stack frame instead.
3020 for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {
3021 Register Reg = I.getReg();
3022 if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
3023 continue;
3025 // If this is a k-register, make sure we look it up via the largest legal type.
3026 MVT VT = MVT::Other;
3027 if (X86::VK16RegClass.contains(Reg))
3028 VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
3030 // Add the callee-saved register as live-in. It's killed at the spill.
3031 MBB.addLiveIn(Reg);
3032 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
3034 TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI,
3035 Register());
3036 --MI;
3037 MI->setFlag(MachineInstr::FrameSetup);
3038 ++MI;
3041 return true;
3044 void X86FrameLowering::emitCatchRetReturnValue(MachineBasicBlock &MBB,
3045 MachineBasicBlock::iterator MBBI,
3046 MachineInstr *CatchRet) const {
3047 // SEH shouldn't use catchret.
3048 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
3049 MBB.getParent()->getFunction().getPersonalityFn())) &&
3050 "SEH should not use CATCHRET");
3051 const DebugLoc &DL = CatchRet->getDebugLoc();
3052 MachineBasicBlock *CatchRetTarget = CatchRet->getOperand(0).getMBB();
3054 // Fill EAX/RAX with the address of the target block.
3055 if (STI.is64Bit()) {
3056 // LEA64r CatchRetTarget(%rip), %rax
3057 BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), X86::RAX)
3058 .addReg(X86::RIP)
3059 .addImm(0)
3060 .addReg(0)
3061 .addMBB(CatchRetTarget)
3062 .addReg(0);
3063 } else {
3064 // MOV32ri $CatchRetTarget, %eax
3065 BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
3066 .addMBB(CatchRetTarget);
3069 // Record that we've taken the address of CatchRetTarget and no longer just
3070 // reference it in a terminator.
3071 CatchRetTarget->setMachineBlockAddressTaken();
3074 bool X86FrameLowering::restoreCalleeSavedRegisters(
3075 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
3076 MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
3077 if (CSI.empty())
3078 return false;
3080 if (MI != MBB.end() && isFuncletReturnInstr(*MI) && STI.isOSWindows()) {
3081 // Don't restore CSRs in 32-bit EH funclets. Matches
3082 // spillCalleeSavedRegisters.
3083 if (STI.is32Bit())
3084 return true;
3085 // Don't restore CSRs before an SEH catchret. SEH except blocks do not form
3086 // funclets. emitEpilogue transforms these to normal jumps.
3087 if (MI->getOpcode() == X86::CATCHRET) {
3088 const Function &F = MBB.getParent()->getFunction();
3089 bool IsSEH = isAsynchronousEHPersonality(
3090 classifyEHPersonality(F.getPersonalityFn()));
3091 if (IsSEH)
3092 return true;
3096 DebugLoc DL = MBB.findDebugLoc(MI);
3098 // Reload XMMs from stack frame.
3099 for (const CalleeSavedInfo &I : CSI) {
3100 Register Reg = I.getReg();
3101 if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
3102 continue;
3104 // If this is a k-register, make sure we look it up via the largest legal type.
3105 MVT VT = MVT::Other;
3106 if (X86::VK16RegClass.contains(Reg))
3107 VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
3109 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
3110 TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, TRI,
3111 Register());
3114 // Restore the base pointer from its spill slot (the pop also frees the slot).
3115 MachineFunction &MF = *MBB.getParent();
3116 const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
3117 if (X86FI->getRestoreBasePointer()) {
3118 unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
3119 Register BaseReg = this->TRI->getBaseRegister();
3120 BuildMI(MBB, MI, DL, TII.get(Opc), BaseReg)
3121 .setMIFlag(MachineInstr::FrameDestroy);
3124 // POP GPRs.
3125 for (auto I = CSI.begin(), E = CSI.end(); I != E; ++I) {
3126 Register Reg = I->getReg();
3127 if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
3128 continue;
3130 if (X86FI->isCandidateForPush2Pop2(Reg))
3131 BuildMI(MBB, MI, DL, TII.get(getPOP2Opcode(STI)), Reg)
3132 .addReg((++I)->getReg(), RegState::Define)
3133 .setMIFlag(MachineInstr::FrameDestroy);
3134 else
3135 BuildMI(MBB, MI, DL, TII.get(getPOPOpcode(STI)), Reg)
3136 .setMIFlag(MachineInstr::FrameDestroy);
3138 if (X86FI->padForPush2Pop2())
3139 emitSPUpdate(MBB, MI, DL, SlotSize, /*InEpilogue=*/true);
3141 return true;
3144 void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
3145 BitVector &SavedRegs,
3146 RegScavenger *RS) const {
3147 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
3149 // Spill the BasePtr if it's used.
3150 if (TRI->hasBasePointer(MF)) {
3151 Register BasePtr = TRI->getBaseRegister();
3152 if (STI.isTarget64BitILP32())
3153 BasePtr = getX86SubSuperRegister(BasePtr, 64);
3154 SavedRegs.set(BasePtr);
3158 static bool HasNestArgument(const MachineFunction *MF) {
3159 const Function &F = MF->getFunction();
3160 for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
3161 I++) {
3162 if (I->hasNestAttr() && !I->use_empty())
3163 return true;
3165 return false;
3168 /// GetScratchRegister - Get a temp register for performing work in the
3169 /// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
3170 /// and the properties of the function either one or two registers will be
3171 /// needed. Set primary to true for the first register, false for the second.
3172 static unsigned GetScratchRegister(bool Is64Bit, bool IsLP64,
3173 const MachineFunction &MF, bool Primary) {
3174 CallingConv::ID CallingConvention = MF.getFunction().getCallingConv();
3176 // Erlang stuff.
3177 if (CallingConvention == CallingConv::HiPE) {
3178 if (Is64Bit)
3179 return Primary ? X86::R14 : X86::R13;
3180 else
3181 return Primary ? X86::EBX : X86::EDI;
3184 if (Is64Bit) {
3185 if (IsLP64)
3186 return Primary ? X86::R11 : X86::R12;
3187 else
3188 return Primary ? X86::R11D : X86::R12D;
3191 bool IsNested = HasNestArgument(&MF);
3193 if (CallingConvention == CallingConv::X86_FastCall ||
3194 CallingConvention == CallingConv::Fast ||
3195 CallingConvention == CallingConv::Tail) {
3196 if (IsNested)
3197 report_fatal_error("Segmented stacks do not support fastcall with "
3198 "nested functions.");
3199 return Primary ? X86::EAX : X86::ECX;
3201 if (IsNested)
3202 return Primary ? X86::EDX : X86::EAX;
3203 return Primary ? X86::ECX : X86::EAX;
3206 // The stack limit in the TCB is set to this many bytes above the actual stack
3207 // limit.
3208 static const uint64_t kSplitStackAvailable = 256;
3210 void X86FrameLowering::adjustForSegmentedStacks(
3211 MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
3212 MachineFrameInfo &MFI = MF.getFrameInfo();
3213 uint64_t StackSize;
3214 unsigned TlsReg, TlsOffset;
3215 DebugLoc DL;
3217 // To support shrink-wrapping we would need to insert the new blocks
3218 // at the right place and update the branches to PrologueMBB.
3219 assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
3221 unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
3222 assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
3223 "Scratch register is live-in");
3225 if (MF.getFunction().isVarArg())
3226 report_fatal_error("Segmented stacks do not support vararg functions.");
3227 if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
3228 !STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
3229 !STI.isTargetDragonFly())
3230 report_fatal_error("Segmented stacks not supported on this platform.");
3232 // Eventually StackSize will be calculated by a link-time pass, which will
3233 // also decide whether checking code needs to be injected into this particular
3234 // prologue.
3235 StackSize = MFI.getStackSize();
3237 if (!MFI.needsSplitStackProlog())
3238 return;
3240 MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
3241 MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
3242 X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
3243 bool IsNested = false;
3245 // We need to know if the function has a nest argument only in 64 bit mode.
3246 if (Is64Bit)
3247 IsNested = HasNestArgument(&MF);
3249 // The MOV R10, RAX needs to be in a different block, since the RET we emit in
3250 // allocMBB needs to be the last (terminating) instruction.
3252 for (const auto &LI : PrologueMBB.liveins()) {
3253 allocMBB->addLiveIn(LI);
3254 checkMBB->addLiveIn(LI);
3257 if (IsNested)
3258 allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);
3260 MF.push_front(allocMBB);
3261 MF.push_front(checkMBB);
3263 // When the frame size is less than 256 we just compare the stack
3264 // boundary directly to the value of the stack pointer, per gcc.
3265 bool CompareStackPointer = StackSize < kSplitStackAvailable;
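// The check emitted into checkMBB is roughly (64-bit Linux, illustrative):
//   leaq -StackSize(%rsp), %r11   ; skipped when CompareStackPointer
//   cmpq %fs:0x70, %r11
//   ja   PrologueMBB              ; enough stack, run the normal prologue
// otherwise control falls through to allocMBB, which calls __morestack.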
3267 // Read the current stacklet's limit from the stack_guard location.
3268 if (Is64Bit) {
3269 if (STI.isTargetLinux()) {
3270 TlsReg = X86::FS;
3271 TlsOffset = IsLP64 ? 0x70 : 0x40;
3272 } else if (STI.isTargetDarwin()) {
3273 TlsReg = X86::GS;
3274 TlsOffset = 0x60 + 90 * 8; // See pthread_machdep.h. Steal TLS slot 90.
3275 } else if (STI.isTargetWin64()) {
3276 TlsReg = X86::GS;
3277 TlsOffset = 0x28; // pvArbitrary, reserved for application use
3278 } else if (STI.isTargetFreeBSD()) {
3279 TlsReg = X86::FS;
3280 TlsOffset = 0x18;
3281 } else if (STI.isTargetDragonFly()) {
3282 TlsReg = X86::FS;
3283 TlsOffset = 0x20; // use tls_tcb.tcb_segstack
3284 } else {
3285 report_fatal_error("Segmented stacks not supported on this platform.");
3288 if (CompareStackPointer)
3289 ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
3290 else
3291 BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r),
3292 ScratchReg)
3293 .addReg(X86::RSP)
3294 .addImm(1)
3295 .addReg(0)
3296 .addImm(-StackSize)
3297 .addReg(0);
3299 BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm))
3300 .addReg(ScratchReg)
3301 .addReg(0)
3302 .addImm(1)
3303 .addReg(0)
3304 .addImm(TlsOffset)
3305 .addReg(TlsReg);
3306 } else {
3307 if (STI.isTargetLinux()) {
3308 TlsReg = X86::GS;
3309 TlsOffset = 0x30;
3310 } else if (STI.isTargetDarwin()) {
3311 TlsReg = X86::GS;
3312 TlsOffset = 0x48 + 90 * 4;
3313 } else if (STI.isTargetWin32()) {
3314 TlsReg = X86::FS;
3315 TlsOffset = 0x14; // pvArbitrary, reserved for application use
3316 } else if (STI.isTargetDragonFly()) {
3317 TlsReg = X86::FS;
3318 TlsOffset = 0x10; // use tls_tcb.tcb_segstack
3319 } else if (STI.isTargetFreeBSD()) {
3320 report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
3321 } else {
3322 report_fatal_error("Segmented stacks not supported on this platform.");
3325 if (CompareStackPointer)
3326 ScratchReg = X86::ESP;
3327 else
3328 BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg)
3329 .addReg(X86::ESP)
3330 .addImm(1)
3331 .addReg(0)
3332 .addImm(-StackSize)
3333 .addReg(0);
3335 if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
3336 STI.isTargetDragonFly()) {
3337 BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
3338 .addReg(ScratchReg)
3339 .addReg(0)
3340 .addImm(0)
3341 .addReg(0)
3342 .addImm(TlsOffset)
3343 .addReg(TlsReg);
3344 } else if (STI.isTargetDarwin()) {
3346 // TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
3347 unsigned ScratchReg2;
3348 bool SaveScratch2;
3349 if (CompareStackPointer) {
3350 // The primary scratch register is available for holding the TLS offset.
3351 ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
3352 SaveScratch2 = false;
3353 } else {
3354 // Need to use a second register to hold the TLS offset
3355 ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);
3357 // Unfortunately, with fastcc the second scratch register may hold an
3358 // argument.
3359 SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
3362 // If Scratch2 is live-in then it needs to be saved.
3363 assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
3364 "Scratch register is live-in and not saved");
3366 if (SaveScratch2)
3367 BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
3368 .addReg(ScratchReg2, RegState::Kill);
3370 BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
3371 .addImm(TlsOffset);
3372 BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
3373 .addReg(ScratchReg)
3374 .addReg(ScratchReg2)
3375 .addImm(1)
3376 .addReg(0)
3377 .addImm(0)
3378 .addReg(TlsReg);
3380 if (SaveScratch2)
3381 BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
3385 // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
3386 // It jumps to normal execution of the function body.
3387 BuildMI(checkMBB, DL, TII.get(X86::JCC_1))
3388 .addMBB(&PrologueMBB)
3389 .addImm(X86::COND_A);
3391 // On 32-bit we first push the argument size and then the frame size. On
3392 // 64-bit, we pass the stack frame size in r10 and the argument size in r11.
3393 if (Is64Bit) {
3394 // Functions with nested arguments use R10, so it needs to be saved across
3395 // the call to __morestack.
3397 const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
3398 const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
3399 const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
3400 const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
3402 if (IsNested)
3403 BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);
3405 BuildMI(allocMBB, DL, TII.get(getMOVriOpcode(IsLP64, StackSize)), Reg10)
3406 .addImm(StackSize);
3407 BuildMI(allocMBB, DL,
3408 TII.get(getMOVriOpcode(IsLP64, X86FI->getArgumentStackSize())),
3409 Reg11)
3410 .addImm(X86FI->getArgumentStackSize());
3411 } else {
3412 BuildMI(allocMBB, DL, TII.get(X86::PUSH32i))
3413 .addImm(X86FI->getArgumentStackSize());
3414 BuildMI(allocMBB, DL, TII.get(X86::PUSH32i)).addImm(StackSize);
3417 // __morestack is in libgcc
3418 if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
3419 // Under the large code model, we cannot assume that __morestack lives
3420 // within 2^31 bytes of the call site, so we cannot use pc-relative
3421 // addressing. We cannot perform the call via a temporary register,
3422 // as the rax register may be used to store the static chain, and all
3423 // other suitable registers may be either callee-save or used for
3424 // parameter passing. We cannot use the stack at this point either
3425 // because __morestack manipulates the stack directly.
3427 // To avoid these issues, perform an indirect call via a read-only memory
3428 // location containing the address.
3430 // This solution is not perfect, as it assumes that the .rodata section
3431 // is laid out within 2^31 bytes of each function body, but this seems
3432 // to be sufficient for JIT.
3433 // FIXME: Add retpoline support and remove the error here.
3434 if (STI.useIndirectThunkCalls())
3435 report_fatal_error("Emitting morestack calls on 64-bit with the large "
3436 "code model and thunks not yet implemented.");
3437 BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
3438 .addReg(X86::RIP)
3439 .addImm(0)
3440 .addReg(0)
3441 .addExternalSymbol("__morestack_addr")
3442 .addReg(0);
3443 } else {
3444 if (Is64Bit)
3445 BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
3446 .addExternalSymbol("__morestack");
3447 else
3448 BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
3449 .addExternalSymbol("__morestack");
3452 if (IsNested)
3453 BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
3454 else
3455 BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));
3457 allocMBB->addSuccessor(&PrologueMBB);
3459 checkMBB->addSuccessor(allocMBB, BranchProbability::getZero());
3460 checkMBB->addSuccessor(&PrologueMBB, BranchProbability::getOne());
3462 #ifdef EXPENSIVE_CHECKS
3463 MF.verify();
3464 #endif
3467 /// Lookup an ERTS parameter in the !hipe.literals named metadata node.
3468 /// HiPE provides Erlang Runtime System-internal parameters, such as PCB offsets
3469 /// to fields it needs, through a named metadata node "hipe.literals" containing
3470 /// name-value pairs.
3471 static unsigned getHiPELiteral(NamedMDNode *HiPELiteralsMD,
3472 const StringRef LiteralName) {
3473 for (int i = 0, e = HiPELiteralsMD->getNumOperands(); i != e; ++i) {
3474 MDNode *Node = HiPELiteralsMD->getOperand(i);
3475 if (Node->getNumOperands() != 2)
3476 continue;
3477 MDString *NodeName = dyn_cast<MDString>(Node->getOperand(0));
3478 ValueAsMetadata *NodeVal = dyn_cast<ValueAsMetadata>(Node->getOperand(1));
3479 if (!NodeName || !NodeVal)
3480 continue;
3481 ConstantInt *ValConst = dyn_cast_or_null<ConstantInt>(NodeVal->getValue());
3482 if (ValConst && NodeName->getString() == LiteralName) {
3483 return ValConst->getZExtValue();
3487 report_fatal_error("HiPE literal " + LiteralName +
3488 " required but not provided");
3491 // Return true if there are no non-ehpad successors to MBB and there are no
3492 // non-meta instructions between MBBI and MBB.end().
3493 static bool blockEndIsUnreachable(const MachineBasicBlock &MBB,
3494 MachineBasicBlock::const_iterator MBBI) {
3495 return llvm::all_of(
3496 MBB.successors(),
3497 [](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) &&
3498 std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) {
3499 return MI.isMetaInstruction();
3503 /// Erlang programs may need a special prologue to handle the stack size they
3504 /// might need at runtime. That is because Erlang/OTP does not implement a C
3505 // stack but uses a custom implementation of a hybrid stack/heap architecture.
3506 /// (for more information see Eric Stenman's Ph.D. thesis:
3507 /// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
3509 /// CheckStack:
3510 /// temp0 = sp - MaxStack
3511 /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
3512 /// OldStart:
3513 /// ...
3514 /// IncStack:
3515 /// call inc_stack # doubles the stack space
3516 /// temp0 = sp - MaxStack
3517 /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
3518 void X86FrameLowering::adjustForHiPEPrologue(
3519 MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
3520 MachineFrameInfo &MFI = MF.getFrameInfo();
3521 DebugLoc DL;
3523 // To support shrink-wrapping we would need to insert the new blocks
3524 // at the right place and update the branches to PrologueMBB.
3525 assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
3527 // HiPE-specific values
3528 NamedMDNode *HiPELiteralsMD =
3529 MF.getFunction().getParent()->getNamedMetadata("hipe.literals");
3530 if (!HiPELiteralsMD)
3531 report_fatal_error(
3532 "Can't generate HiPE prologue without runtime parameters");
3533 const unsigned HipeLeafWords = getHiPELiteral(
3534 HiPELiteralsMD, Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS");
3535 const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
3536 const unsigned Guaranteed = HipeLeafWords * SlotSize;
3537 unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs
3538 ? MF.getFunction().arg_size() - CCRegisteredArgs
3539 : 0;
3540 unsigned MaxStack = MFI.getStackSize() + CallerStkArity * SlotSize + SlotSize;
3542 assert(STI.isTargetLinux() &&
3543 "HiPE prologue is only supported on Linux operating systems.");
3545 // Compute the largest caller's frame that is needed to fit the callees'
3546 // frames. This 'MaxStack' is computed from:
3548 // a) the fixed frame size, which is the space needed for all spilled temps,
3549 // b) outgoing on-stack parameter areas, and
3550 // c) the minimum stack space this function needs to make available for the
3551 // functions it calls (a tunable ABI property).
3552 if (MFI.hasCalls()) {
3553 unsigned MoreStackForCalls = 0;
3555 for (auto &MBB : MF) {
3556 for (auto &MI : MBB) {
3557 if (!MI.isCall())
3558 continue;
3560 // Get callee operand.
3561 const MachineOperand &MO = MI.getOperand(0);
3563 // Only take account of global function calls (no closures etc.).
3564 if (!MO.isGlobal())
3565 continue;
3567 const Function *F = dyn_cast<Function>(MO.getGlobal());
3568 if (!F)
3569 continue;
3571 // Do not update 'MaxStack' for primitive and built-in functions
3572 // (i.e. names that start with "erlang." or "bif_", or that contain neither
3573 // a "." (as a regular <Module>.<Function>.<Arity> symbol would) nor an
3574 // "_" (as the BIF "suspend_0" does)), as they are executed on another
3575 // stack.
3576 if (F->getName().contains("erlang.") || F->getName().contains("bif_") ||
3577 F->getName().find_first_of("._") == StringRef::npos)
3578 continue;
3580 unsigned CalleeStkArity = F->arg_size() > CCRegisteredArgs
3581 ? F->arg_size() - CCRegisteredArgs
3582 : 0;
3583 if (HipeLeafWords - 1 > CalleeStkArity)
3584 MoreStackForCalls =
3585 std::max(MoreStackForCalls,
3586 (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
3589 MaxStack += MoreStackForCalls;
3592 // If the stack frame needed is larger than the guaranteed size, then runtime checks
3593 // and calls to the "inc_stack_0" BIF should be inserted in the assembly prologue.
3594 if (MaxStack > Guaranteed) {
3595 MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
3596 MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
3598 for (const auto &LI : PrologueMBB.liveins()) {
3599 stackCheckMBB->addLiveIn(LI);
3600 incStackMBB->addLiveIn(LI);
3603 MF.push_front(incStackMBB);
3604 MF.push_front(stackCheckMBB);
3606 unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
3607 unsigned LEAop, CMPop, CALLop;
3608 SPLimitOffset = getHiPELiteral(HiPELiteralsMD, "P_NSP_LIMIT");
3609 if (Is64Bit) {
3610 SPReg = X86::RSP;
3611 PReg = X86::RBP;
3612 LEAop = X86::LEA64r;
3613 CMPop = X86::CMP64rm;
3614 CALLop = X86::CALL64pcrel32;
3615 } else {
3616 SPReg = X86::ESP;
3617 PReg = X86::EBP;
3618 LEAop = X86::LEA32r;
3619 CMPop = X86::CMP32rm;
3620 CALLop = X86::CALLpcrel32;
3623 ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
3624 assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
3625 "HiPE prologue scratch register is live-in");
3627 // Create new MBB for StackCheck:
3628 addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), SPReg,
3629 false, -MaxStack);
3630 // SPLimitOffset is in a fixed heap location (pointed by BP).
3631 addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)).addReg(ScratchReg),
3632 PReg, false, SPLimitOffset);
3633 BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1))
3634 .addMBB(&PrologueMBB)
3635 .addImm(X86::COND_AE);
3637 // Create new MBB for IncStack:
3638 BuildMI(incStackMBB, DL, TII.get(CALLop)).addExternalSymbol("inc_stack_0");
3639 addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), SPReg,
3640 false, -MaxStack);
3641 addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)).addReg(ScratchReg),
3642 PReg, false, SPLimitOffset);
3643 BuildMI(incStackMBB, DL, TII.get(X86::JCC_1))
3644 .addMBB(incStackMBB)
3645 .addImm(X86::COND_LE);
3647 stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
3648 stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
3649 incStackMBB->addSuccessor(&PrologueMBB, {99, 100});
3650 incStackMBB->addSuccessor(incStackMBB, {1, 100});
3652 #ifdef EXPENSIVE_CHECKS
3653 MF.verify();
3654 #endif
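// Replace a small post-call SP adjustment with pops into dead registers when
// minimizing size, e.g. (64-bit, illustrative):
//   call foo ; addq $16, %rsp   ==>   call foo ; popq %rcx ; popq %rdx
// The popped registers are clobbered by the call and not defined by it, so
// their contents are dead and the pops only serve to move SP.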
3657 bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
3658 MachineBasicBlock::iterator MBBI,
3659 const DebugLoc &DL,
3660 int Offset) const {
3661 if (Offset <= 0)
3662 return false;
3664 if (Offset % SlotSize)
3665 return false;
3667 int NumPops = Offset / SlotSize;
3668 // This is only worth it if we have at most 2 pops.
3669 if (NumPops != 1 && NumPops != 2)
3670 return false;
3672 // Handle only the trivial case where the adjustment directly follows
3673 // a call. This is the most common one, anyway.
3674 if (MBBI == MBB.begin())
3675 return false;
3676 MachineBasicBlock::iterator Prev = std::prev(MBBI);
3677 if (!Prev->isCall() || !Prev->getOperand(1).isRegMask())
3678 return false;
3680 unsigned Regs[2];
3681 unsigned FoundRegs = 0;
3683 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3684 const MachineOperand &RegMask = Prev->getOperand(1);
3686 auto &RegClass =
3687 Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass;
3688 // Try to find up to NumPops free registers.
3689 for (auto Candidate : RegClass) {
3690 // Poor man's liveness:
3691 // Since we're immediately after a call, any register that is clobbered
3692 // by the call and not defined by it can be considered dead.
3693 if (!RegMask.clobbersPhysReg(Candidate))
3694 continue;
3696 // Don't clobber reserved registers
3697 if (MRI.isReserved(Candidate))
3698 continue;
3700 bool IsDef = false;
3701 for (const MachineOperand &MO : Prev->implicit_operands()) {
3702 if (MO.isReg() && MO.isDef() &&
3703 TRI->isSuperOrSubRegisterEq(MO.getReg(), Candidate)) {
3704 IsDef = true;
3705 break;
3709 if (IsDef)
3710 continue;
3712 Regs[FoundRegs++] = Candidate;
3713 if (FoundRegs == (unsigned)NumPops)
3714 break;
3717 if (FoundRegs == 0)
3718 return false;
3720 // If we found only one free register, but need two, reuse the same one twice.
3721 while (FoundRegs < (unsigned)NumPops)
3722 Regs[FoundRegs++] = Regs[0];
3724 for (int i = 0; i < NumPops; ++i)
3725 BuildMI(MBB, MBBI, DL, TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r),
3726 Regs[i]);
3728 return true;
3731 MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
3732 MachineFunction &MF, MachineBasicBlock &MBB,
3733 MachineBasicBlock::iterator I) const {
3734 bool reserveCallFrame = hasReservedCallFrame(MF);
3735 unsigned Opcode = I->getOpcode();
3736 bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
3737 DebugLoc DL = I->getDebugLoc(); // copy DebugLoc as I will be erased.
3738 uint64_t Amount = TII.getFrameSize(*I);
3739 uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0;
3740 I = MBB.erase(I);
3741 auto InsertPos = skipDebugInstructionsForward(I, MBB.end());
3743 // Try to avoid emitting dead SP adjustments if the block end is unreachable,
3744 // typically because the function is marked noreturn (abort, throw,
3745 // assert_fail, etc).
3746 if (isDestroy && blockEndIsUnreachable(MBB, I))
3747 return I;
3749 if (!reserveCallFrame) {
3750 // If the stack pointer can be changed after prologue, turn the
3751 // adjcallstackdown instruction into a 'sub ESP, <amt>' and the
3752 // adjcallstackup instruction into an 'add ESP, <amt>'.
3754 // We need to keep the stack aligned properly. To do this, we round the
3755 // amount of space needed for the outgoing arguments up to the next
3756 // alignment boundary.
3757 Amount = alignTo(Amount, getStackAlign());
3759 const Function &F = MF.getFunction();
3760 bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
3761 bool DwarfCFI = !WindowsCFI && MF.needsFrameMoves();
3763 // If we have any exception handlers in this function, and we adjust
3764 // the SP before calls, we may need to indicate this to the unwinder
3765 // using GNU_ARGS_SIZE. Note that this may be necessary even when
3766 // Amount == 0, because the preceding function may have set a non-0
3767 // GNU_ARGS_SIZE.
3768 // TODO: We don't need to reset this between subsequent functions,
3769 // if it didn't change.
3770 bool HasDwarfEHHandlers = !WindowsCFI && !MF.getLandingPads().empty();
3772 if (HasDwarfEHHandlers && !isDestroy &&
3773 MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
3774 BuildCFI(MBB, InsertPos, DL,
3775 MCCFIInstruction::createGnuArgsSize(nullptr, Amount));
3777 if (Amount == 0)
3778 return I;
3780 // Factor out the amount that gets handled inside the sequence
3781 // (Pushes of argument for frame setup, callee pops for frame destroy)
3782 Amount -= InternalAmt;
3784 // TODO: This is needed only if we require precise CFA.
3785 // If this is a callee-pop calling convention, emit a CFA adjust for
3786 // the amount the callee popped.
3787 if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
3788 BuildCFI(MBB, InsertPos, DL,
3789 MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));
3791 // Add Amount to SP to destroy a frame, or subtract to setup.
3792 int64_t StackAdjustment = isDestroy ? Amount : -Amount;
3794 if (StackAdjustment) {
3795 // Merge with any previous or following adjustment instruction. Note: the
3796 // instructions merged with here do not have CFI, so their stack
3797 // adjustments do not feed into CfaAdjustment.
3798 StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
3799 StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);
3801 if (StackAdjustment) {
3802 if (!(F.hasMinSize() &&
3803 adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment)))
3804 BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment,
3805 /*InEpilogue=*/false);
3809 if (DwarfCFI && !hasFP(MF)) {
3810 // If we don't have FP, but need to generate unwind information,
3811 // we need to set the correct CFA offset after the stack adjustment.
3812 // How much we adjust the CFA offset depends on whether we're emitting
3813 // CFI only for EH purposes or for debugging. EH only requires the CFA
3814 // offset to be correct at each call site, while for debugging we want
3815 // it to be more precise.
3817 int64_t CfaAdjustment = -StackAdjustment;
3818 // TODO: When not using precise CFA, we also need to adjust for the
3819 // InternalAmt here.
3820 if (CfaAdjustment) {
3821 BuildCFI(
3822 MBB, InsertPos, DL,
3823 MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment));
3827 return I;
3830 if (InternalAmt) {
3831 MachineBasicBlock::iterator CI = I;
3832 MachineBasicBlock::iterator B = MBB.begin();
3833 while (CI != B && !std::prev(CI)->isCall())
3834 --CI;
3835 BuildStackAdjustment(MBB, CI, DL, -InternalAmt, /*InEpilogue=*/false);
3838 return I;
3841 bool X86FrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
3842 assert(MBB.getParent() && "Block is not attached to a function!");
3843 const MachineFunction &MF = *MBB.getParent();
3844 if (!MBB.isLiveIn(X86::EFLAGS))
3845 return true;
3847 // If stack probes have to loop inline or call, that will clobber EFLAGS.
3848 // FIXME: we could allow cases that will use emitStackProbeInlineGenericBlock.
3849 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
3850 const X86TargetLowering &TLI = *STI.getTargetLowering();
3851 if (TLI.hasInlineStackProbe(MF) || TLI.hasStackProbeSymbol(MF))
3852 return false;
3854 const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
3855 return !TRI->hasStackRealignment(MF) && !X86FI->hasSwiftAsyncContext();
3858 bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
3859 assert(MBB.getParent() && "Block is not attached to a function!");
3861 // Win64 has strict requirements in terms of epilogue and we are
3862 // not taking a chance at messing with them.
3863 // I.e., unless this block is already an exit block, we can't use
3864 // it as an epilogue.
3865 if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock())
3866 return false;
3868 // Swift async context epilogue has a BTR instruction that clobbers parts of
3869 // EFLAGS.
3870 const MachineFunction &MF = *MBB.getParent();
3871 if (MF.getInfo<X86MachineFunctionInfo>()->hasSwiftAsyncContext())
3872 return !flagsNeedToBePreservedBeforeTheTerminators(MBB);
3874 if (canUseLEAForSPInEpilogue(*MBB.getParent()))
3875 return true;
3877 // If we cannot use LEA to adjust SP, we may need to use ADD, which
3878 // clobbers EFLAGS. Check that we do not need to preserve it;
3879 // otherwise, conservatively assume it is not
3880 // safe to insert the epilogue here.
3881 return !flagsNeedToBePreservedBeforeTheTerminators(MBB);
3884 bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
3885 // If we may need to emit frameless compact unwind information, give
3886 // up as this is currently broken: PR25614.
3887 bool CompactUnwind =
3888 MF.getContext().getObjectFileInfo()->getCompactUnwindSection() != nullptr;
3889 return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF) ||
3890 !CompactUnwind) &&
3891 // The lowering of segmented stack and HiPE only support entry
3892 // blocks as prologue blocks: PR26107. This limitation may be
3893 // lifted if we fix:
3894 // - adjustForSegmentedStacks
3895 // - adjustForHiPEPrologue
3896 MF.getFunction().getCallingConv() != CallingConv::HiPE &&
3897 !MF.shouldSplitStack();
3900 MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
3901 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
3902 const DebugLoc &DL, bool RestoreSP) const {
3903 assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env");
3904 assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32");
3905 assert(STI.is32Bit() && !Uses64BitFramePtr &&
3906 "restoring EBP/ESI on non-32-bit target");
3908 MachineFunction &MF = *MBB.getParent();
3909 Register FramePtr = TRI->getFrameRegister(MF);
3910 Register BasePtr = TRI->getBaseRegister();
3911 WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
3912 X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
3913 MachineFrameInfo &MFI = MF.getFrameInfo();
3915 // FIXME: Don't set FrameSetup flag in catchret case.
3917 int FI = FuncInfo.EHRegNodeFrameIndex;
3918 int EHRegSize = MFI.getObjectSize(FI);
3920 if (RestoreSP) {
3921 // MOV32rm -EHRegSize(%ebp), %esp
3922 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP),
3923 X86::EBP, true, -EHRegSize)
3924 .setMIFlag(MachineInstr::FrameSetup);
3927 Register UsedReg;
3928 int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed();
3929 int EndOffset = -EHRegOffset - EHRegSize;
3930 FuncInfo.EHRegNodeEndOffset = EndOffset;
3932 if (UsedReg == FramePtr) {
3933 // ADD $offset, %ebp
3934 unsigned ADDri = getADDriOpcode(false);
3935 BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr)
3936 .addReg(FramePtr)
3937 .addImm(EndOffset)
3938 .setMIFlag(MachineInstr::FrameSetup)
3939 ->getOperand(3)
3940 .setIsDead();
3941 assert(EndOffset >= 0 &&
3942 "end of registration object above normal EBP position!");
3943 } else if (UsedReg == BasePtr) {
3944 // LEA offset(%ebp), %esi
3945 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr),
3946 FramePtr, false, EndOffset)
3947 .setMIFlag(MachineInstr::FrameSetup);
3948 // MOV32rm SavedEBPOffset(%esi), %ebp
3949 assert(X86FI->getHasSEHFramePtrSave());
3950 int Offset =
3951 getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
3952 .getFixed();
3953 assert(UsedReg == BasePtr);
3954 addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr),
3955 UsedReg, true, Offset)
3956 .setMIFlag(MachineInstr::FrameSetup);
3957 } else {
3958 llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr");
3960 return MBBI;
3963 int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
3964 return TRI->getSlotSize();
3967 Register
3968 X86FrameLowering::getInitialCFARegister(const MachineFunction &MF) const {
3969 return StackPtr;
3972 TargetFrameLowering::DwarfFrameBase
3973 X86FrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
3974 const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
3975 Register FrameRegister = RI->getFrameRegister(MF);
3976 if (getInitialCFARegister(MF) == FrameRegister &&
3977 MF.getInfo<X86MachineFunctionInfo>()->hasCFIAdjustCfa()) {
3978 DwarfFrameBase FrameBase;
3979 FrameBase.Kind = DwarfFrameBase::CFA;
3980 FrameBase.Location.Offset =
3981 -MF.getFrameInfo().getStackSize() - getInitialCFAOffset(MF);
3982 return FrameBase;
3985 return DwarfFrameBase{DwarfFrameBase::Register, {FrameRegister}};
3988 namespace {
3989 // Struct used by orderFrameObjects to help sort the stack objects.
3990 struct X86FrameSortingObject {
3991 bool IsValid = false; // true if we care about this Object.
3992 unsigned ObjectIndex = 0; // Index of Object into MFI list.
3993 unsigned ObjectSize = 0; // Size of Object in bytes.
3994 Align ObjectAlignment = Align(1); // Alignment of Object in bytes.
3995 unsigned ObjectNumUses = 0; // Object static number of uses.
3998 // The comparison function we use for std::sort to order our local
3999 // stack symbols. The current algorithm is to use an estimated
4000 // "density". This takes into consideration the size and number of
4001 // uses each object has in order to roughly minimize code size.
4002 // So, for example, an object of size 16B that is referenced 5 times
4003 // will get higher priority than 4 4B objects referenced 1 time each.
4004 // It's not perfect and we may be able to squeeze a few more bytes out of
4005 // it (for example : 0(esp) requires fewer bytes, symbols allocated at the
4006 // fringe end can have special consideration, given their size is less
4007 // important, etc.), but the algorithmic complexity grows too much to be
4008 // worth the extra gains we get. This gets us pretty close.
4009 // The final order leaves us with objects with highest priority going
4010 // at the end of our list.
4011 struct X86FrameSortingComparator {
4012 inline bool operator()(const X86FrameSortingObject &A,
4013 const X86FrameSortingObject &B) const {
4014 uint64_t DensityAScaled, DensityBScaled;
4016 // For consistency in our comparison, all invalid objects are placed
4017 // at the end. This also allows us to stop walking when we hit the
4018 // first invalid item after it's all sorted.
4019 if (!A.IsValid)
4020 return false;
4021 if (!B.IsValid)
4022 return true;
4024 // The density is calculated by doing:
4025 // (double)DensityA = A.ObjectNumUses / A.ObjectSize
4026 // (double)DensityB = B.ObjectNumUses / B.ObjectSize
4027 // Since this approach may cause inconsistencies in
4028 // the floating point <, >, == comparisons, depending on the floating
4029 // point model with which the compiler was built, we're going
4030 // to scale both sides by multiplying with
4031 // A.ObjectSize * B.ObjectSize. This ends up factoring away
4032 // the division and, with it, the need for any floating point
4033 // arithmetic.
4034 DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) *
4035 static_cast<uint64_t>(B.ObjectSize);
4036 DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) *
4037 static_cast<uint64_t>(A.ObjectSize);
4039 // If the two densities are equal, prioritize highest alignment
4040 // objects. This allows for similar alignment objects
4041 // to be packed together (given the same density).
4042 // There's room for improvement here, also, since we can pack
4043 // similar alignment (different density) objects next to each
4044 // other to save padding. This will also require further
4045 // complexity/iterations, and the overall gain isn't worth it,
4046 // in general. Something to keep in mind, though.
4047 if (DensityAScaled == DensityBScaled)
4048 return A.ObjectAlignment < B.ObjectAlignment;
4050 return DensityAScaled < DensityBScaled;
4053 } // namespace
4055 // Order the symbols in the local stack.
4056 // We want to place the local stack objects in some sort of sensible order.
4057 // The heuristic we use is to try and pack them according to static number
4058 // of uses and size of object in order to minimize code size.
4059 void X86FrameLowering::orderFrameObjects(
4060 const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
4061 const MachineFrameInfo &MFI = MF.getFrameInfo();
4063 // Don't waste time if there's nothing to do.
4064 if (ObjectsToAllocate.empty())
4065 return;
4067 // Create an array of all MFI objects. We won't need all of these
4068 // objects, but we're going to create a full array of them to make
4069 // it easier to index into when we're counting "uses" down below.
4070 // We want to be able to easily/cheaply access an object by simply
4071 // indexing into it, instead of having to search for it every time.
4072 std::vector<X86FrameSortingObject> SortingObjects(MFI.getObjectIndexEnd());
4074 // Walk the objects we care about and mark them as such in our working
4075 // struct.
4076 for (auto &Obj : ObjectsToAllocate) {
4077 SortingObjects[Obj].IsValid = true;
4078 SortingObjects[Obj].ObjectIndex = Obj;
4079 SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlign(Obj);
4080 // Set the size.
4081 int ObjectSize = MFI.getObjectSize(Obj);
4082 if (ObjectSize == 0)
4083 // Variable size. Just use 4.
4084 SortingObjects[Obj].ObjectSize = 4;
4085 else
4086 SortingObjects[Obj].ObjectSize = ObjectSize;
4089 // Count the number of uses for each object.
4090 for (auto &MBB : MF) {
4091 for (auto &MI : MBB) {
4092 if (MI.isDebugInstr())
4093 continue;
4094 for (const MachineOperand &MO : MI.operands()) {
4095 // Check to see if it's a local stack symbol.
4096 if (!MO.isFI())
4097 continue;
4098 int Index = MO.getIndex();
4099 // Check to see if it falls within our range, and is tagged
4100 // to require ordering.
4101 if (Index >= 0 && Index < MFI.getObjectIndexEnd() &&
4102 SortingObjects[Index].IsValid)
4103 SortingObjects[Index].ObjectNumUses++;
4108 // Sort the objects using the X86FrameSortingComparator (see its comment for
4109 // info).
4110 llvm::stable_sort(SortingObjects, X86FrameSortingComparator());
4112 // Now modify the original list to represent the final order that
4113 // we want. The order will depend on whether we're going to access them
4114 // from the stack pointer or the frame pointer. For SP, the objects we want
4115 // at smaller offsets should end up at the end of the list; for FP, the
4116 // order is flipped.
4117 int i = 0;
4118 for (auto &Obj : SortingObjects) {
4119 // All invalid items are sorted at the end, so it's safe to stop.
4120 if (!Obj.IsValid)
4121 break;
4122 ObjectsToAllocate[i++] = Obj.ObjectIndex;
4125 // Flip it if we're accessing off of the FP.
4126 if (!TRI->hasStackRealignment(MF) && hasFP(MF))
4127 std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
4130 unsigned
4131 X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
4132 // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue.
4133 unsigned Offset = 16;
4134 // RBP is immediately pushed.
4135 Offset += SlotSize;
4136 // All callee-saved registers are then pushed.
4137 Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
4138 // Every funclet allocates enough stack space for the largest outgoing call.
4139 Offset += getWinEHFuncletFrameSize(MF);
4140 return Offset;
4143 void X86FrameLowering::processFunctionBeforeFrameFinalized(
4144 MachineFunction &MF, RegScavenger *RS) const {
4145 // Mark the function as not having WinCFI. We will set it back to true in
4146 // emitPrologue if it gets called and emits CFI.
4147 MF.setHasWinCFI(false);
4149 // If we are using Windows x64 CFI, ensure that the stack is always 8 byte
4150 // aligned. The format doesn't support misaligned stack adjustments.
4151 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
4152 MF.getFrameInfo().ensureMaxAlignment(Align(SlotSize));
4154 // If this function isn't doing Win64-style C++ EH, we don't need to do
4155 // anything.
4156 if (STI.is64Bit() && MF.hasEHFunclets() &&
4157 classifyEHPersonality(MF.getFunction().getPersonalityFn()) ==
4158 EHPersonality::MSVC_CXX) {
4159 adjustFrameForMsvcCxxEh(MF);
4163 void X86FrameLowering::adjustFrameForMsvcCxxEh(MachineFunction &MF) const {
4164 // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset
4165 // relative to RSP after the prologue. Find the offset of the last fixed
4166 // object, so that we can allocate a slot immediately following it. If there
4167 // were no fixed objects, use offset -SlotSize, which is immediately after the
4168 // return address. Fixed objects have negative frame indices.
4169 MachineFrameInfo &MFI = MF.getFrameInfo();
4170 WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
4171 int64_t MinFixedObjOffset = -SlotSize;
4172 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I)
4173 MinFixedObjOffset = std::min(MinFixedObjOffset, MFI.getObjectOffset(I));
4175 for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
4176 for (WinEHHandlerType &H : TBME.HandlerArray) {
4177 int FrameIndex = H.CatchObj.FrameIndex;
4178 if (FrameIndex != INT_MAX) {
4179 // Ensure alignment.
4180 unsigned Align = MFI.getObjectAlign(FrameIndex).value();
4181 MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align;
4182 MinFixedObjOffset -= MFI.getObjectSize(FrameIndex);
4183 MFI.setObjectOffset(FrameIndex, MinFixedObjOffset);
4188 // Ensure alignment.
4189 MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8;
4190 int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
4191 int UnwindHelpFI =
4192 MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*IsImmutable=*/false);
4193 EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
4195 // Store -2 into UnwindHelp on function entry. We have to scan forwards past
4196 // other frame setup instructions.
4197 MachineBasicBlock &MBB = MF.front();
4198 auto MBBI = MBB.begin();
4199 while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
4200 ++MBBI;
4202 DebugLoc DL = MBB.findDebugLoc(MBBI);
4203 addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)),
4204 UnwindHelpFI)
4205 .addImm(-2);
4208 void X86FrameLowering::processFunctionBeforeFrameIndicesReplaced(
4209 MachineFunction &MF, RegScavenger *RS) const {
4210 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
4212 if (STI.is32Bit() && MF.hasEHFunclets())
4213 restoreWinEHStackPointersInParent(MF);
4214 // We have emitted the prolog and epilog, so the stack pointer saving
4215 // instruction is no longer needed.
4216 if (MachineInstr *MI = X86FI->getStackPtrSaveMI()) {
4217 MI->eraseFromParent();
4218 X86FI->setStackPtrSaveMI(nullptr);
4222 void X86FrameLowering::restoreWinEHStackPointersInParent(
4223 MachineFunction &MF) const {
4224 // 32-bit functions have to restore stack pointers when control is transferred
4225 // back to the parent function. These blocks are identified as eh pads that
4226 // are not funclet entries.
4227 bool IsSEH = isAsynchronousEHPersonality(
4228 classifyEHPersonality(MF.getFunction().getPersonalityFn()));
4229 for (MachineBasicBlock &MBB : MF) {
4230 bool NeedsRestore = MBB.isEHPad() && !MBB.isEHFuncletEntry();
4231 if (NeedsRestore)
4232 restoreWin32EHStackPointers(MBB, MBB.begin(), DebugLoc(),
4233 /*RestoreSP=*/IsSEH);
4237 // Compute the alignment gap between current SP after spilling FP/BP and the
4238 // next properly aligned stack offset.
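// For example, spilling one 8-byte register with a 16-byte stack alignment
// leaves an 8-byte gap that must also be allocated to keep SP aligned.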
4239 static int computeFPBPAlignmentGap(MachineFunction &MF,
4240 const TargetRegisterClass *RC,
4241 unsigned NumSpilledRegs) {
4242 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
4243 unsigned AllocSize = TRI->getSpillSize(*RC) * NumSpilledRegs;
4244 Align StackAlign = MF.getSubtarget().getFrameLowering()->getStackAlign();
4245 unsigned AlignedSize = alignTo(AllocSize, StackAlign);
4246 return AlignedSize - AllocSize;
4249 void X86FrameLowering::spillFPBPUsingSP(MachineFunction &MF,
4250 MachineBasicBlock::iterator BeforeMI,
4251 Register FP, Register BP,
4252 int SPAdjust) const {
4253 assert(FP.isValid() || BP.isValid());
4255 MachineBasicBlock *MBB = BeforeMI->getParent();
4256 DebugLoc DL = BeforeMI->getDebugLoc();
4258 // Spill FP.
4259 if (FP.isValid()) {
4260 BuildMI(*MBB, BeforeMI, DL,
4261 TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>())))
4262 .addReg(FP);
4265 // Spill BP.
4266 if (BP.isValid()) {
4267 BuildMI(*MBB, BeforeMI, DL,
4268 TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>())))
4269 .addReg(BP);
4272 // Make sure SP is aligned.
4273 if (SPAdjust)
4274 emitSPUpdate(*MBB, BeforeMI, DL, -SPAdjust, false);
4276 // Emit unwinding information.
4277 if (FP.isValid() && needsDwarfCFI(MF)) {
4278 // Emit .cfi_remember_state to remember old frame.
4279 unsigned CFIIndex =
4280 MF.addFrameInst(MCCFIInstruction::createRememberState(nullptr));
4281 BuildMI(*MBB, BeforeMI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
4282 .addCFIIndex(CFIIndex);
4284 // Setup new CFA value with DW_CFA_def_cfa_expression:
4285 // DW_OP_breg7+offset, DW_OP_deref, DW_OP_consts 16, DW_OP_plus
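// i.e. CFA = *(SP + offset) + SlotSize * 2: the slot at SP + offset holds the
// frame pointer value pushed just above, and the CFA sits two slots (return
// address + saved FP) above that value.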
4286 SmallString<64> CfaExpr;
4287 uint8_t buffer[16];
4288 int Offset = SPAdjust;
4289 if (BP.isValid())
4290 Offset += TRI->getSpillSize(*TRI->getMinimalPhysRegClass(BP));
4291 // If BeforeMI is a frame setup instruction, we need to adjust the position
4292 // and offset of the new cfi instruction.
4293 if (TII.isFrameSetup(*BeforeMI)) {
4294 Offset += alignTo(TII.getFrameSize(*BeforeMI), getStackAlign());
4295 BeforeMI = std::next(BeforeMI);
4297 Register StackPtr = TRI->getStackRegister();
4298 if (STI.isTarget64BitILP32())
4299 StackPtr = Register(getX86SubSuperRegister(StackPtr, 64));
4300 unsigned DwarfStackPtr = TRI->getDwarfRegNum(StackPtr, true);
4301 CfaExpr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfStackPtr));
4302 CfaExpr.append(buffer, buffer + encodeSLEB128(Offset, buffer));
4303 CfaExpr.push_back(dwarf::DW_OP_deref);
4304 CfaExpr.push_back(dwarf::DW_OP_consts);
4305 CfaExpr.append(buffer, buffer + encodeSLEB128(SlotSize * 2, buffer));
4306 CfaExpr.push_back((uint8_t)dwarf::DW_OP_plus);
4308 SmallString<64> DefCfaExpr;
4309 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
4310 DefCfaExpr.append(buffer, buffer + encodeSLEB128(CfaExpr.size(), buffer));
4311 DefCfaExpr.append(CfaExpr.str());
4312 BuildCFI(*MBB, BeforeMI, DL,
4313 MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str()),
4314 MachineInstr::FrameSetup);
4318 void X86FrameLowering::restoreFPBPUsingSP(MachineFunction &MF,
4319 MachineBasicBlock::iterator AfterMI,
4320 Register FP, Register BP,
4321 int SPAdjust) const {
4322 assert(FP.isValid() || BP.isValid());
4324 // Adjust SP so it points to spilled FP or BP.
4325 MachineBasicBlock *MBB = AfterMI->getParent();
4326 MachineBasicBlock::iterator Pos = std::next(AfterMI);
4327 DebugLoc DL = AfterMI->getDebugLoc();
4328 if (SPAdjust)
4329 emitSPUpdate(*MBB, Pos, DL, SPAdjust, false);
4331 // Restore BP.
4332 if (BP.isValid()) {
4333 BuildMI(*MBB, Pos, DL,
4334 TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())), BP);
4337 // Restore FP.
4338 if (FP.isValid()) {
4339 BuildMI(*MBB, Pos, DL,
4340 TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())), FP);
4342 // Emit unwinding information.
4343 if (needsDwarfCFI(MF)) {
4344 // Restore original frame with .cfi_restore_state.
4345 unsigned CFIIndex =
4346 MF.addFrameInst(MCCFIInstruction::createRestoreState(nullptr));
4347 BuildMI(*MBB, Pos, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
4348 .addCFIIndex(CFIIndex);
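// Spill FP/BP with pushes before BeforeMI and reload them with pops after
// AfterMI, adding an SP adjustment in between so the stack stays aligned.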
4353 void X86FrameLowering::saveAndRestoreFPBPUsingSP(
4354 MachineFunction &MF, MachineBasicBlock::iterator BeforeMI,
4355 MachineBasicBlock::iterator AfterMI, bool SpillFP, bool SpillBP) const {
4356 assert(SpillFP || SpillBP);
4358 Register FP, BP;
4359 const TargetRegisterClass *RC;
4360 unsigned NumRegs = 0;
4362 if (SpillFP) {
4363 FP = TRI->getFrameRegister(MF);
4364 if (STI.isTarget64BitILP32())
4365 FP = Register(getX86SubSuperRegister(FP, 64));
4366 RC = TRI->getMinimalPhysRegClass(FP);
4367 ++NumRegs;
4369 if (SpillBP) {
4370 BP = TRI->getBaseRegister();
4371 if (STI.isTarget64BitILP32())
4372 BP = Register(getX86SubSuperRegister(BP, 64));
4373 RC = TRI->getMinimalPhysRegClass(BP);
4374 ++NumRegs;
4376 int SPAdjust = computeFPBPAlignmentGap(MF, RC, NumRegs);
4378 spillFPBPUsingSP(MF, BeforeMI, FP, BP, SPAdjust);
4379 restoreFPBPUsingSP(MF, AfterMI, FP, BP, SPAdjust);
4382 bool X86FrameLowering::skipSpillFPBP(
4383 MachineFunction &MF, MachineBasicBlock::reverse_iterator &MI) const {
4384 if (MI->getOpcode() == X86::LCMPXCHG16B_SAVE_RBX) {
4385 // The pseudo instruction LCMPXCHG16B_SAVE_RBX is generated in the form
4386 // SaveRbx = COPY RBX
4387 // SaveRbx = LCMPXCHG16B_SAVE_RBX ..., SaveRbx, implicit-def rbx
4388 // And later LCMPXCHG16B_SAVE_RBX is expanded to restore RBX from SaveRbx.
4389 // We should skip this instruction sequence.
4390 int FI;
4391 unsigned Reg;
4392 while (!(MI->getOpcode() == TargetOpcode::COPY &&
4393 MI->getOperand(1).getReg() == X86::RBX) &&
4394 !((Reg = TII.isStoreToStackSlot(*MI, FI)) && Reg == X86::RBX))
4395 ++MI;
4396 return true;
4398 return false;
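// Return true if MI reads FP/BP or defines them (possibly via an overlapping
// register); AccessFP/AccessBP report which of the two is touched.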
4401 static bool isFPBPAccess(const MachineInstr &MI, Register FP, Register BP,
4402 const TargetRegisterInfo *TRI, bool &AccessFP,
4403 bool &AccessBP) {
4404 AccessFP = AccessBP = false;
4405 if (FP) {
4406 if (MI.findRegisterUseOperandIdx(FP, TRI, false) != -1 ||
4407 MI.findRegisterDefOperandIdx(FP, TRI, false, true) != -1)
4408 AccessFP = true;
4410 if (BP) {
4411 if (MI.findRegisterUseOperandIdx(BP, TRI, false) != -1 ||
4412 MI.findRegisterDefOperandIdx(BP, TRI, false, true) != -1)
4413 AccessBP = true;
4415 return AccessFP || AccessBP;
4418 // An invoke instruction has been lowered to a normal function call. We try to
4419 // figure out whether MI comes from an invoke.
4420 // Is there a better way to do this?
4421 static bool isInvoke(const MachineInstr &MI, bool InsideEHLabels) {
4422 if (!MI.isCall())
4423 return false;
4424 if (InsideEHLabels)
4425 return true;
4427 const MachineBasicBlock *MBB = MI.getParent();
4428 if (!MBB->hasEHPadSuccessor())
4429 return false;
4431 // Check if there is another call instruction from MI to the end of MBB.
4432 MachineBasicBlock::const_iterator MBBI = MI, ME = MBB->end();
4433 for (++MBBI; MBBI != ME; ++MBBI)
4434 if (MBBI->isCall())
4435 return false;
4436 return true;
4439 /// Given the live range of FP or BP (DefMI, KillMI), check if there is any
4440 /// interfered stack access in the range, usually generated by register spill.
4441 void X86FrameLowering::checkInterferedAccess(
4442 MachineFunction &MF, MachineBasicBlock::reverse_iterator DefMI,
4443 MachineBasicBlock::reverse_iterator KillMI, bool SpillFP,
4444 bool SpillBP) const {
4445 if (DefMI == KillMI)
4446 return;
4447 if (TRI->hasBasePointer(MF)) {
4448 if (!SpillBP)
4449 return;
4450 } else {
4451 if (!SpillFP)
4452 return;
4455 auto MI = KillMI;
4456 while (MI != DefMI) {
4457 if (any_of(MI->operands(),
4458 [](const MachineOperand &MO) { return MO.isFI(); }))
4459 MF.getContext().reportError(SMLoc(),
4460 "Interfering use of base pointer/frame "
4461 "pointer.");
4462 MI++;
4466 /// If a function uses a base pointer and the base pointer is clobbered by inline
4467 /// asm, RA doesn't detect this case, and after the inline asm the base pointer
4468 /// contains a garbage value.
4469 /// For example, if a 32-bit x86 function uses the base pointer esi, and esi is
4470 /// clobbered by the following inline asm
4471 /// asm("rep movsb" : "+D"(ptr), "+S"(x), "+c"(c)::"memory");
4472 /// We need to save esi before the asm and restore it after the asm.
4474 /// The problem can also occur with the frame pointer if there is a function call
4475 /// and the callee uses a different calling convention and clobbers the fp.
4477 /// Because normal frame objects (spill slots) are accessed through the fp/bp
4478 /// registers, we can't spill fp/bp to normal spill slots.
4480 /// FIXME: There are 2 possible enhancements:
4481 /// 1. In many cases there are different physical registers not clobbered by
4482 /// inline asm, we can use one of them as base pointer. Or use a virtual
4483 /// register as base pointer and let RA allocate a physical register to it.
4484 /// 2. If no other instruction accesses the stack through fp/bp from the
4485 /// inline asm to the epilogue, and there is no CFI requirement for a correct
4486 /// fp, we can skip the save and restore operations.
4487 void X86FrameLowering::spillFPBP(MachineFunction &MF) const {
4488 Register FP, BP;
4489 const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
4490 if (TFI.hasFP(MF))
4491 FP = TRI->getFrameRegister(MF);
4492 if (TRI->hasBasePointer(MF))
4493 BP = TRI->getBaseRegister();
4495 // Currently only inline asm and function calls can clobber fp/bp, so we can
4496 // do a quick test and return early.
4497 if (!MF.hasInlineAsm()) {
4498 X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
4499 if (!X86FI->getFPClobberedByCall())
4500 FP = 0;
4501 if (!X86FI->getBPClobberedByCall())
4502 BP = 0;
4504 if (!FP && !BP)
4505 return;
4507 for (MachineBasicBlock &MBB : MF) {
4508 bool InsideEHLabels = false;
4509 auto MI = MBB.rbegin(), ME = MBB.rend();
4510 auto TermMI = MBB.getFirstTerminator();
4511 if (TermMI == MBB.begin())
4512 continue;
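// Walk the block in reverse, starting from the instruction just before the
// first terminator.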
4513 MI = *(std::prev(TermMI));
4515 while (MI != ME) {
4516 // Skip frame setup/destroy instructions.
4517 // Skip Invoke (call inside try block) instructions.
4518 // Skip instructions handled by target.
4519 if (MI->getFlag(MachineInstr::MIFlag::FrameSetup) ||
4520 MI->getFlag(MachineInstr::MIFlag::FrameDestroy) ||
4521 isInvoke(*MI, InsideEHLabels) || skipSpillFPBP(MF, MI)) {
4522 ++MI;
4523 continue;
4526 if (MI->getOpcode() == TargetOpcode::EH_LABEL) {
4527 InsideEHLabels = !InsideEHLabels;
4528 ++MI;
4529 continue;
4532 bool AccessFP, AccessBP;
4533 // Check if fp or bp is used in MI.
4534 if (!isFPBPAccess(*MI, FP, BP, TRI, AccessFP, AccessBP)) {
4535 ++MI;
4536 continue;
4539 // Look for the range [DefMI, KillMI] in which fp or bp is defined and
4540 // used.
4541 bool FPLive = false, BPLive = false;
4542 bool SpillFP = false, SpillBP = false;
4543 auto DefMI = MI, KillMI = MI;
4544 do {
4545 SpillFP |= AccessFP;
4546 SpillBP |= AccessBP;
4548 // Maintain FPLive and BPLive.
4549 if (FPLive && MI->findRegisterDefOperandIdx(FP, TRI, false, true) != -1)
4550 FPLive = false;
4551 if (FP && MI->findRegisterUseOperandIdx(FP, TRI, false) != -1)
4552 FPLive = true;
4553 if (BPLive && MI->findRegisterDefOperandIdx(BP, TRI, false, true) != -1)
4554 BPLive = false;
4555 if (BP && MI->findRegisterUseOperandIdx(BP, TRI, false) != -1)
4556 BPLive = true;
4558 DefMI = MI++;
4559 } while ((MI != ME) &&
4560 (FPLive || BPLive ||
4561 isFPBPAccess(*MI, FP, BP, TRI, AccessFP, AccessBP)));
4563 // Don't need to save/restore if FP is accessed through llvm.frameaddress.
4564 if (FPLive && !SpillBP)
4565 continue;
4567 // If the bp is clobbered by a call, we should save and restore outside of
4568 // the frame setup instructions.
4569 if (KillMI->isCall() && DefMI != ME) {
4570 auto FrameSetup = std::next(DefMI);
4571 // Look for frame setup instruction toward the start of the BB.
4572 // If we reach another call instruction, it means no frame setup
4573 // instruction for the current call instruction.
4574 while (FrameSetup != ME && !TII.isFrameSetup(*FrameSetup) &&
4575 !FrameSetup->isCall())
4576 ++FrameSetup;
4577 // If a frame setup instruction is found, we need to find out the
4578 // corresponding frame destroy instruction.
4579 if (FrameSetup != ME && TII.isFrameSetup(*FrameSetup) &&
4580 (TII.getFrameSize(*FrameSetup) ||
4581 TII.getFrameAdjustment(*FrameSetup))) {
4582 while (!TII.isFrameInstr(*KillMI))
4583 --KillMI;
4584 DefMI = FrameSetup;
4585 MI = DefMI;
4586 ++MI;
4590 checkInterferedAccess(MF, DefMI, KillMI, SpillFP, SpillBP);
4592 // Call target function to spill and restore FP and BP registers.
4593 saveAndRestoreFPBPUsingSP(MF, &(*DefMI), &(*KillMI), SpillFP, SpillBP);