[x86] fix assert with horizontal math + broadcast of vector (PR43402)
[llvm-core.git] / lib / Target / AMDGPU / SIFrameLowering.cpp
blob45c06ebb547acaa3be71929ba2c8e2d1c5e0d84e
1 //===----------------------- SIFrameLowering.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
9 #include "SIFrameLowering.h"
10 #include "AMDGPUSubtarget.h"
11 #include "SIInstrInfo.h"
12 #include "SIMachineFunctionInfo.h"
13 #include "SIRegisterInfo.h"
14 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "llvm/CodeGen/LivePhysRegs.h"
17 #include "llvm/CodeGen/MachineFrameInfo.h"
18 #include "llvm/CodeGen/MachineFunction.h"
19 #include "llvm/CodeGen/MachineInstrBuilder.h"
20 #include "llvm/CodeGen/RegisterScavenging.h"
22 using namespace llvm;
24 #define DEBUG_TYPE "frame-info"
27 static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
28 const MachineFunction &MF) {
29 return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
30 ST.getMaxNumSGPRs(MF) / 4);
33 static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
34 const MachineFunction &MF) {
35 return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
36 ST.getMaxNumSGPRs(MF));
39 // Find a scratch register that we can use at the start of the prologue to
40 // re-align the stack pointer. We avoid using callee-save registers since they
41 // may appear to be free when this is called from canUseAsPrologue (during
42 // shrink wrapping), but then no longer be free when this is called from
43 // emitPrologue.
45 // FIXME: This is a bit conservative, since in the above case we could use one
46 // of the callee-save registers as a scratch temp to re-align the stack pointer,
47 // but we would then have to make sure that we were in fact saving at least one
48 // callee-save register in the prologue, which is additional complexity that
49 // doesn't seem worth the benefit.
50 static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
51 LivePhysRegs &LiveRegs,
52 const TargetRegisterClass &RC,
53 bool Unused = false) {
54 // Mark callee saved registers as used so we will not choose them.
55 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
56 for (unsigned i = 0; CSRegs[i]; ++i)
57 LiveRegs.addReg(CSRegs[i]);
59 if (Unused) {
60 // We are looking for a register that can be used throughout the entire
61 // function, so any use is unacceptable.
62 for (unsigned Reg : RC) {
63 if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
64 return Reg;
66 } else {
67 for (unsigned Reg : RC) {
68 if (LiveRegs.available(MRI, Reg))
69 return Reg;
73 // If we require an unused register, this is used in contexts where failure is
74 // an option and has an alternative plan. In other contexts, this must
75 // succeed0.
76 if (!Unused)
77 report_fatal_error("failed to find free scratch register");
79 return AMDGPU::NoRegister;
82 static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
83 LivePhysRegs LiveRegs;
84 LiveRegs.init(*MRI.getTargetRegisterInfo());
85 return findScratchNonCalleeSaveRegister(
86 MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
89 // We need to specially emit stack operations here because a different frame
90 // register is used than in the rest of the function, as getFrameRegister would
91 // use.
92 static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
93 MachineBasicBlock::iterator I,
94 const SIInstrInfo *TII, unsigned SpillReg,
95 unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
96 MachineFunction *MF = MBB.getParent();
97 MachineFrameInfo &MFI = MF->getFrameInfo();
99 int64_t Offset = MFI.getObjectOffset(FI);
101 MachineMemOperand *MMO = MF->getMachineMemOperand(
102 MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
103 MFI.getObjectAlignment(FI));
105 if (isUInt<12>(Offset)) {
106 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
107 .addReg(SpillReg, RegState::Kill)
108 .addReg(ScratchRsrcReg)
109 .addReg(SPReg)
110 .addImm(Offset)
111 .addImm(0) // glc
112 .addImm(0) // slc
113 .addImm(0) // tfe
114 .addImm(0) // dlc
115 .addMemOperand(MMO);
116 return;
119 MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
120 MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
122 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
123 .addImm(Offset);
125 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
126 .addReg(SpillReg, RegState::Kill)
127 .addReg(OffsetReg, RegState::Kill)
128 .addReg(ScratchRsrcReg)
129 .addReg(SPReg)
130 .addImm(0)
131 .addImm(0) // glc
132 .addImm(0) // slc
133 .addImm(0) // tfe
134 .addImm(0) // dlc
135 .addMemOperand(MMO);
138 static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
139 MachineBasicBlock::iterator I,
140 const SIInstrInfo *TII, unsigned SpillReg,
141 unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
142 MachineFunction *MF = MBB.getParent();
143 MachineFrameInfo &MFI = MF->getFrameInfo();
144 int64_t Offset = MFI.getObjectOffset(FI);
146 MachineMemOperand *MMO = MF->getMachineMemOperand(
147 MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
148 MFI.getObjectAlignment(FI));
150 if (isUInt<12>(Offset)) {
151 BuildMI(MBB, I, DebugLoc(),
152 TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
153 .addReg(ScratchRsrcReg)
154 .addReg(SPReg)
155 .addImm(Offset)
156 .addImm(0) // glc
157 .addImm(0) // slc
158 .addImm(0) // tfe
159 .addImm(0) // dlc
160 .addMemOperand(MMO);
161 return;
164 MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
165 MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
167 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
168 .addImm(Offset);
170 BuildMI(MBB, I, DebugLoc(),
171 TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
172 .addReg(OffsetReg, RegState::Kill)
173 .addReg(ScratchRsrcReg)
174 .addReg(SPReg)
175 .addImm(0)
176 .addImm(0) // glc
177 .addImm(0) // slc
178 .addImm(0) // tfe
179 .addImm(0) // dlc
180 .addMemOperand(MMO);
// Emits, at the beginning of MBB, the instructions that set up the flat
// scratch registers from the preloaded FLAT_SCRATCH_INIT argument and the
// scratch wave offset register. Three sequences appear below:
//   - flat scratch is a pointer, GFX10 and newer: 64-bit add into the init
//     register pair, then one S_SETREG_B32 per half to write the
//     FLAT_SCR_LO / FLAT_SCR_HI hardware registers;
//   - flat scratch is a pointer, older generations: 64-bit add written
//     directly to FLAT_SCR_LO / FLAT_SCR_HI;
//   - otherwise (asserted to be older than GFX10): the high init register is
//     copied to FLAT_SCR_LO, and the low init register plus the wave offset,
//     shifted right by 8, is written to FLAT_SCR_HI.
183 void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
184 MachineFunction &MF,
185 MachineBasicBlock &MBB) const {
186 const SIInstrInfo *TII = ST.getInstrInfo();
187 const SIRegisterInfo* TRI = &TII->getRegisterInfo();
188 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
190 // We don't need this if we only have spills since there is no user facing
191 // scratch.
193 // TODO: If we know we don't have flat instructions earlier, we can omit
194 // this from the input registers.
196 // TODO: We only need to know if we access scratch space through a flat
197 // pointer. Because we only detect if flat instructions are used at all,
198 // this will be used more often than necessary on VI.
200 // Debug location must be unknown since the first debug location is used to
201 // determine the end of the prologue.
202 DebugLoc DL;
203 MachineBasicBlock::iterator I = MBB.begin();
205 Register FlatScratchInitReg =
206 MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
// The init register is read by the instructions built below, so record it as
// a live-in of both the function and this block.
208 MachineRegisterInfo &MRI = MF.getRegInfo();
209 MRI.addLiveIn(FlatScratchInitReg);
210 MBB.addLiveIn(FlatScratchInitReg);
212 Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
213 Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
215 unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
217 // Do a 64-bit pointer add.
218 if (ST.flatScratchIsPointer()) {
219 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
// The sum is formed in place in the init register pair and then written to
// the hardware registers with S_SETREG_B32.
220 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
221 .addReg(FlatScrInitLo)
222 .addReg(ScratchWaveOffsetReg);
223 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
224 .addReg(FlatScrInitHi)
225 .addImm(0);
226 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
227 addReg(FlatScrInitLo).
228 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
229 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
230 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
231 addReg(FlatScrInitHi).
232 addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
233 (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
234 return;
// Before GFX10 the add writes FLAT_SCR_LO / FLAT_SCR_HI directly.
237 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
238 .addReg(FlatScrInitLo)
239 .addReg(ScratchWaveOffsetReg);
240 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
241 .addReg(FlatScrInitHi)
242 .addImm(0);
244 return;
247 assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);
249 // Copy the size in bytes.
250 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
251 .addReg(FlatScrInitHi, RegState::Kill);
253 // Add wave offset in bytes to private base offset.
254 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
255 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
256 .addReg(FlatScrInitLo)
257 .addReg(ScratchWaveOffsetReg);
259 // Convert offset to 256-byte units.
260 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
261 .addReg(FlatScrInitLo, RegState::Kill)
262 .addImm(8);
265 unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
266 const GCNSubtarget &ST,
267 const SIInstrInfo *TII,
268 const SIRegisterInfo *TRI,
269 SIMachineFunctionInfo *MFI,
270 MachineFunction &MF) const {
271 MachineRegisterInfo &MRI = MF.getRegInfo();
273 // We need to insert initialization of the scratch resource descriptor.
274 unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
275 if (ScratchRsrcReg == AMDGPU::NoRegister ||
276 !MRI.isPhysRegUsed(ScratchRsrcReg))
277 return AMDGPU::NoRegister;
279 if (ST.hasSGPRInitBug() ||
280 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
281 return ScratchRsrcReg;
283 // We reserved the last registers for this. Shift it down to the end of those
284 // which were actually used.
286 // FIXME: It might be safer to use a pseudoregister before replacement.
288 // FIXME: We should be able to eliminate unused input registers. We only
289 // cannot do this for the resources required for scratch access. For now we
290 // skip over user SGPRs and may leave unused holes.
292 // We find the resource first because it has an alignment requirement.
294 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
295 ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
296 AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
298 // Skip the last N reserved elements because they should have already been
299 // reserved for VCC etc.
300 for (MCPhysReg Reg : AllSGPR128s) {
301 // Pick the first unallocated one. Make sure we don't clobber the other
302 // reserved input we needed.
303 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
304 MRI.replaceRegWith(ScratchRsrcReg, Reg);
305 MFI->setScratchRSrcReg(Reg);
306 return Reg;
310 return ScratchRsrcReg;
313 // Shift down registers reserved for the scratch wave offset.
//
// Returns the register holding the scratch wave offset after any replacement,
// paired with a flag. The flag is true only when this call switched the
// scratch wave offset and frame offset registers to a newly chosen SGPR.
// Returns (NoRegister, false) when no scratch wave offset register is set, or
// when the function has no frame pointer and the register is unused.
// Must only be called for entry functions (asserted below).
314 std::pair<unsigned, bool>
315 SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
316 const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
317 SIMachineFunctionInfo *MFI, MachineFunction &MF) const {
318 MachineRegisterInfo &MRI = MF.getRegInfo();
319 unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
321 assert(MFI->isEntryFunction());
323 // No replacement necessary.
324 if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
325 (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) {
326 return std::make_pair(AMDGPU::NoRegister, false);
// With the SGPR init bug the current register is kept as is.
329 if (ST.hasSGPRInitBug())
330 return std::make_pair(ScratchWaveOffsetReg, false);
332 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
// Candidates are the SGPRs that follow the preloaded ones. If the preloaded
// registers already cover the whole list there is nothing to choose from.
334 ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
335 if (NumPreloaded > AllSGPRs.size())
336 return std::make_pair(ScratchWaveOffsetReg, false);
338 AllSGPRs = AllSGPRs.slice(NumPreloaded);
340 // We need to drop register from the end of the list that we cannot use
341 // for the scratch wave offset.
342 // + 2 s102 and s103 do not exist on VI.
343 // + 2 for vcc
344 // + 2 for xnack_mask
345 // + 2 for flat_scratch
346 // + 4 for registers reserved for scratch resource register
347 // + 1 for register reserved for scratch wave offset. (By excluding this
348 // register from the list to consider, it means that when this
349 // register is being used for the scratch wave offset and there
350 // are no other free SGPRs, then the value will stay in this register.
351 // + 1 if stack pointer is used.
352 // ----
353 // 13 (+1)
354 unsigned ReservedRegCount = 13;
// Too few candidates remain to drop the reserved tail, so keep the current
// register.
356 if (AllSGPRs.size() < ReservedRegCount)
357 return std::make_pair(ScratchWaveOffsetReg, false);
// If the register is not the default reserved one it counts as already
// handled, and the loop below leaves it alone.
359 bool HandledScratchWaveOffsetReg =
360 ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
361 bool FPAdjusted = false;
363 for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
364 // Pick the first unallocated SGPR. Be careful not to pick an alias of the
365 // scratch descriptor, since we haven't added its uses yet.
366 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
367 if (!HandledScratchWaveOffsetReg) {
368 HandledScratchWaveOffsetReg = true;
370 MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
// When the stack pointer register is the same as the scratch wave offset
// register, it is moved to the new register as well.
371 if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
372 assert(!hasFP(MF));
373 MFI->setStackPtrOffsetReg(Reg);
376 MFI->setScratchWaveOffsetReg(Reg);
377 MFI->setFrameOffsetReg(Reg);
378 ScratchWaveOffsetReg = Reg;
379 FPAdjusted = true;
380 break;
385 return std::make_pair(ScratchWaveOffsetReg, FPAdjusted);
// Emits the prologue of an entry function into MBB, which is asserted to be
// the first block of MF. The steps visible below are:
//   - flat scratch initialization, when the function info requests it;
//   - selection of the scratch resource and scratch wave offset registers;
//   - re-adding the preloaded input registers as live-ins;
//   - marking the selected registers live-in to every other block;
//   - copying the preloaded values into the selected registers, ordered so
//     that no input is overwritten before it has been read;
//   - scratch resource setup, when the resource register is used;
//   - initializing the stack pointer register, when there is a frame pointer.
388 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
389 MachineBasicBlock &MBB) const {
390 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
392 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
394 // If we only have SGPR spills, we won't actually be using scratch memory
395 // since these spill to VGPRs.
397 // FIXME: We should be cleaning up these unused SGPR spill frame indices
398 // somewhere.
400 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
401 const SIInstrInfo *TII = ST.getInstrInfo();
402 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
403 MachineRegisterInfo &MRI = MF.getRegInfo();
404 const Function &F = MF.getFunction();
406 // We need to do the replacement of the private segment buffer and wave offset
407 // register even if there are no stack objects. There could be stores to undef
408 // or a constant without an associated object.
410 // FIXME: We still have implicit uses on SGPR spill instructions in case they
411 // need to spill to vector memory. It's likely that will not happen, but at
412 // this point it appears we need the setup. This part of the prolog should be
413 // emitted after frame indices are eliminated.
415 if (MFI->hasFlatScratchInit())
416 emitFlatScratchInit(ST, MF, MBB);
418 unsigned ScratchRsrcReg
419 = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
421 unsigned ScratchWaveOffsetReg;
422 bool FPAdjusted;
423 std::tie(ScratchWaveOffsetReg, FPAdjusted) =
424 getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
426 // We need to insert initialization of the scratch resource descriptor.
427 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
428 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
// The preloaded private segment buffer is only looked up for HSA or Mesa
// functions. Otherwise it stays NoRegister.
430 unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
431 if (ST.isAmdHsaOrMesa(F)) {
432 PreloadedPrivateBufferReg = MFI->getPreloadedReg(
433 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
436 bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister &&
437 MRI.isPhysRegUsed(ScratchWaveOffsetReg);
438 bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
439 MRI.isPhysRegUsed(ScratchRsrcReg);
441 // FIXME: Hack to not crash in situations which emitted an error.
442 if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
443 return;
445 // We added live-ins during argument lowering, but since they were not used
446 // they were deleted. We're adding the uses now, so add them back.
447 MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
448 MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
450 if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
451 assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
452 MRI.addLiveIn(PreloadedPrivateBufferReg);
453 MBB.addLiveIn(PreloadedPrivateBufferReg);
456 // Make the register selected live throughout the function.
457 for (MachineBasicBlock &OtherBB : MF) {
458 if (&OtherBB == &MBB)
459 continue;
461 if (OffsetRegUsed || FPAdjusted)
462 OtherBB.addLiveIn(ScratchWaveOffsetReg);
464 if (ResourceRegUsed)
465 OtherBB.addLiveIn(ScratchRsrcReg);
468 DebugLoc DL;
469 MachineBasicBlock::iterator I = MBB.begin();
471 // If we reserved the original input registers, we don't need to copy to the
472 // reserved registers.
474 bool CopyBuffer = ResourceRegUsed &&
475 PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
476 ST.isAmdHsaOrMesa(F) &&
477 ScratchRsrcReg != PreloadedPrivateBufferReg;
479 // This needs to be careful of the copying order to avoid overwriting one of
480 // the input registers before it's been copied to its final
481 // destination. Usually the offset should be copied first.
482 bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
483 ScratchWaveOffsetReg);
484 if (CopyBuffer && CopyBufferFirst) {
485 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
486 .addReg(PreloadedPrivateBufferReg, RegState::Kill);
489 unsigned SPReg = MFI->getStackPtrOffsetReg();
490 assert(SPReg != AMDGPU::SP_REG);
492 // FIXME: Remove the isPhysRegUsed checks
493 const bool HasFP = hasFP(MF);
// The preloaded wave offset is marked killed by this copy only when the
// function has a frame pointer.
495 if (HasFP || OffsetRegUsed) {
496 assert(ScratchWaveOffsetReg);
497 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
498 .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
501 if (CopyBuffer && !CopyBufferFirst) {
502 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
503 .addReg(PreloadedPrivateBufferReg, RegState::Kill);
506 if (ResourceRegUsed) {
507 emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
508 PreloadedPrivateBufferReg, ScratchRsrcReg);
511 if (HasFP) {
// NOTE(review): this DebugLoc has the same name as the DL declared earlier in
// this function.
512 DebugLoc DL;
513 const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
514 int64_t StackSize = FrameInfo.getStackSize();
516 // On kernel entry, the private scratch wave offset is the SP value.
517 if (StackSize == 0) {
518 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
519 .addReg(MFI->getScratchWaveOffsetReg());
520 } else {
// The stack size is multiplied by the wavefront size before it is added to
// the wave offset.
521 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
522 .addReg(MFI->getScratchWaveOffsetReg())
523 .addImm(StackSize * ST.getWavefrontSize());
528 // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
//
// Instructions are inserted ahead of I in MBB and fill ScratchRsrcReg.
//   - AMDPAL: form the GIT pointer in the low two dwords (high half from
//     getGITPtrHigh() or from S_GETPC_B64, low half from an input SGPR), then
//     load the four-dword descriptor through it.
//   - Mesa graphics shader, or no preloaded private buffer register: fill
//     dwords 0-1 from the implicit buffer pointer (moved or loaded) or from
//     the SCRATCH_RSRC_DWORD0/1 external symbols, then write dwords 2-3 from
//     getScratchRsrcWords23().
// Nothing is emitted when neither case applies.
529 void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
530 MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
531 MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
532 unsigned ScratchRsrcReg) const {
534 const SIInstrInfo *TII = ST.getInstrInfo();
535 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
536 const Function &Fn = MF.getFunction();
537 DebugLoc DL;
539 if (ST.isAmdPalOS()) {
540 // The pointer to the GIT is formed from the offset passed in and either
541 // the amdgpu-git-ptr-high function attribute or the top part of the PC
542 Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
543 Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
544 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
546 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
// 0xffffffff is treated as "no explicit high half": the PC is read instead.
548 if (MFI->getGITPtrHigh() != 0xffffffff) {
549 BuildMI(MBB, I, DL, SMovB32, RsrcHi)
550 .addImm(MFI->getGITPtrHigh())
551 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
552 } else {
553 const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
554 BuildMI(MBB, I, DL, GetPC64, Rsrc01);
556 auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
557 if (ST.hasMergedShaders()) {
558 switch (MF.getFunction().getCallingConv()) {
559 case CallingConv::AMDGPU_HS:
560 case CallingConv::AMDGPU_GS:
561 // Low GIT address is passed in s8 rather than s0 for an LS+HS or
562 // ES+GS merged shader on gfx9+.
563 GitPtrLo = AMDGPU::SGPR8;
564 break;
565 default:
566 break;
// The low GIT address register is read below, so record it as a live-in.
569 MF.getRegInfo().addLiveIn(GitPtrLo);
570 MBB.addLiveIn(GitPtrLo);
571 BuildMI(MBB, I, DL, SMovB32, RsrcLo)
572 .addReg(GitPtrLo)
573 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
575 // We now have the GIT ptr - now get the scratch descriptor from the entry
576 // at offset 0 (or offset 16 for a compute shader).
577 PointerType *PtrTy =
578 PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
579 AMDGPUAS::CONSTANT_ADDRESS);
580 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
581 const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
582 auto MMO = MF.getMachineMemOperand(PtrInfo,
583 MachineMemOperand::MOLoad |
584 MachineMemOperand::MOInvariant |
585 MachineMemOperand::MODereferenceable,
586 16, 4);
587 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
588 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
589 unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset);
590 BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
591 .addReg(Rsrc01)
592 .addImm(EncodedOffset) // offset
593 .addImm(0) // glc
594 .addImm(0) // dlc
595 .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
596 .addMemOperand(MMO);
597 return;
599 if (ST.isMesaGfxShader(Fn)
600 || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
601 assert(!ST.isAmdHsaOrMesa(Fn));
602 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
604 Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
605 Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
607 // Use relocations to get the pointer, and setup the other bits manually.
608 uint64_t Rsrc23 = TII->getScratchRsrcWords23();
610 if (MFI->hasImplicitBufferPtr()) {
611 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
// For compute calling conventions the implicit buffer pointer SGPR is moved
// into dwords 0-1. Otherwise two dwords are loaded through it.
613 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
614 const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
616 BuildMI(MBB, I, DL, Mov64, Rsrc01)
617 .addReg(MFI->getImplicitBufferPtrUserSGPR())
618 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
619 } else {
620 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
622 PointerType *PtrTy =
623 PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
624 AMDGPUAS::CONSTANT_ADDRESS);
625 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
626 auto MMO = MF.getMachineMemOperand(PtrInfo,
627 MachineMemOperand::MOLoad |
628 MachineMemOperand::MOInvariant |
629 MachineMemOperand::MODereferenceable,
630 8, 4);
631 BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
632 .addReg(MFI->getImplicitBufferPtrUserSGPR())
633 .addImm(0) // offset
634 .addImm(0) // glc
635 .addImm(0) // dlc
636 .addMemOperand(MMO)
637 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
639 MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
640 MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
642 } else {
// Without an implicit buffer pointer, dwords 0-1 come from external symbols.
643 Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
644 Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
646 BuildMI(MBB, I, DL, SMovB32, Rsrc0)
647 .addExternalSymbol("SCRATCH_RSRC_DWORD0")
648 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
650 BuildMI(MBB, I, DL, SMovB32, Rsrc1)
651 .addExternalSymbol("SCRATCH_RSRC_DWORD1")
652 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
// Dwords 2 and 3 are the low and high 32 bits of Rsrc23.
656 BuildMI(MBB, I, DL, SMovB32, Rsrc2)
657 .addImm(Rsrc23 & 0xffffffff)
658 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
660 BuildMI(MBB, I, DL, SMovB32, Rsrc3)
661 .addImm(Rsrc23 >> 32)
662 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
666 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
667 switch (ID) {
668 case TargetStackID::Default:
669 case TargetStackID::NoAlloc:
670 case TargetStackID::SGPRSpill:
671 return true;
673 llvm_unreachable("Invalid TargetStackID::Value");
// Emits the prologue of MF at the start of MBB. Entry functions are handed to
// emitEntryFunctionPrologue. For all other functions the visible steps are:
//   - copy the frame pointer into SGPRForFPSaveRestoreCopy, if one is set;
//   - store every VGPR from getSGPRSpillVGPRs() that has a frame index, with
//     exec saved and set to all ones around the stores;
//   - write the frame pointer into a VGPR lane if FramePointerSaveIndex is
//     set;
//   - establish the frame pointer, realigning the stack when required;
//   - advance the stack pointer by the frame size when a frame pointer is in
//     use and the size is non-zero.
676 void SIFrameLowering::emitPrologue(MachineFunction &MF,
677 MachineBasicBlock &MBB) const {
678 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
679 if (FuncInfo->isEntryFunction()) {
680 emitEntryFunctionPrologue(MF, MBB);
681 return;
684 const MachineFrameInfo &MFI = MF.getFrameInfo();
685 MachineRegisterInfo &MRI = MF.getRegInfo();
686 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
687 const SIInstrInfo *TII = ST.getInstrInfo();
688 const SIRegisterInfo &TRI = TII->getRegisterInfo();
690 unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
691 unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
692 LivePhysRegs LiveRegs;
694 MachineBasicBlock::iterator MBBI = MBB.begin();
695 DebugLoc DL;
697 bool HasFP = false;
698 uint32_t NumBytes = MFI.getStackSize();
699 uint32_t RoundedSize = NumBytes;
700 // To avoid clobbering VGPRs in lanes that weren't active on function entry,
701 // turn on all lanes before doing the spill to memory.
702 unsigned ScratchExecCopy = AMDGPU::NoRegister;
704 // Emit the copy if we need an FP, and are using a free SGPR to save it.
705 if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
706 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
707 .addReg(FramePtrReg)
708 .setMIFlag(MachineInstr::FrameSetup);
711 for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
712 : FuncInfo->getSGPRSpillVGPRs()) {
713 if (!Reg.FI.hasValue())
714 continue;
// Exec is saved and forced to all ones once, before the first store.
716 if (ScratchExecCopy == AMDGPU::NoRegister) {
717 if (LiveRegs.empty()) {
718 LiveRegs.init(TRI);
719 LiveRegs.addLiveIns(MBB);
720 if (FuncInfo->SGPRForFPSaveRestoreCopy)
721 LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
724 ScratchExecCopy
725 = findScratchNonCalleeSaveRegister(MRI, LiveRegs,
726 *TRI.getWaveMaskRegClass());
727 assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy);
729 const unsigned OrSaveExec = ST.isWave32() ?
730 AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
731 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
732 ScratchExecCopy)
733 .addImm(-1);
736 buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
737 FuncInfo->getScratchRSrcReg(),
738 StackPtrReg,
739 Reg.FI.getValue());
// Put back the exec mask saved before the stores, if any store was emitted.
742 if (ScratchExecCopy != AMDGPU::NoRegister) {
743 // FIXME: Split block and make terminator.
744 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
745 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
746 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
747 .addReg(ScratchExecCopy, RegState::Kill);
748 LiveRegs.addReg(ScratchExecCopy);
752 if (FuncInfo->FramePointerSaveIndex) {
753 const int FI = FuncInfo->FramePointerSaveIndex.getValue();
754 assert(!MFI.isDeadObjectIndex(FI) &&
755 MFI.getStackID(FI) == TargetStackID::SGPRSpill);
756 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
757 = FuncInfo->getSGPRToVGPRSpills(FI);
758 assert(Spill.size() == 1);
760 // Save FP before setting it up.
761 // FIXME: This should respect spillSGPRToVGPR;
762 BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
763 Spill[0].VGPR)
764 .addReg(FramePtrReg)
765 .addImm(Spill[0].Lane)
766 .addReg(Spill[0].VGPR, RegState::Undef);
769 if (TRI.needsStackRealignment(MF)) {
770 HasFP = true;
771 const unsigned Alignment = MFI.getMaxAlignment();
// The frame size is padded by the maximum alignment when realigning.
773 RoundedSize += Alignment;
774 if (LiveRegs.empty()) {
775 LiveRegs.init(TRI);
776 LiveRegs.addLiveIns(MBB);
777 LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
780 unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(
781 MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
782 assert(ScratchSPReg != AMDGPU::NoRegister &&
783 ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
785 // s_add_u32 tmp_reg, s32, NumBytes
786 // s_and_b32 s32, tmp_reg, 0b111...0000
// Both immediates are multiplied by the wavefront size.
787 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
788 .addReg(StackPtrReg)
789 .addImm((Alignment - 1) * ST.getWavefrontSize())
790 .setMIFlag(MachineInstr::FrameSetup);
791 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
792 .addReg(ScratchSPReg, RegState::Kill)
793 .addImm(-Alignment * ST.getWavefrontSize())
794 .setMIFlag(MachineInstr::FrameSetup);
795 FuncInfo->setIsStackRealigned(true);
796 } else if ((HasFP = hasFP(MF))) {
797 // If we need a base pointer, set it up here. It's whatever the value of
798 // the stack pointer is at this point. Any variable size objects will be
799 // allocated after this, so we can still use the base pointer to reference
800 // locals.
801 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
802 .addReg(StackPtrReg)
803 .setMIFlag(MachineInstr::FrameSetup);
// The stack pointer only moves when a frame pointer is in use.
806 if (HasFP && RoundedSize != 0) {
807 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
808 .addReg(StackPtrReg)
809 .addImm(RoundedSize * ST.getWavefrontSize())
810 .setMIFlag(MachineInstr::FrameSetup);
// Consistency checks: the frame pointer is saved exactly when one is needed.
813 assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister ||
814 FuncInfo->FramePointerSaveIndex)) &&
815 "Needed to save FP but didn't save it anywhere");
817 assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister &&
818 !FuncInfo->FramePointerSaveIndex)) &&
819 "Saved FP but didn't need it");
822 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
823 MachineBasicBlock &MBB) const {
824 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
825 if (FuncInfo->isEntryFunction())
826 return;
828 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
829 const SIInstrInfo *TII = ST.getInstrInfo();
830 MachineRegisterInfo &MRI = MF.getRegInfo();
831 MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
832 LivePhysRegs LiveRegs;
833 DebugLoc DL;
835 const MachineFrameInfo &MFI = MF.getFrameInfo();
836 uint32_t NumBytes = MFI.getStackSize();
837 uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
838 NumBytes + MFI.getMaxAlignment() : NumBytes;
840 if (RoundedSize != 0 && hasFP(MF)) {
841 const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
842 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
843 .addReg(StackPtrReg)
844 .addImm(RoundedSize * ST.getWavefrontSize())
845 .setMIFlag(MachineInstr::FrameDestroy);
848 if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
849 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg())
850 .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
851 .setMIFlag(MachineInstr::FrameSetup);
854 if (FuncInfo->FramePointerSaveIndex) {
855 const int FI = FuncInfo->FramePointerSaveIndex.getValue();
857 assert(!MF.getFrameInfo().isDeadObjectIndex(FI) &&
858 MF.getFrameInfo().getStackID(FI) == TargetStackID::SGPRSpill);
860 ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
861 = FuncInfo->getSGPRToVGPRSpills(FI);
862 assert(Spill.size() == 1);
863 BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
864 FuncInfo->getFrameOffsetReg())
865 .addReg(Spill[0].VGPR)
866 .addImm(Spill[0].Lane);
869 unsigned ScratchExecCopy = AMDGPU::NoRegister;
870 for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
871 : FuncInfo->getSGPRSpillVGPRs()) {
872 if (!Reg.FI.hasValue())
873 continue;
875 const SIRegisterInfo &TRI = TII->getRegisterInfo();
876 if (ScratchExecCopy == AMDGPU::NoRegister) {
877 // See emitPrologue
878 if (LiveRegs.empty()) {
879 LiveRegs.init(*ST.getRegisterInfo());
880 LiveRegs.addLiveOuts(MBB);
881 LiveRegs.stepBackward(*MBBI);
884 ScratchExecCopy = findScratchNonCalleeSaveRegister(
885 MRI, LiveRegs, *TRI.getWaveMaskRegClass());
886 LiveRegs.removeReg(ScratchExecCopy);
888 const unsigned OrSaveExec =
889 ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
891 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
892 .addImm(-1);
895 buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
896 FuncInfo->getScratchRSrcReg(),
897 FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
900 if (ScratchExecCopy != AMDGPU::NoRegister) {
901 // FIXME: Split block and make terminator.
902 unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
903 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
904 BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
905 .addReg(ScratchExecCopy, RegState::Kill);
909 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
910 // memory. They should have been removed by now.
911 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
912 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
913 I != E; ++I) {
914 if (!MFI.isDeadObjectIndex(I))
915 return false;
918 return true;
921 #ifndef NDEBUG
922 static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
923 Optional<int> FramePointerSaveIndex) {
924 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
925 I != E; ++I) {
926 if (!MFI.isDeadObjectIndex(I) &&
927 MFI.getStackID(I) == TargetStackID::SGPRSpill &&
928 FramePointerSaveIndex && I != FramePointerSaveIndex) {
929 return false;
933 return true;
935 #endif
937 int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
938 unsigned &FrameReg) const {
939 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
941 FrameReg = RI->getFrameRegister(MF);
942 return MF.getFrameInfo().getObjectOffset(FI);
945 void SIFrameLowering::processFunctionBeforeFrameFinalized(
946 MachineFunction &MF,
947 RegScavenger *RS) const {
948 MachineFrameInfo &MFI = MF.getFrameInfo();
950 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
951 const SIRegisterInfo *TRI = ST.getRegisterInfo();
952 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
954 FuncInfo->removeDeadFrameIndices(MFI);
955 assert(allSGPRSpillsAreDead(MFI, None) &&
956 "SGPR spill should have been removed in SILowerSGPRSpills");
958 // FIXME: The other checks should be redundant with allStackObjectsAreDead,
959 // but currently hasNonSpillStackObjects is set only from source
960 // allocas. Stack temps produced from legalization are not counted currently.
961 if (!allStackObjectsAreDead(MFI)) {
962 assert(RS && "RegScavenger required if spilling");
964 if (FuncInfo->isEntryFunction()) {
965 int ScavengeFI = MFI.CreateFixedObject(
966 TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
967 RS->addScavengingFrameIndex(ScavengeFI);
968 } else {
969 int ScavengeFI = MFI.CreateStackObject(
970 TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
971 TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
972 false);
973 RS->addScavengingFrameIndex(ScavengeFI);
978 // Only report VGPRs to generic code.
979 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
980 BitVector &SavedVGPRs,
981 RegScavenger *RS) const {
982 TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
983 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
984 if (MFI->isEntryFunction())
985 return;
987 const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
988 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
989 const SIRegisterInfo *TRI = ST.getRegisterInfo();
991 // Ignore the SGPRs the default implementation found.
992 SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
994 // hasFP only knows about stack objects that already exist. We're now
995 // determining the stack slots that will be created, so we have to predict
996 // them. Stack objects force FP usage with calls.
998 // Note a new VGPR CSR may be introduced if one is used for the spill, but we
999 // don't want to report it here.
1001 // FIXME: Is this really hasReservedCallFrame?
1002 const bool WillHaveFP =
1003 FrameInfo.hasCalls() &&
1004 (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
1006 // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
1007 // so don't allow the default insertion to handle them.
1008 for (auto SSpill : MFI->getSGPRSpillVGPRs())
1009 SavedVGPRs.reset(SSpill.VGPR);
1011 const bool HasFP = WillHaveFP || hasFP(MF);
1012 if (!HasFP)
1013 return;
1015 if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
1016 int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
1017 TargetStackID::SGPRSpill);
1019 // If there is already a VGPR with free lanes, use it. We may already have
1020 // to pay the penalty for spilling a CSR VGPR.
1021 if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
1022 llvm_unreachable("allocate SGPR spill should have worked");
1024 MFI->FramePointerSaveIndex = NewFI;
1026 LLVM_DEBUG(
1027 auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
1028 dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
1029 << ':' << Spill.Lane << '\n');
1030 return;
1033 MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
1035 if (!MFI->SGPRForFPSaveRestoreCopy) {
1036 // There's no free lane to spill, and no free register to save FP, so we're
1037 // forced to spill another VGPR to use for the spill.
1038 int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
1039 TargetStackID::SGPRSpill);
1040 if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
1041 llvm_unreachable("allocate SGPR spill should have worked");
1042 MFI->FramePointerSaveIndex = NewFI;
1044 LLVM_DEBUG(
1045 auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
1046 dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
1047 << ':' << Spill.Lane << '\n';);
1048 } else {
1049 LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
1050 printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
1054 void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1055 BitVector &SavedRegs,
1056 RegScavenger *RS) const {
1057 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1058 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1059 if (MFI->isEntryFunction())
1060 return;
1062 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1063 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1065 // The SP is specifically managed and we don't want extra spills of it.
1066 SavedRegs.reset(MFI->getStackPtrOffsetReg());
1067 SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
1070 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1071 MachineFunction &MF, const TargetRegisterInfo *TRI,
1072 std::vector<CalleeSavedInfo> &CSI) const {
1073 if (CSI.empty())
1074 return true; // Early exit if no callee saved registers are modified!
1076 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1077 if (!FuncInfo->SGPRForFPSaveRestoreCopy)
1078 return false;
1080 for (auto &CS : CSI) {
1081 if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
1082 if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
1083 CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
1084 break;
1088 return false;
1091 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1092 MachineFunction &MF,
1093 MachineBasicBlock &MBB,
1094 MachineBasicBlock::iterator I) const {
1095 int64_t Amount = I->getOperand(0).getImm();
1096 if (Amount == 0)
1097 return MBB.erase(I);
1099 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1100 const SIInstrInfo *TII = ST.getInstrInfo();
1101 const DebugLoc &DL = I->getDebugLoc();
1102 unsigned Opc = I->getOpcode();
1103 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1104 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1106 if (!hasReservedCallFrame(MF)) {
1107 unsigned Align = getStackAlignment();
1109 Amount = alignTo(Amount, Align);
1110 assert(isUInt<32>(Amount) && "exceeded stack address space size");
1111 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1112 unsigned SPReg = MFI->getStackPtrOffsetReg();
1114 unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
1115 BuildMI(MBB, I, DL, TII->get(Op), SPReg)
1116 .addReg(SPReg)
1117 .addImm(Amount * ST.getWavefrontSize());
1118 } else if (CalleePopAmount != 0) {
1119 llvm_unreachable("is this used?");
1122 return MBB.erase(I);
1125 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1126 const MachineFrameInfo &MFI = MF.getFrameInfo();
1127 if (MFI.hasCalls()) {
1128 // All offsets are unsigned, so need to be addressed in the same direction
1129 // as stack growth.
1131 // FIXME: This function is pretty broken, since it can be called before the
1132 // frame layout is determined or CSR spills are inserted.
1133 if (MFI.getStackSize() != 0)
1134 return true;
1136 // For the entry point, the input wave scratch offset must be copied to the
1137 // API SP if there are calls.
1138 if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
1139 return true;
1142 return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
1143 MFI.hasStackMap() || MFI.hasPatchPoint() ||
1144 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
1145 MF.getTarget().Options.DisableFramePointerElim(MF);