Source listing: llvm-project — llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
(blob 8a315aa8227866b9e8fcec56a2ec3b9d3371d224).
Note: the commit title "Revert [LoongArch][ISel] Check the number of sign bits
in `PatGprGpr_32` (#107432)" shown by the web viewer belongs to an unrelated
LoongArch change and does not describe this file's contents.
1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI implementation of the TargetRegisterInfo class.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPU.h"
15 #include "AMDGPURegisterBankInfo.h"
16 #include "GCNSubtarget.h"
17 #include "MCTargetDesc/AMDGPUInstPrinter.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "SIRegisterInfo.h"
21 #include "llvm/CodeGen/LiveIntervals.h"
22 #include "llvm/CodeGen/LiveRegUnits.h"
23 #include "llvm/CodeGen/MachineDominators.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/RegisterScavenging.h"
27 using namespace llvm;
29 #define GET_REGINFO_TARGET_DESC
30 #include "AMDGPUGenRegisterInfo.inc"
32 static cl::opt<bool> EnableSpillSGPRToVGPR(
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
35 cl::ReallyHidden,
36 cl::init(true));
// Definitions of the class-static lookup tables. Both are lazily populated
// (via llvm::call_once) in the SIRegisterInfo constructor below.
38 std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
// Indexed by [width index][channel]; see SubRegFromChannelTableWidthMap for
// how a DWORD count maps to the first index.
39 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
48 namespace llvm {
50 // A temporary struct to spill SGPRs.
51 // This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
52 // just v_writelane and v_readlane.
54 // When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
55 // is saved to scratch (or the other way around for loads).
56 // For this, a VGPR is required where the needed lanes can be clobbered. The
57 // RegScavenger can provide a VGPR where currently active lanes can be
58 // clobbered, but we still need to save inactive lanes.
59 // The high-level steps are:
60 // - Try to scavenge SGPR(s) to save exec
61 // - Try to scavenge VGPR
62 // - Save needed, all or inactive lanes of a TmpVGPR
63 // - Spill/Restore SGPRs using TmpVGPR
64 // - Restore TmpVGPR
66 // To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
67 // cannot scavenge temporary SGPRs to save exec, we use the following code:
68 // buffer_store_dword TmpVGPR ; only if active lanes need to be saved
69 // s_not exec, exec
70 // buffer_store_dword TmpVGPR ; save inactive lanes
71 // s_not exec, exec
// Helper that orchestrates spilling/restoring an SGPR (tuple) through a
// temporary VGPR, optionally round-tripping that VGPR through scratch memory.
// NOTE(review): brace-only lines appear to have been dropped when this listing
// was extracted; the code is kept byte-identical to the input.
72 struct SGPRSpillBuilder {
// How the NumSubRegs 32-bit parts of the spilled SGPR tuple are laid out
// across VGPR lanes.
73 struct PerVGPRData {
74 unsigned PerVGPR;
75 unsigned NumVGPRs;
76 int64_t VGPRLanes;
79 // The SGPR to save
80 Register SuperReg;
81 MachineBasicBlock::iterator MI;
82 ArrayRef<int16_t> SplitParts;
83 unsigned NumSubRegs;
84 bool IsKill;
85 const DebugLoc &DL;
87 /* When spilling to stack */
88 // The SGPRs are written into this VGPR, which is then written to scratch
89 // (or vice versa for loads).
90 Register TmpVGPR = AMDGPU::NoRegister;
91 // Temporary spill slot to save TmpVGPR to.
92 int TmpVGPRIndex = 0;
93 // If TmpVGPR is live before the spill or if it is scavenged.
94 bool TmpVGPRLive = false;
95 // Scavenged SGPR to save EXEC.
96 Register SavedExecReg = AMDGPU::NoRegister;
97 // Stack index to write the SGPRs to.
98 int Index;
99 unsigned EltSize = 4;
101 RegScavenger *RS;
102 MachineBasicBlock *MBB;
103 MachineFunction &MF;
104 SIMachineFunctionInfo &MFI;
105 const SIInstrInfo &TII;
106 const SIRegisterInfo &TRI;
107 bool IsWave32;
108 Register ExecReg;
109 unsigned MovOpc;
110 unsigned NotOpc;
// Convenience constructor: the spilled register and its kill flag are taken
// from operand 0 of MI.
112 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
113 bool IsWave32, MachineBasicBlock::iterator MI, int Index,
114 RegScavenger *RS)
115 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
116 MI->getOperand(0).isKill(), Index, RS) {}
// Main constructor: splits SuperReg into 32-bit parts and selects the
// wave32/wave64 variants of the exec/mov/not opcodes.
118 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
119 bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
120 bool IsKill, int Index, RegScavenger *RS)
121 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
122 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
123 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
124 IsWave32(IsWave32) {
125 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
126 SplitParts = TRI.getRegSplitParts(RC, EltSize);
127 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
129 if (IsWave32) {
130 ExecReg = AMDGPU::EXEC_LO;
131 MovOpc = AMDGPU::S_MOV_B32;
132 NotOpc = AMDGPU::S_NOT_B32;
133 } else {
134 ExecReg = AMDGPU::EXEC;
135 MovOpc = AMDGPU::S_MOV_B64;
136 NotOpc = AMDGPU::S_NOT_B64;
139 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
140 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
141 SuperReg != AMDGPU::EXEC && "exec should never spill");
// Compute how many VGPRs are needed and which lane mask each one uses for
// the NumSubRegs 32-bit parts.
144 PerVGPRData getPerVGPRData() {
145 PerVGPRData Data;
146 Data.PerVGPR = IsWave32 ? 32 : 64;
147 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
148 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
149 return Data;
152 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
153 // free.
154 // Writes these instructions if an SGPR can be scavenged:
155 // s_mov_b64 s[6:7], exec ; Save exec
156 // s_mov_b64 exec, 3 ; Wanted lanemask
157 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
159 // Writes these instructions if no SGPR can be scavenged:
160 // buffer_store_dword v0 ; Only if no free VGPR was found
161 // s_not_b64 exec, exec
162 // buffer_store_dword v0 ; Save inactive lanes
163 // ; exec stays inverted, it is flipped back in
164 // ; restore.
165 void prepare() {
166 // Scavenged temporary VGPR to use. It must be scavenged once for any number
167 // of spilled subregs.
168 // FIXME: The liveness analysis is limited and does not tell if a register
169 // is in use in lanes that are currently inactive. We can never be sure if
170 // a register as actually in use in another lane, so we need to save all
171 // used lanes of the chosen VGPR.
172 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
173 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
174 0, false);
176 // Reserve temporary stack slot
177 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
178 if (TmpVGPR) {
179 // Found a register that is dead in the currently active lanes, we only
180 // need to spill inactive lanes.
181 TmpVGPRLive = false;
182 } else {
183 // Pick v0 because it doesn't make a difference.
184 TmpVGPR = AMDGPU::VGPR0;
185 TmpVGPRLive = true;
188 if (TmpVGPRLive) {
189 // We need to inform the scavenger that this index is already in use until
190 // we're done with the custom emergency spill.
191 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
194 // We may end up recursively calling the scavenger, and don't want to re-use
195 // the same register.
196 RS->setRegUsed(TmpVGPR);
198 // Try to scavenge SGPRs to save exec
199 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
200 const TargetRegisterClass &RC =
201 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
202 RS->setRegUsed(SuperReg);
203 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
205 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
207 if (SavedExecReg) {
208 RS->setRegUsed(SavedExecReg);
209 // Set exec to needed lanes
210 BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
211 auto I =
212 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
213 if (!TmpVGPRLive)
214 I.addReg(TmpVGPR, RegState::ImplicitDefine);
215 // Spill needed lanes
216 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
217 } else {
218 // The modify and restore of exec clobber SCC, which we would have to save
219 // and restore. FIXME: We probably would need to reserve a register for
220 // this.
221 if (RS->isRegUsed(AMDGPU::SCC))
222 MI->emitError("unhandled SGPR spill to memory");
224 // Spill active lanes
225 if (TmpVGPRLive)
226 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
227 /*IsKill*/ false);
228 // Spill inactive lanes
229 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
230 if (!TmpVGPRLive)
231 I.addReg(TmpVGPR, RegState::ImplicitDefine);
232 I->getOperand(2).setIsDead(); // Mark SCC as dead.
233 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
237 // Writes these instructions if an SGPR can be scavenged:
238 // buffer_load_dword v1 ; Write scavenged VGPR to emergency slot
239 // s_waitcnt vmcnt(0) ; If a free VGPR was found
240 // s_mov_b64 exec, s[6:7] ; Save exec
242 // Writes these instructions if no SGPR can be scavenged:
243 // buffer_load_dword v0 ; Restore inactive lanes
244 // s_waitcnt vmcnt(0) ; If a free VGPR was found
245 // s_not_b64 exec, exec
246 // buffer_load_dword v0 ; Only if no free VGPR was found
247 void restore() {
248 if (SavedExecReg) {
249 // Restore used lanes
250 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
251 /*IsKill*/ false);
252 // Restore exec
253 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
254 .addReg(SavedExecReg, RegState::Kill);
255 // Add an implicit use of the load so it is not dead.
256 // FIXME This inserts an unnecessary waitcnt
257 if (!TmpVGPRLive) {
258 I.addReg(TmpVGPR, RegState::ImplicitKill);
260 } else {
261 // Restore inactive lanes
262 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
263 /*IsKill*/ false);
264 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
265 if (!TmpVGPRLive)
266 I.addReg(TmpVGPR, RegState::ImplicitKill);
267 I->getOperand(2).setIsDead(); // Mark SCC as dead.
269 // Restore active lanes
270 if (TmpVGPRLive)
271 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
274 // Inform the scavenger where we're releasing our custom scavenged register.
275 if (TmpVGPRLive) {
276 MachineBasicBlock::iterator RestorePt = std::prev(MI);
277 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
281 // Write TmpVGPR to memory or read TmpVGPR from memory.
282 // Either using a single buffer_load/store if exec is set to the needed mask
283 // or using
284 // buffer_load
285 // s_not exec, exec
286 // buffer_load
287 // s_not exec, exec
288 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
289 if (SavedExecReg) {
290 // Spill needed lanes
291 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
292 } else {
293 // The modify and restore of exec clobber SCC, which we would have to save
294 // and restore. FIXME: We probably would need to reserve a register for
295 // this.
296 if (RS->isRegUsed(AMDGPU::SCC))
297 MI->emitError("unhandled SGPR spill to memory");
299 // Spill active lanes
300 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
301 /*IsKill*/ false);
302 // Spill inactive lanes
303 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
304 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
305 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
306 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
307 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
// Re-anchor the builder at a new insertion point (must stay within MF).
311 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
312 assert(MBB->getParent() == &MF);
313 MI = NewMI;
314 MBB = NewMBB;
318 } // namespace llvm
// Construct the SI register info: hands the PC register and DWARF flavour to
// the generated base class, records pressure-ignored units, and lazily fills
// the static RegSplitParts / SubRegFromChannelTable lookup tables exactly once
// per process via llvm::call_once.
// NOTE(review): brace-only lines appear to have been dropped when this listing
// was extracted; the code is kept byte-identical to the input.
320 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
321 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
322 ST.getAMDGPUDwarfFlavour()),
323 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
// Validate assumptions about the tablegen'd subregister lane masks that
// getNumCoveredRegs() depends on.
325 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
326 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
327 (getSubRegIndexLaneMask(AMDGPU::lo16) |
328 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
329 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
330 "getNumCoveredRegs() will not work with generated subreg masks!");
// M0 and the high halves of 16-bit VGPRs do not contribute to register
// pressure tracking.
332 RegPressureIgnoredUnits.resize(getNumRegUnits());
333 RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
334 for (auto Reg : AMDGPU::VGPR_16RegClass) {
335 if (AMDGPU::isHi(Reg, *this))
336 RegPressureIgnoredUnits.set(*regunits(Reg).begin());
339 // HACK: Until this is fully tablegen'd.
340 static llvm::once_flag InitializeRegSplitPartsFlag;
// For every 32-bit-multiple subregister index, record it at
// RegSplitParts[Size/32 - 1][Offset/Size] so tuples can be split into
// equally-sized parts.
342 static auto InitializeRegSplitPartsOnce = [this]() {
343 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
344 unsigned Size = getSubRegIdxSize(Idx);
345 if (Size & 31)
346 continue;
347 std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
348 unsigned Pos = getSubRegIdxOffset(Idx);
349 if (Pos % Size)
350 continue;
351 Pos /= Size;
352 if (Vec.empty()) {
353 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
354 Vec.resize(MaxNumParts);
356 Vec[Pos] = Idx;
360 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
// Inverse mapping: for each (width, channel) pair record the subregister
// index, using SubRegFromChannelTableWidthMap to compress the width axis.
362 static auto InitializeSubRegFromChannelTableOnce = [this]() {
363 for (auto &Row : SubRegFromChannelTable)
364 Row.fill(AMDGPU::NoSubRegister);
365 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
366 unsigned Width = getSubRegIdxSize(Idx) / 32;
367 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
368 assert(Width < SubRegFromChannelTableWidthMap.size());
369 Width = SubRegFromChannelTableWidthMap[Width];
370 if (Width == 0)
371 continue;
372 unsigned TableIdx = Width - 1;
373 assert(TableIdx < SubRegFromChannelTable.size());
374 assert(Offset < SubRegFromChannelTable[TableIdx].size());
375 SubRegFromChannelTable[TableIdx][Offset] = Idx;
379 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
380 llvm::call_once(InitializeSubRegFromChannelTableFlag,
381 InitializeSubRegFromChannelTableOnce);
384 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
385 MCRegister Reg) const {
386 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
387 Reserved.set(*R);
390 // Forced to be here by one .inc
391 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
392 const MachineFunction *MF) const {
393 CallingConv::ID CC = MF->getFunction().getCallingConv();
394 switch (CC) {
395 case CallingConv::C:
396 case CallingConv::Fast:
397 case CallingConv::Cold:
398 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
399 : CSR_AMDGPU_SaveList;
400 case CallingConv::AMDGPU_Gfx:
401 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
402 : CSR_AMDGPU_SI_Gfx_SaveList;
403 case CallingConv::AMDGPU_CS_ChainPreserve:
404 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
405 default: {
406 // Dummy to not crash RegisterClassInfo.
407 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
408 return &NoCalleeSavedReg;
413 const MCPhysReg *
414 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
415 return nullptr;
418 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
419 CallingConv::ID CC) const {
420 switch (CC) {
421 case CallingConv::C:
422 case CallingConv::Fast:
423 case CallingConv::Cold:
424 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
425 : CSR_AMDGPU_RegMask;
426 case CallingConv::AMDGPU_Gfx:
427 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
428 : CSR_AMDGPU_SI_Gfx_RegMask;
429 case CallingConv::AMDGPU_CS_Chain:
430 case CallingConv::AMDGPU_CS_ChainPreserve:
431 // Calls to these functions never return, so we can pretend everything is
432 // preserved.
433 return AMDGPU_AllVGPRs_RegMask;
434 default:
435 return nullptr;
439 const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
440 return CSR_AMDGPU_NoRegs_RegMask;
443 bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
444 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
// Map a VGPR or AGPR class to the combined AV superclass of the same size on
// subtargets with MAI instructions; otherwise defer to the generic hook.
447 const TargetRegisterClass *
448 SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
449 const MachineFunction &MF) const {
450 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
451 // equivalent AV class. If used one, the verifier will crash after
452 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given
453 // until Instruction selection.
454 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
// Exhaustive size-by-size mapping (32..1024 bits, plus Align2 variants).
455 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
456 return &AMDGPU::AV_32RegClass;
457 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
458 return &AMDGPU::AV_64RegClass;
459 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
460 RC == &AMDGPU::AReg_64_Align2RegClass)
461 return &AMDGPU::AV_64_Align2RegClass;
462 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
463 return &AMDGPU::AV_96RegClass;
464 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
465 RC == &AMDGPU::AReg_96_Align2RegClass)
466 return &AMDGPU::AV_96_Align2RegClass;
467 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
468 return &AMDGPU::AV_128RegClass;
469 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
470 RC == &AMDGPU::AReg_128_Align2RegClass)
471 return &AMDGPU::AV_128_Align2RegClass;
472 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
473 return &AMDGPU::AV_160RegClass;
474 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
475 RC == &AMDGPU::AReg_160_Align2RegClass)
476 return &AMDGPU::AV_160_Align2RegClass;
477 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
478 return &AMDGPU::AV_192RegClass;
479 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
480 RC == &AMDGPU::AReg_192_Align2RegClass)
481 return &AMDGPU::AV_192_Align2RegClass;
482 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
483 return &AMDGPU::AV_256RegClass;
484 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
485 RC == &AMDGPU::AReg_256_Align2RegClass)
486 return &AMDGPU::AV_256_Align2RegClass;
487 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
488 return &AMDGPU::AV_512RegClass;
489 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
490 RC == &AMDGPU::AReg_512_Align2RegClass)
491 return &AMDGPU::AV_512_Align2RegClass;
492 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
493 return &AMDGPU::AV_1024RegClass;
494 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
495 RC == &AMDGPU::AReg_1024_Align2RegClass)
496 return &AMDGPU::AV_1024_Align2RegClass;
// Fall back to the target-independent implementation for everything else.
499 return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
502 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
503 const SIFrameLowering *TFI = ST.getFrameLowering();
504 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
505 // During ISel lowering we always reserve the stack pointer in entry and chain
506 // functions, but never actually want to reference it when accessing our own
507 // frame. If we need a frame pointer we use it, but otherwise we can just use
508 // an immediate "0" which we represent by returning NoRegister.
509 if (FuncInfo->isBottomOfStack()) {
510 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
512 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
513 : FuncInfo->getStackPtrOffsetReg();
516 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
517 // When we need stack realignment, we can't reference off of the
518 // stack pointer, so we reserve a base pointer.
519 const MachineFrameInfo &MFI = MF.getFrameInfo();
520 return MFI.getNumFixedObjects() && shouldRealignStack(MF);
523 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
525 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
526 return AMDGPU_AllVGPRs_RegMask;
529 const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
530 return AMDGPU_AllAGPRs_RegMask;
533 const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
534 return AMDGPU_AllVectorRegs_RegMask;
537 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
538 return AMDGPU_AllAllocatableSRegs_RegMask;
541 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
542 unsigned NumRegs) {
543 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
544 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
545 assert(NumRegIndex && "Not implemented");
546 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
547 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
550 MCRegister
551 SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
552 const unsigned Align,
553 const TargetRegisterClass *RC) const {
554 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
555 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
556 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
559 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
560 const MachineFunction &MF) const {
561 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
// Build the set of registers the allocator must never use for this function:
// special-purpose registers, SGPRs/VGPRs/AGPRs above the wave's budget, and
// registers claimed by frame lowering and spilling.
// NOTE(review): brace-only lines appear to have been dropped when this listing
// was extracted; the code is kept byte-identical to the input.
564 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
565 BitVector Reserved(getNumRegs());
566 Reserved.set(AMDGPU::MODE);
568 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
570 // Reserve special purpose registers.
572 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
573 // this seems likely to result in bugs, so I'm marking them as reserved.
574 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
575 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
577 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
578 reserveRegisterTuples(Reserved, AMDGPU::M0);
580 // Reserve src_vccz, src_execz, src_scc.
581 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
582 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
583 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
585 // Reserve the memory aperture registers
586 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
587 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
588 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
589 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
591 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
592 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
594 // Reserve xnack_mask registers - support is not implemented in Codegen.
595 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
597 // Reserve lds_direct register - support is not implemented in Codegen.
598 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
600 // Reserve Trap Handler registers - support is not implemented in Codegen.
601 reserveRegisterTuples(Reserved, AMDGPU::TBA);
602 reserveRegisterTuples(Reserved, AMDGPU::TMA);
603 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
604 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
605 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
606 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
607 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
608 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
609 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
610 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
612 // Reserve null register - it shall never be allocated
613 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
615 // Reserve SGPRs.
// Any SGPR tuple extending past this function's SGPR budget is off-limits.
617 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
618 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
619 for (const TargetRegisterClass *RC : regclasses()) {
620 if (RC->isBaseClass() && isSGPRClass(RC)) {
621 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
622 for (MCPhysReg Reg : *RC) {
623 unsigned Index = getHWRegIndex(Reg);
624 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
625 Reserved.set(Reg);
630 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
631 if (ScratchRSrcReg != AMDGPU::NoRegister) {
632 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
633 // need to spill.
634 // TODO: May need to reserve a VGPR if doing LDS spilling.
635 reserveRegisterTuples(Reserved, ScratchRSrcReg);
638 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
639 if (LongBranchReservedReg)
640 reserveRegisterTuples(Reserved, LongBranchReservedReg);
642 // We have to assume the SP is needed in case there are calls in the function,
643 // which is detected after the function is lowered. If we aren't really going
644 // to need SP, don't bother reserving it.
645 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
646 if (StackPtrReg) {
647 reserveRegisterTuples(Reserved, StackPtrReg);
648 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
651 MCRegister FrameReg = MFI->getFrameOffsetReg();
652 if (FrameReg) {
653 reserveRegisterTuples(Reserved, FrameReg);
654 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
657 if (hasBasePointer(MF)) {
658 MCRegister BasePtrReg = getBaseRegister();
659 reserveRegisterTuples(Reserved, BasePtrReg);
660 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
663 // FIXME: Use same reserved register introduced in D149775
664 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
665 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
666 if (ExecCopyReg)
667 reserveRegisterTuples(Reserved, ExecCopyReg);
669 // Reserve VGPRs/AGPRs.
671 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
672 unsigned MaxNumAGPRs = MaxNumVGPRs;
673 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
675 // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
676 // a wave may have up to 512 total vector registers combining together both
677 // VGPRs and AGPRs. Hence, in an entry function without calls and without
678 // AGPRs used within it, it is possible to use the whole vector register
679 // budget for VGPRs.
681 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
682 // register file accordingly.
683 if (ST.hasGFX90AInsts()) {
684 if (MFI->usesAGPRs(MF)) {
685 MaxNumVGPRs /= 2;
686 MaxNumAGPRs = MaxNumVGPRs;
687 } else {
688 if (MaxNumVGPRs > TotalNumVGPRs) {
689 MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
690 MaxNumVGPRs = TotalNumVGPRs;
691 } else
692 MaxNumAGPRs = 0;
696 for (const TargetRegisterClass *RC : regclasses()) {
697 if (RC->isBaseClass() && isVGPRClass(RC)) {
698 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
699 for (MCPhysReg Reg : *RC) {
700 unsigned Index = getHWRegIndex(Reg);
701 if (Index + NumRegs > MaxNumVGPRs)
702 Reserved.set(Reg);
707 // Reserve all the AGPRs if there are no instructions to use it.
708 if (!ST.hasMAIInsts())
709 MaxNumAGPRs = 0;
710 for (const TargetRegisterClass *RC : regclasses()) {
711 if (RC->isBaseClass() && isAGPRClass(RC)) {
712 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
713 for (MCPhysReg Reg : *RC) {
714 unsigned Index = getHWRegIndex(Reg);
715 if (Index + NumRegs > MaxNumAGPRs)
716 Reserved.set(Reg);
721 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
722 // VGPR available at all times.
723 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
724 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
727 for (Register Reg : MFI->getWWMReservedRegs())
728 reserveRegisterTuples(Reserved, Reg);
730 // FIXME: Stop using reserved registers for this.
731 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
732 reserveRegisterTuples(Reserved, Reg);
734 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
735 reserveRegisterTuples(Reserved, Reg);
737 return Reserved;
740 bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
741 MCRegister PhysReg) const {
742 return !MF.getRegInfo().isReserved(PhysReg);
745 bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
746 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
747 // On entry or in chain functions, the base address is 0, so it can't possibly
748 // need any more alignment.
750 // FIXME: Should be able to specify the entry frame alignment per calling
751 // convention instead.
752 if (Info->isBottomOfStack())
753 return false;
755 return TargetRegisterInfo::shouldRealignStack(MF);
758 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
759 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
760 if (Info->isEntryFunction()) {
761 const MachineFrameInfo &MFI = Fn.getFrameInfo();
762 return MFI.hasStackObjects() || MFI.hasCalls();
765 // May need scavenger for dealing with callee saved registers.
766 return true;
769 bool SIRegisterInfo::requiresFrameIndexScavenging(
770 const MachineFunction &MF) const {
771 // Do not use frame virtual registers. They used to be used for SGPRs, but
772 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
773 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
774 // spill.
775 return false;
778 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
779 const MachineFunction &MF) const {
780 const MachineFrameInfo &MFI = MF.getFrameInfo();
781 return MFI.hasStackObjects();
784 bool SIRegisterInfo::requiresVirtualBaseRegisters(
785 const MachineFunction &) const {
786 // There are no special dedicated stack or frame pointers.
787 return true;
790 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
791 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
793 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
794 AMDGPU::OpName::offset);
795 return MI->getOperand(OffIdx).getImm();
798 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
799 int Idx) const {
800 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
801 return 0;
803 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
804 AMDGPU::OpName::vaddr) ||
805 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
806 AMDGPU::OpName::saddr))) &&
807 "Should never see frame index on non-address operand");
809 return getScratchInstrOffset(MI);
812 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
813 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
814 return false;
816 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
818 const SIInstrInfo *TII = ST.getInstrInfo();
819 if (SIInstrInfo::isMUBUF(*MI))
820 return !TII->isLegalMUBUFImmOffset(FullOffset);
822 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
823 SIInstrFlags::FlatScratch);
// Materialize a virtual register at the top of \p MBB holding FrameIdx plus
// \p Offset, for use as a base register by out-of-range scratch accesses.
// With flat scratch the base is scalar (S_MOV/S_ADD); otherwise it is a VGPR.
// NOTE(review): brace-only lines appear to have been dropped when this listing
// was extracted; the code is kept byte-identical to the input.
826 Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
827 int FrameIdx,
828 int64_t Offset) const {
829 MachineBasicBlock::iterator Ins = MBB->begin();
830 DebugLoc DL; // Defaults to "unknown"
832 if (Ins != MBB->end())
833 DL = Ins->getDebugLoc();
835 MachineFunction *MF = MBB->getParent();
836 const SIInstrInfo *TII = ST.getInstrInfo();
837 MachineRegisterInfo &MRI = MF->getRegInfo();
838 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
839 : AMDGPU::V_MOV_B32_e32;
841 Register BaseReg = MRI.createVirtualRegister(
842 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
843 : &AMDGPU::VGPR_32RegClass);
// Zero offset: a single move of the frame index is enough.
845 if (Offset == 0) {
846 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
847 .addFrameIndex(FrameIdx);
848 return BaseReg;
851 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
853 Register FIReg = MRI.createVirtualRegister(
854 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
855 : &AMDGPU::VGPR_32RegClass);
// Materialize the constant offset and the frame index separately, then add.
857 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
858 .addImm(Offset);
859 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
860 .addFrameIndex(FrameIdx);
862 if (ST.enableFlatScratch() ) {
863 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
864 .addReg(OffsetReg, RegState::Kill)
865 .addReg(FIReg);
866 return BaseReg;
// Vector path: use an add without carry (clamp bit cleared).
869 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
870 .addReg(OffsetReg, RegState::Kill)
871 .addReg(FIReg)
872 .addImm(0); // clamp bit
874 return BaseReg;
// Rewrite the frame-index address operand of the scratch access \p MI to use
// the materialized base register \p BaseReg, folding \p Offset into the
// instruction's immediate offset. The resulting offset must already be legal
// for the instruction's addressing form (asserted, not re-legalized here).
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                       int64_t Offset) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool IsFlat = TII->isFLATScratch(MI);

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO: MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  // Flat scratch addresses the frame through saddr; MUBUF through vaddr.
  MachineOperand *FIOp =
      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
                                      : AMDGPU::OpName::vaddr);

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;

  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));

  if (IsFlat) {
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                  SIInstrFlags::FlatScratch) &&
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

#ifndef NDEBUG
  // MUBUF path: the scalar offset must not already carry a value.
  MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  assert(SOffset->isImm() && SOffset->getImm() == 0);
#endif

  assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}
925 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
926 Register BaseReg,
927 int64_t Offset) const {
928 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
929 return false;
931 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
933 const SIInstrInfo *TII = ST.getInstrInfo();
934 if (SIInstrInfo::isMUBUF(*MI))
935 return TII->isLegalMUBUFImmOffset(NewOffset);
937 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
938 SIInstrFlags::FlatScratch);
// Register class used for pointer values of the given \p Kind. Always answers
// VGPR_32 — see the caveat below about why that is acceptable.
const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
  const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}
// Return the intermediate register class to stage a copy through when a value
// in \p RC cannot be copied directly.
const TargetRegisterClass *
SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
  // Pre-gfx90a, stage AGPR copies through the equivalent VGPR class.
  if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
    return getEquivalentVGPRClass(RC);
  // SCC is moved via a wave mask register.
  if (RC == &AMDGPU::SCC_CLASSRegClass)
    return getWaveMaskRegClass();

  return RC;
}
// Map an SI_SPILL_* save/restore pseudo opcode to the number of 32-bit
// sub-registers it covers (i.e. the spilled register's width in DWORDs).
// Covers the S (SGPR), V (VGPR), A (AGPR), AV and WWM variants.
static unsigned getNumSubRegsForSpillOp(unsigned Op) {

  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
  case AMDGPU::SI_SPILL_AV1024_SAVE:
  case AMDGPU::SI_SPILL_AV1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_AV512_SAVE:
  case AMDGPU::SI_SPILL_AV512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S384_SAVE:
  case AMDGPU::SI_SPILL_S384_RESTORE:
  case AMDGPU::SI_SPILL_V384_SAVE:
  case AMDGPU::SI_SPILL_V384_RESTORE:
  case AMDGPU::SI_SPILL_A384_SAVE:
  case AMDGPU::SI_SPILL_A384_RESTORE:
  case AMDGPU::SI_SPILL_AV384_SAVE:
  case AMDGPU::SI_SPILL_AV384_RESTORE:
    return 12;
  case AMDGPU::SI_SPILL_S352_SAVE:
  case AMDGPU::SI_SPILL_S352_RESTORE:
  case AMDGPU::SI_SPILL_V352_SAVE:
  case AMDGPU::SI_SPILL_V352_RESTORE:
  case AMDGPU::SI_SPILL_A352_SAVE:
  case AMDGPU::SI_SPILL_A352_RESTORE:
  case AMDGPU::SI_SPILL_AV352_SAVE:
  case AMDGPU::SI_SPILL_AV352_RESTORE:
    return 11;
  case AMDGPU::SI_SPILL_S320_SAVE:
  case AMDGPU::SI_SPILL_S320_RESTORE:
  case AMDGPU::SI_SPILL_V320_SAVE:
  case AMDGPU::SI_SPILL_V320_RESTORE:
  case AMDGPU::SI_SPILL_A320_SAVE:
  case AMDGPU::SI_SPILL_A320_RESTORE:
  case AMDGPU::SI_SPILL_AV320_SAVE:
  case AMDGPU::SI_SPILL_AV320_RESTORE:
    return 10;
  case AMDGPU::SI_SPILL_S288_SAVE:
  case AMDGPU::SI_SPILL_S288_RESTORE:
  case AMDGPU::SI_SPILL_V288_SAVE:
  case AMDGPU::SI_SPILL_V288_RESTORE:
  case AMDGPU::SI_SPILL_A288_SAVE:
  case AMDGPU::SI_SPILL_A288_RESTORE:
  case AMDGPU::SI_SPILL_AV288_SAVE:
  case AMDGPU::SI_SPILL_AV288_RESTORE:
    return 9;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
  case AMDGPU::SI_SPILL_AV256_SAVE:
  case AMDGPU::SI_SPILL_AV256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_V224_SAVE:
  case AMDGPU::SI_SPILL_V224_RESTORE:
  case AMDGPU::SI_SPILL_A224_SAVE:
  case AMDGPU::SI_SPILL_A224_RESTORE:
  case AMDGPU::SI_SPILL_AV224_SAVE:
  case AMDGPU::SI_SPILL_AV224_RESTORE:
    return 7;
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
  case AMDGPU::SI_SPILL_AV192_SAVE:
  case AMDGPU::SI_SPILL_AV192_RESTORE:
    return 6;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
  case AMDGPU::SI_SPILL_AV160_SAVE:
  case AMDGPU::SI_SPILL_AV160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_AV128_SAVE:
  case AMDGPU::SI_SPILL_AV128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
  case AMDGPU::SI_SPILL_AV96_SAVE:
  case AMDGPU::SI_SPILL_AV96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_AV64_SAVE:
  case AMDGPU::SI_SPILL_AV64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_AV32_SAVE:
  case AMDGPU::SI_SPILL_AV32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_V32_SAVE:
  case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
  case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}
// Map an OFFEN MUBUF store opcode (VGPR-indexed address) to its OFFSET form
// (immediate-only address). Returns -1 for opcodes with no such mapping.
static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}
// Map an OFFEN MUBUF load opcode (VGPR-indexed address) to its OFFSET form
// (immediate-only address). Returns -1 for opcodes with no such mapping.
static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}
// Inverse of getOffsetMUBUFStore: map an OFFSET (immediate-only) MUBUF store
// opcode to its OFFEN (VGPR-indexed) form, or -1 if unhandled.
static int getOffenMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
  default:
    return -1;
  }
}
// Inverse of getOffsetMUBUFLoad: map an OFFSET (immediate-only) MUBUF load
// opcode to its OFFEN (VGPR-indexed) form, or -1 if unhandled.
static int getOffenMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
  default:
    return -1;
  }
}
// Try to service one 32-bit lane of a spill/reload by copying between
// ValueReg and the AGPR (or VGPR) previously assigned for (Index, Lane),
// avoiding a memory access entirely. Returns a null MachineInstrBuilder when
// no register is assigned for this slot/lane, in which case the caller must
// fall back to a real scratch access.
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           int Index, unsigned Lane,
                                           unsigned ValueReg, bool IsKill) {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  // A store writes the assigned register; a load reads it back into ValueReg.
  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  bool IsVGPR = TRI->isVGPR(MRI, Reg);
  DebugLoc DL = MI->getDebugLoc();
  if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
    // Spiller during regalloc may restore a spilled register to its superclass.
    // It could result in AGPR spills restored to VGPRs or the other way around,
    // making the src and dst with identical regclasses at this point. It just
    // needs a copy in such cases.
    auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
                       .addReg(Src, getKillRegState(IsKill));
    CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    return CopyMIB;
  }
  // Crossing banks: WRITE moves VGPR->AGPR, READ moves AGPR->VGPR. The XOR
  // selects the opcode based on which side of the copy the AGPR sits on.
  unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
                                    : AMDGPU::V_ACCVGPR_READ_B32_e64;

  auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
                 .addReg(Src, getKillRegState(IsKill));
  MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
  return MIB;
}
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
//
// Rewrites the OFFEN MUBUF access at \p MI into its OFFSET (immediate) form
// with \p Offset folded in, or services it entirely from an assigned AGPR
// lane. Returns false only when the opcode has no OFFSET counterpart.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  // Prefer spilling straight to an assigned AGPR lane over touching memory.
  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // cpol
          .addImm(0) // swz
          .cloneMemRefs(*MI);

  // Tied input data operand (e.g. D16_HI partial writes) must be carried over.
  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}
1292 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
1293 unsigned LoadStoreOp,
1294 unsigned EltSize) {
1295 bool IsStore = TII->get(LoadStoreOp).mayStore();
1296 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1297 bool UseST =
1298 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1300 switch (EltSize) {
1301 case 4:
1302 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1303 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1304 break;
1305 case 8:
1306 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1307 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1308 break;
1309 case 12:
1310 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1311 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1312 break;
1313 case 16:
1314 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1315 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1316 break;
1317 default:
1318 llvm_unreachable("Unexpected spill load/store size!");
1321 if (HasVAddr)
1322 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1323 else if (UseST)
1324 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1326 return LoadStoreOp;
// Expand a spill/reload of ValueReg at frame object Index into real scratch
// (MUBUF or flat-scratch) instructions, splitting wide registers into
// EltSize-byte pieces. Handles:
//  * scavenging an SGPR (or falling back to a VGPR / to adjusting the
//    scratch offset register) when the folded offset is out of range,
//  * redirecting lanes that have an assigned AGPR spill register via
//    spillVGPRtoAGPR so they never touch memory,
//  * staging AGPR data through the reserved intermediate VGPR on targets
//    without direct AGPR memory access.
// Exactly one of RS / LiveUnits may be non-null (asserted below).
void SIRegisterInfo::buildSpillLoadStore(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
    unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
    MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
    RegScavenger *RS, LiveRegUnits *LiveUnits) const {
  assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");

  MachineFunction *MF = MBB.getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
  bool IsStore = Desc->mayStore();
  bool IsFlat = TII->isFLATScratch(LoadStoreOp);

  bool CanClobberSCC = false;
  bool Scavenged = false;
  MCRegister SOffset = ScratchOffsetReg;

  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
  const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
  const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;

  // Always use 4 byte operations for AGPRs because we need to scavenge
  // a temporary VGPR.
  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  unsigned NumSubRegs = RegWidth / EltSize;
  unsigned Size = NumSubRegs * EltSize;
  unsigned RemSize = RegWidth - Size;
  // At most one odd-sized tail piece remains after the EltSize-sized pieces.
  unsigned NumRemSubRegs = RemSize ? 1 : 0;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  int64_t MaterializedOffset = Offset;

  int64_t MaxOffset = Offset + Size + RemSize - EltSize;
  int64_t ScratchOffsetRegDelta = 0;

  if (IsFlat && EltSize > 4) {
    LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    Desc = &TII->get(LoadStoreOp);
  }

  Align Alignment = MFI.getObjectAlign(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  assert((IsFlat || ((Offset % EltSize) == 0)) &&
         "unexpected VGPR spill offset");

  // Track a VGPR to use for a constant offset we need to materialize.
  Register TmpOffsetVGPR;

  // Track a VGPR to use as an intermediate value.
  Register TmpIntermediateVGPR;
  bool UseVGPROffset = false;

  // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
  // combination.
  auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
                                int64_t VOffset) {
    // We are using a VGPR offset
    if (IsFlat && SGPRBase) {
      // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
      // SGPR, so perform the add as vector.
      // We don't need a base SGPR in the kernel.

      if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
          .addReg(SGPRBase)
          .addImm(VOffset)
          .addImm(0); // clamp
      } else {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SGPRBase);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
          .addImm(VOffset)
          .addReg(TmpOffsetVGPR);
      }
    } else {
      assert(TmpOffsetVGPR);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addImm(VOffset);
    }
  };

  bool IsOffsetLegal =
      IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                      SIInstrFlags::FlatScratch)
             : TII->isLegalMUBUFImmOffset(MaxOffset);
  if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
    SOffset = MCRegister();

    // We don't have access to the register scavenger if this function is called
    // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
    // TODO: Clobbering SCC is not necessary for scratch instructions in the
    // entry.
    if (RS) {
      SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);

      // Piggy back on the liveness scan we just did see if SCC is dead.
      CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
    } else if (LiveUnits) {
      CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
      for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
        if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
          SOffset = Reg;
          break;
        }
      }
    }

    // An SGPR offset implies an S_ADD_I32, which clobbers SCC; without
    // permission to clobber SCC, drop the scavenged SGPR.
    if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
      SOffset = Register();

    if (!SOffset) {
      UseVGPROffset = true;

      if (RS) {
        TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
      } else {
        assert(LiveUnits);
        for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
          if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
            TmpOffsetVGPR = Reg;
            break;
          }
        }
      }

      assert(TmpOffsetVGPR);
    } else if (!SOffset && CanClobberSCC) {
      // NOTE(review): this branch is unreachable as written — reaching the
      // "else" means SOffset is non-null, so "!SOffset" can never hold here.
      // There are no free SGPRs, and since we are in the process of spilling
      // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
      // on SI/CI and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset or StackPtrOffset
      // register, and then subtract the offset after the spill to return the
      // register to it's original value.

      // TODO: If we don't have to do an emergency stack slot spill, converting
      // to use the VGPR offset is fewer instructions.
      if (!ScratchOffsetReg)
        ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
    if (!IsFlat && !UseVGPROffset)
      Offset *= ST.getWavefrontSize();

    if (!UseVGPROffset && !SOffset)
      report_fatal_error("could not scavenge SGPR to spill in entry function");

    if (UseVGPROffset) {
      // We are using a VGPR offset
      MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
    } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
    } else {
      assert(Offset != 0);
      auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
          .addReg(ScratchOffsetReg)
          .addImm(Offset);
      Add->getOperand(3).setIsDead(); // Mark SCC as dead.
    }

    // The whole offset now lives in a register; the immediate field is zero.
    Offset = 0;
  }

  if (IsFlat && SOffset == AMDGPU::NoRegister) {
    assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
           && "Unexpected vaddr for flat scratch with a FI operand");

    if (UseVGPROffset) {
      LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
    } else {
      assert(ST.hasFlatScratchSTMode());
      LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
    }

    Desc = &TII->get(LoadStoreOp);
  }

  // Emit one access per EltSize-sized piece, plus the odd tail piece if any.
  for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
       ++i, RegOffset += EltSize) {
    if (i == NumSubRegs) {
      EltSize = RemSize;
      LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    }
    Desc = &TII->get(LoadStoreOp);

    if (!IsFlat && UseVGPROffset) {
      int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
                                   : getOffenMUBUFLoad(LoadStoreOp);
      Desc = &TII->get(NewLoadStoreOp);
    }

    if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
      // If we are spilling an AGPR beyond the range of the memory instruction
      // offset and need to use a VGPR offset, we ideally have at least 2
      // scratch VGPRs. If we don't have a second free VGPR without spilling,
      // recycle the VGPR used for the offset which requires resetting after
      // each subregister.

      MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
    }

    unsigned NumRegs = EltSize / 4;
    Register SubReg = e == 1
      ? ValueReg
      : Register(getSubReg(ValueReg,
                           getSubRegFromChannel(RegOffset / 4, NumRegs)));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    const bool IsLastSubReg = i + 1 == e;
    const bool IsFirstSubReg = i == 0;
    if (IsLastSubReg) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    // Make sure the whole register is defined if there are undef components by
    // adding an implicit def of the super-reg on the first instruction.
    bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
    bool NeedSuperRegImpOperand = e > 1;

    // Remaining element size to spill into memory after some parts of it
    // spilled into either AGPRs or VGPRs.
    unsigned RemEltSize = EltSize;

    // AGPRs to spill VGPRs and vice versa are allocated in a reverse order,
    // starting from the last lane. In case if a register cannot be completely
    // spilled into another register that will ensure its alignment does not
    // change. For targets with VGPR alignment requirement this is important
    // in case of flat scratch usage as we might get a scratch_load or
    // scratch_store of an unaligned register otherwise.
    for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
             LaneE = RegOffset / 4;
         Lane >= LaneE; --Lane) {
      bool IsSubReg = e > 1 || EltSize > 4;
      Register Sub = IsSubReg
             ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
             : ValueReg;
      auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
      if (!MIB.getInstr())
        break;
      if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
        MIB.addReg(ValueReg, RegState::ImplicitDefine);
        NeedSuperRegDef = false;
      }
      if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
        NeedSuperRegImpOperand = true;
        unsigned State = SrcDstRegState;
        if (!IsLastSubReg || (Lane != LaneE))
          State &= ~RegState::Kill;
        if (!IsFirstSubReg || (Lane != LaneS))
          State &= ~RegState::Define;
        MIB.addReg(ValueReg, RegState::Implicit | State);
      }
      RemEltSize -= 4;
    }

    if (!RemEltSize) // Fully spilled into AGPRs.
      continue;

    if (RemEltSize != EltSize) { // Partially spilled to AGPRs
      assert(IsFlat && EltSize > 4);

      unsigned NumRegs = RemEltSize / 4;
      SubReg = Register(getSubReg(ValueReg,
                        getSubRegFromChannel(RegOffset / 4, NumRegs)));
      unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
      Desc = &TII->get(Opc);
    }

    unsigned FinalReg = SubReg;

    if (IsAGPR) {
      assert(EltSize == 4);

      if (!TmpIntermediateVGPR) {
        TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
        assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
      }
      if (IsStore) {
        // Stage the AGPR into the intermediate VGPR before the memory store.
        auto AccRead = BuildMI(MBB, MI, DL,
                               TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
                               TmpIntermediateVGPR)
                           .addReg(SubReg, getKillRegState(IsKill));
        if (NeedSuperRegDef)
          AccRead.addReg(ValueReg, RegState::ImplicitDefine);
        AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
      }
      SubReg = TmpIntermediateVGPR;
    } else if (UseVGPROffset) {
      if (!TmpOffsetVGPR) {
        TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                      MI, false, 0);
        RS->setRegUsed(TmpOffsetVGPR);
      }
    }

    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
    MachineMemOperand *NewMMO =
        MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
                                 commonAlignment(Alignment, RegOffset));

    auto MIB =
        BuildMI(MBB, MI, DL, *Desc)
            .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));

    if (UseVGPROffset) {
      // For an AGPR spill, we reuse the same temp VGPR for the offset and the
      // intermediate accvgpr_write.
      MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
    }

    if (!IsFlat)
      MIB.addReg(FuncInfo->getScratchRSrcReg());

    if (SOffset == AMDGPU::NoRegister) {
      if (!IsFlat) {
        if (UseVGPROffset && ScratchOffsetReg) {
          MIB.addReg(ScratchOffsetReg);
        } else {
          assert(FuncInfo->isBottomOfStack());
          MIB.addImm(0);
        }
      }
    } else {
      MIB.addReg(SOffset, SOffsetRegState);
    }

    MIB.addImm(Offset + RegOffset);

    bool LastUse = MMO->getFlags() & MOLastUse;
    MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol

    if (!IsFlat)
      MIB.addImm(0); // swz
    MIB.addMemOperand(NewMMO);

    if (!IsAGPR && NeedSuperRegDef)
      MIB.addReg(ValueReg, RegState::ImplicitDefine);

    if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
      // Move the reloaded value from the intermediate VGPR back into the AGPR.
      MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
                    FinalReg)
                .addReg(TmpIntermediateVGPR, RegState::Kill);
      MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    }

    if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);

    // The epilog restore of a wwm-scratch register can cause undesired
    // optimization during machine-cp post PrologEpilogInserter if the same
    // register was assigned for return value ABI lowering with a COPY
    // instruction. As given below, with the epilog reload, the earlier COPY
    // appeared to be dead during machine-cp.
    // ...
    // v0 in WWM operation, needs the WWM spill at prolog/epilog.
    // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
    // ...
    // Epilog block:
    // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
    // ...
    // WWM spill restore to preserve the inactive lanes of v0.
    // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
    // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
    // $exec = S_MOV_B64 killed $sgpr4_sgpr5
    // ...
    // SI_RETURN implicit $vgpr0
    // ...
    // To fix it, mark the same reg as a tied op for such restore instructions
    // so that it marks a usage for the preceding COPY.
    if (!IsStore && MI != MBB.end() && MI->isReturn() &&
        MI->readsRegister(SubReg, this)) {
      MIB.addReg(SubReg, RegState::Implicit);
      MIB->tieOperands(0, MIB->getNumOperands() - 1);
    }
  }

  if (ScratchOffsetRegDelta != 0) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
        .addReg(SOffset)
        .addImm(-ScratchOffsetRegDelta);
  }
}
// Emit the single-VGPR scratch load or store used while spilling SGPRs
// through memory: moves SB.TmpVGPR between the stack slot at Index (at a
// lane offset of Offset, scaled by SB.EltSize bytes) and scratch memory,
// choosing flat-scratch or MUBUF opcodes based on the subtarget.
1727 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
1728 int Offset, bool IsLoad,
1729 bool IsKill) const {
1730 // Load/store VGPR
1731 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
// SGPR-spill stack IDs are lowered by spillSGPR/restoreSGPR, never here.
1732 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
// Fixed (incoming-argument area) objects are addressed off the base
// pointer when one exists; everything else uses the frame register.
1734 Register FrameReg =
1735 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
1736 ? getBaseRegister()
1737 : getFrameRegister(SB.MF);
1739 Align Alignment = FrameInfo.getObjectAlign(Index);
1740 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
1741 MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
1742 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1743 SB.EltSize, Alignment);
1745 if (IsLoad) {
1746 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1747 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
// Loads never kill TmpVGPR (third-to-last arg is IsKill=false).
1748 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
1749 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1750 } else {
1751 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1752 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1753 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
1754 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1755 // This only ever adds one VGPR spill
1756 SB.MFI.addToSpilledVGPRs(1);
// Lower an SI_SPILL_S*_SAVE pseudo at MI for stack slot Index. Preferred
// path: write each 32-bit sub-register of SB.SuperReg into a pre-assigned
// VGPR lane (SI_SPILL_S32_TO_VGPR). Fallback: pack lanes into SB.TmpVGPR
// and store whole VGPRs to scratch memory. The pseudo is erased on
// success. Returns false only when OnlyToVGPR is set but no VGPR lanes
// were reserved for this slot.
1760 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
1761 RegScavenger *RS, SlotIndexes *Indexes,
1762 LiveIntervals *LIS, bool OnlyToVGPR,
1763 bool SpillToPhysVGPRLane) const {
1764 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
// Lane assignments come from different tables depending on whether this
// slot spills to physical or virtual VGPR lanes.
1766 ArrayRef<SpilledReg> VGPRSpills =
1767 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
1768 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
1769 bool SpillToVGPR = !VGPRSpills.empty();
1770 if (OnlyToVGPR && !SpillToVGPR)
1771 return false;
// The memory fallback must not be used for the stack/frame offset
// registers themselves.
1773 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
1774 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
1776 if (SpillToVGPR) {
1778 assert(SB.NumSubRegs == VGPRSpills.size() &&
1779 "Num of VGPR lanes should be equal to num of SGPRs spilled");
1781 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
1782 Register SubReg =
1783 SB.NumSubRegs == 1
1784 ? SB.SuperReg
1785 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1786 SpilledReg Spill = VGPRSpills[i];
1788 bool IsFirstSubreg = i == 0;
1789 bool IsLastSubreg = i == SB.NumSubRegs - 1;
// Only the last lane-write may kill the source super-register.
1790 bool UseKill = SB.IsKill && IsLastSubreg;
1793 // Mark the "old value of vgpr" input undef only if this is the first sgpr
1794 // spill to this specific vgpr in the first basic block.
1795 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
1796 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
1797 .addReg(SubReg, getKillRegState(UseKill))
1798 .addImm(Spill.Lane)
1799 .addReg(Spill.VGPR)
// Keep SlotIndexes consistent: the first emitted write takes over the
// pseudo's index, later writes get fresh ones.
1800 if (Indexes) {
1801 if (IsFirstSubreg)
1802 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
1803 else
1804 Indexes->insertMachineInstrInMaps(*MIB);
1807 if (IsFirstSubreg && SB.NumSubRegs > 1) {
1808 // We may be spilling a super-register which is only partially defined,
1809 // and need to ensure later spills think the value is defined.
1810 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
1813 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
1814 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
1816 // FIXME: Since this spills to another register instead of an actual
1817 // frame index, we should delete the frame index when all references to
1818 // it are fixed.
1820 } else {
// Memory fallback: stage lanes through SB.TmpVGPR, one full VGPR's worth
// at a time, and write each staged VGPR out to the scratch slot.
1821 SB.prepare();
1823 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
1824 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
1826 // Per VGPR helper data
1827 auto PVD = SB.getPerVGPRData();
1829 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// Only the first lane-write into a fresh TmpVGPR may treat the previous
// contents as undef; later writes must preserve earlier lanes.
1830 unsigned TmpVGPRFlags = RegState::Undef;
1832 // Write sub registers into the VGPR
1833 for (unsigned i = Offset * PVD.PerVGPR,
1834 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
1835 i < e; ++i) {
1836 Register SubReg =
1837 SB.NumSubRegs == 1
1838 ? SB.SuperReg
1839 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1841 MachineInstrBuilder WriteLane =
1842 BuildMI(*SB.MBB, MI, SB.DL,
1843 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
1844 .addReg(SubReg, SubKillState)
1845 .addImm(i % PVD.PerVGPR)
1846 .addReg(SB.TmpVGPR, TmpVGPRFlags);
1847 TmpVGPRFlags = 0;
1849 if (Indexes) {
1850 if (i == 0)
1851 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
1852 else
1853 Indexes->insertMachineInstrInMaps(*WriteLane);
1856 // There could be undef components of a spilled super register.
1857 // TODO: Can we detect this and skip the spill?
1858 if (SB.NumSubRegs > 1) {
1859 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
1860 unsigned SuperKillState = 0;
1861 if (i + 1 == SB.NumSubRegs)
1862 SuperKillState |= getKillRegState(SB.IsKill);
1863 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
1867 // Write out VGPR
1868 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
1871 SB.restore();
// The pseudo has been fully expanded; drop it and update statistics and
// (if present) live-interval information for the spilled register.
1874 MI->eraseFromParent();
1875 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
1877 if (LIS)
1878 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
1880 return true;
// Lower an SI_SPILL_S*_RESTORE pseudo at MI for stack slot Index: reload
// the SGPR (or SGPR tuple) either by reading back pre-assigned VGPR lanes
// (SI_RESTORE_S32_FROM_VGPR) or by loading SB.TmpVGPR from scratch memory
// and unpacking its lanes. Erases the pseudo on success; returns false
// only when OnlyToVGPR is set but no VGPR lanes were reserved.
1883 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
1884 RegScavenger *RS, SlotIndexes *Indexes,
1885 LiveIntervals *LIS, bool OnlyToVGPR,
1886 bool SpillToPhysVGPRLane) const {
1887 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
1889 ArrayRef<SpilledReg> VGPRSpills =
1890 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
1891 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
1892 bool SpillToVGPR = !VGPRSpills.empty();
1893 if (OnlyToVGPR && !SpillToVGPR)
1894 return false;
1896 if (SpillToVGPR) {
1897 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
1898 Register SubReg =
1899 SB.NumSubRegs == 1
1900 ? SB.SuperReg
1901 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1903 SpilledReg Spill = VGPRSpills[i];
1904 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
1905 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
1906 .addReg(Spill.VGPR)
1907 .addImm(Spill.Lane);
// First partial def of a tuple: implicitly define the super-register so
// the whole tuple reads as defined after this point.
1908 if (SB.NumSubRegs > 1 && i == 0)
1909 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
// The last emitted restore takes over the pseudo's slot index.
1910 if (Indexes) {
1911 if (i == e - 1)
1912 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
1913 else
1914 Indexes->insertMachineInstrInMaps(*MIB);
1917 } else {
// Memory fallback: reload each staged VGPR from the slot, then unpack
// its lanes back into the SGPR sub-registers.
1918 SB.prepare();
1920 // Per VGPR helper data
1921 auto PVD = SB.getPerVGPRData();
1923 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
1924 // Load in VGPR data
1925 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
1927 // Unpack lanes
1928 for (unsigned i = Offset * PVD.PerVGPR,
1929 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
1930 i < e; ++i) {
1931 Register SubReg =
1932 SB.NumSubRegs == 1
1933 ? SB.SuperReg
1934 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
// The final lane-read of this staged VGPR kills TmpVGPR.
1936 bool LastSubReg = (i + 1 == e);
1937 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
1938 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
1939 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
1940 .addImm(i);
1941 if (SB.NumSubRegs > 1 && i == 0)
1942 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
1943 if (Indexes) {
1944 if (i == e - 1)
1945 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
1946 else
1947 Indexes->insertMachineInstrInMaps(*MIB);
1952 SB.restore();
1955 MI->eraseFromParent();
1957 if (LIS)
1958 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
1960 return true;
// Spill SGPR into lanes of SB.TmpVGPR at MI and emit the matching
// V_READLANE_B32 reloads at the end of RestoreMBB. Unlike spillSGPR, the
// staged VGPR is never written to memory — only lane moves are produced.
// Always returns false; the instruction at MI is left in place.
1963 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
1964 MachineBasicBlock &RestoreMBB,
1965 Register SGPR, RegScavenger *RS) const {
1966 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
1967 RS);
1968 SB.prepare();
1969 // Generate the spill of SGPR to SB.TmpVGPR.
1970 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
1971 auto PVD = SB.getPerVGPRData();
1972 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// Only the first lane-write may treat prior VGPR contents as undef.
1973 unsigned TmpVGPRFlags = RegState::Undef;
1974 // Write sub registers into the VGPR
1975 for (unsigned i = Offset * PVD.PerVGPR,
1976 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
1977 i < e; ++i) {
1978 Register SubReg =
1979 SB.NumSubRegs == 1
1980 ? SB.SuperReg
1981 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1983 MachineInstrBuilder WriteLane =
1984 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
1985 SB.TmpVGPR)
1986 .addReg(SubReg, SubKillState)
1987 .addImm(i % PVD.PerVGPR)
1988 .addReg(SB.TmpVGPR, TmpVGPRFlags);
1989 TmpVGPRFlags = 0;
1990 // There could be undef components of a spilled super register.
1991 // TODO: Can we detect this and skip the spill?
1992 if (SB.NumSubRegs > 1) {
1993 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
1994 unsigned SuperKillState = 0;
1995 if (i + 1 == SB.NumSubRegs)
1996 SuperKillState |= getKillRegState(SB.IsKill);
1997 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2000 // Don't need to write VGPR out.
2003 // Restore clobbered registers in the specified restore block.
2004 MI = RestoreMBB.end();
2005 SB.setMI(&RestoreMBB, MI);
2006 // Generate the restore of SGPR from SB.TmpVGPR.
2007 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2008 // Don't need to load VGPR in.
2009 // Unpack lanes
2010 for (unsigned i = Offset * PVD.PerVGPR,
2011 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2012 i < e; ++i) {
2013 Register SubReg =
2014 SB.NumSubRegs == 1
2015 ? SB.SuperReg
2016 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2017 bool LastSubReg = (i + 1 == e);
2018 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2019 SubReg)
2020 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2021 .addImm(i);
2022 if (SB.NumSubRegs > 1 && i == 0)
2023 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2026 SB.restore();
2028 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
2029 return false;
2032 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2033 /// a VGPR and the stack slot can be safely eliminated when all other users are
2034 /// handled.
2035 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
2036 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
2037 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2038 switch (MI->getOpcode()) {
// All SGPR spill-save pseudos, widest first. OnlyToVGPR is forced true,
// so spillSGPR fails (returns false) rather than falling back to memory.
2039 case AMDGPU::SI_SPILL_S1024_SAVE:
2040 case AMDGPU::SI_SPILL_S512_SAVE:
2041 case AMDGPU::SI_SPILL_S384_SAVE:
2042 case AMDGPU::SI_SPILL_S352_SAVE:
2043 case AMDGPU::SI_SPILL_S320_SAVE:
2044 case AMDGPU::SI_SPILL_S288_SAVE:
2045 case AMDGPU::SI_SPILL_S256_SAVE:
2046 case AMDGPU::SI_SPILL_S224_SAVE:
2047 case AMDGPU::SI_SPILL_S192_SAVE:
2048 case AMDGPU::SI_SPILL_S160_SAVE:
2049 case AMDGPU::SI_SPILL_S128_SAVE:
2050 case AMDGPU::SI_SPILL_S96_SAVE:
2051 case AMDGPU::SI_SPILL_S64_SAVE:
2052 case AMDGPU::SI_SPILL_S32_SAVE:
2053 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
// Matching restore pseudos; same OnlyToVGPR=true constraint.
2054 case AMDGPU::SI_SPILL_S1024_RESTORE:
2055 case AMDGPU::SI_SPILL_S512_RESTORE:
2056 case AMDGPU::SI_SPILL_S384_RESTORE:
2057 case AMDGPU::SI_SPILL_S352_RESTORE:
2058 case AMDGPU::SI_SPILL_S320_RESTORE:
2059 case AMDGPU::SI_SPILL_S288_RESTORE:
2060 case AMDGPU::SI_SPILL_S256_RESTORE:
2061 case AMDGPU::SI_SPILL_S224_RESTORE:
2062 case AMDGPU::SI_SPILL_S192_RESTORE:
2063 case AMDGPU::SI_SPILL_S160_RESTORE:
2064 case AMDGPU::SI_SPILL_S128_RESTORE:
2065 case AMDGPU::SI_SPILL_S96_RESTORE:
2066 case AMDGPU::SI_SPILL_S64_RESTORE:
2067 case AMDGPU::SI_SPILL_S32_RESTORE:
2068 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2069 default:
2070 llvm_unreachable("not an SGPR spill instruction");
// Rewrite the frame-index operand FIOperandNum of MI into concrete
// registers/immediates. SGPR spill pseudos are forwarded to
// spillSGPR/restoreSGPR; VGPR/AGPR spill pseudos are expanded into scratch
// load/store sequences; any other frame-index use is folded into an
// instruction offset or materialized into a scavenged register. Returns
// true when MI was erased (fully handled), false when the operand was
// rewritten in place.
2074 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2075 int SPAdj, unsigned FIOperandNum,
2076 RegScavenger *RS) const {
2077 MachineFunction *MF = MI->getParent()->getParent();
2078 MachineBasicBlock *MBB = MI->getParent();
2079 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2080 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2081 const SIInstrInfo *TII = ST.getInstrInfo();
2082 DebugLoc DL = MI->getDebugLoc();
2084 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2086 assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
2087 "unreserved scratch RSRC register");
2089 MachineOperand &FIOp = MI->getOperand(FIOperandNum);
2090 int Index = MI->getOperand(FIOperandNum).getIndex();
// Fixed objects are addressed via the base pointer when available.
2092 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2093 ? getBaseRegister()
2094 : getFrameRegister(*MF);
2096 switch (MI->getOpcode()) {
2097 // SGPR register spill
2098 case AMDGPU::SI_SPILL_S1024_SAVE:
2099 case AMDGPU::SI_SPILL_S512_SAVE:
2100 case AMDGPU::SI_SPILL_S384_SAVE:
2101 case AMDGPU::SI_SPILL_S352_SAVE:
2102 case AMDGPU::SI_SPILL_S320_SAVE:
2103 case AMDGPU::SI_SPILL_S288_SAVE:
2104 case AMDGPU::SI_SPILL_S256_SAVE:
2105 case AMDGPU::SI_SPILL_S224_SAVE:
2106 case AMDGPU::SI_SPILL_S192_SAVE:
2107 case AMDGPU::SI_SPILL_S160_SAVE:
2108 case AMDGPU::SI_SPILL_S128_SAVE:
2109 case AMDGPU::SI_SPILL_S96_SAVE:
2110 case AMDGPU::SI_SPILL_S64_SAVE:
2111 case AMDGPU::SI_SPILL_S32_SAVE: {
2112 return spillSGPR(MI, Index, RS);
2115 // SGPR register restore
2116 case AMDGPU::SI_SPILL_S1024_RESTORE:
2117 case AMDGPU::SI_SPILL_S512_RESTORE:
2118 case AMDGPU::SI_SPILL_S384_RESTORE:
2119 case AMDGPU::SI_SPILL_S352_RESTORE:
2120 case AMDGPU::SI_SPILL_S320_RESTORE:
2121 case AMDGPU::SI_SPILL_S288_RESTORE:
2122 case AMDGPU::SI_SPILL_S256_RESTORE:
2123 case AMDGPU::SI_SPILL_S224_RESTORE:
2124 case AMDGPU::SI_SPILL_S192_RESTORE:
2125 case AMDGPU::SI_SPILL_S160_RESTORE:
2126 case AMDGPU::SI_SPILL_S128_RESTORE:
2127 case AMDGPU::SI_SPILL_S96_RESTORE:
2128 case AMDGPU::SI_SPILL_S64_RESTORE:
2129 case AMDGPU::SI_SPILL_S32_RESTORE: {
2130 return restoreSGPR(MI, Index, RS);
2133 // VGPR register spill
2134 case AMDGPU::SI_SPILL_V1024_SAVE:
2135 case AMDGPU::SI_SPILL_V512_SAVE:
2136 case AMDGPU::SI_SPILL_V384_SAVE:
2137 case AMDGPU::SI_SPILL_V352_SAVE:
2138 case AMDGPU::SI_SPILL_V320_SAVE:
2139 case AMDGPU::SI_SPILL_V288_SAVE:
2140 case AMDGPU::SI_SPILL_V256_SAVE:
2141 case AMDGPU::SI_SPILL_V224_SAVE:
2142 case AMDGPU::SI_SPILL_V192_SAVE:
2143 case AMDGPU::SI_SPILL_V160_SAVE:
2144 case AMDGPU::SI_SPILL_V128_SAVE:
2145 case AMDGPU::SI_SPILL_V96_SAVE:
2146 case AMDGPU::SI_SPILL_V64_SAVE:
2147 case AMDGPU::SI_SPILL_V32_SAVE:
2148 case AMDGPU::SI_SPILL_A1024_SAVE:
2149 case AMDGPU::SI_SPILL_A512_SAVE:
2150 case AMDGPU::SI_SPILL_A384_SAVE:
2151 case AMDGPU::SI_SPILL_A352_SAVE:
2152 case AMDGPU::SI_SPILL_A320_SAVE:
2153 case AMDGPU::SI_SPILL_A288_SAVE:
2154 case AMDGPU::SI_SPILL_A256_SAVE:
2155 case AMDGPU::SI_SPILL_A224_SAVE:
2156 case AMDGPU::SI_SPILL_A192_SAVE:
2157 case AMDGPU::SI_SPILL_A160_SAVE:
2158 case AMDGPU::SI_SPILL_A128_SAVE:
2159 case AMDGPU::SI_SPILL_A96_SAVE:
2160 case AMDGPU::SI_SPILL_A64_SAVE:
2161 case AMDGPU::SI_SPILL_A32_SAVE:
2162 case AMDGPU::SI_SPILL_AV1024_SAVE:
2163 case AMDGPU::SI_SPILL_AV512_SAVE:
2164 case AMDGPU::SI_SPILL_AV384_SAVE:
2165 case AMDGPU::SI_SPILL_AV352_SAVE:
2166 case AMDGPU::SI_SPILL_AV320_SAVE:
2167 case AMDGPU::SI_SPILL_AV288_SAVE:
2168 case AMDGPU::SI_SPILL_AV256_SAVE:
2169 case AMDGPU::SI_SPILL_AV224_SAVE:
2170 case AMDGPU::SI_SPILL_AV192_SAVE:
2171 case AMDGPU::SI_SPILL_AV160_SAVE:
2172 case AMDGPU::SI_SPILL_AV128_SAVE:
2173 case AMDGPU::SI_SPILL_AV96_SAVE:
2174 case AMDGPU::SI_SPILL_AV64_SAVE:
2175 case AMDGPU::SI_SPILL_AV32_SAVE:
2176 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2177 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2178 const MachineOperand *VData = TII->getNamedOperand(*MI,
2179 AMDGPU::OpName::vdata);
2180 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2181 MFI->getStackPtrOffsetReg());
2183 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2184 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2185 auto *MBB = MI->getParent();
2186 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
// WWM spills must preserve inactive lanes, so exec is switched around
// the memory access (via insertScratchExecCopy) and restored afterwards.
2187 if (IsWWMRegSpill) {
2188 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2189 RS->isRegUsed(AMDGPU::SCC));
2191 buildSpillLoadStore(
2192 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2193 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2194 *MI->memoperands_begin(), RS);
2195 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
2196 if (IsWWMRegSpill)
2197 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2199 MI->eraseFromParent();
2200 return true;
2202 case AMDGPU::SI_SPILL_V32_RESTORE:
2203 case AMDGPU::SI_SPILL_V64_RESTORE:
2204 case AMDGPU::SI_SPILL_V96_RESTORE:
2205 case AMDGPU::SI_SPILL_V128_RESTORE:
2206 case AMDGPU::SI_SPILL_V160_RESTORE:
2207 case AMDGPU::SI_SPILL_V192_RESTORE:
2208 case AMDGPU::SI_SPILL_V224_RESTORE:
2209 case AMDGPU::SI_SPILL_V256_RESTORE:
2210 case AMDGPU::SI_SPILL_V288_RESTORE:
2211 case AMDGPU::SI_SPILL_V320_RESTORE:
2212 case AMDGPU::SI_SPILL_V352_RESTORE:
2213 case AMDGPU::SI_SPILL_V384_RESTORE:
2214 case AMDGPU::SI_SPILL_V512_RESTORE:
2215 case AMDGPU::SI_SPILL_V1024_RESTORE:
2216 case AMDGPU::SI_SPILL_A32_RESTORE:
2217 case AMDGPU::SI_SPILL_A64_RESTORE:
2218 case AMDGPU::SI_SPILL_A96_RESTORE:
2219 case AMDGPU::SI_SPILL_A128_RESTORE:
2220 case AMDGPU::SI_SPILL_A160_RESTORE:
2221 case AMDGPU::SI_SPILL_A192_RESTORE:
2222 case AMDGPU::SI_SPILL_A224_RESTORE:
2223 case AMDGPU::SI_SPILL_A256_RESTORE:
2224 case AMDGPU::SI_SPILL_A288_RESTORE:
2225 case AMDGPU::SI_SPILL_A320_RESTORE:
2226 case AMDGPU::SI_SPILL_A352_RESTORE:
2227 case AMDGPU::SI_SPILL_A384_RESTORE:
2228 case AMDGPU::SI_SPILL_A512_RESTORE:
2229 case AMDGPU::SI_SPILL_A1024_RESTORE:
2230 case AMDGPU::SI_SPILL_AV32_RESTORE:
2231 case AMDGPU::SI_SPILL_AV64_RESTORE:
2232 case AMDGPU::SI_SPILL_AV96_RESTORE:
2233 case AMDGPU::SI_SPILL_AV128_RESTORE:
2234 case AMDGPU::SI_SPILL_AV160_RESTORE:
2235 case AMDGPU::SI_SPILL_AV192_RESTORE:
2236 case AMDGPU::SI_SPILL_AV224_RESTORE:
2237 case AMDGPU::SI_SPILL_AV256_RESTORE:
2238 case AMDGPU::SI_SPILL_AV288_RESTORE:
2239 case AMDGPU::SI_SPILL_AV320_RESTORE:
2240 case AMDGPU::SI_SPILL_AV352_RESTORE:
2241 case AMDGPU::SI_SPILL_AV384_RESTORE:
2242 case AMDGPU::SI_SPILL_AV512_RESTORE:
2243 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2244 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2245 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2246 const MachineOperand *VData = TII->getNamedOperand(*MI,
2247 AMDGPU::OpName::vdata);
2248 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2249 MFI->getStackPtrOffsetReg());
2251 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2252 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2253 auto *MBB = MI->getParent();
2254 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
// Same exec save/restore dance as the WWM store path above.
2255 if (IsWWMRegSpill) {
2256 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2257 RS->isRegUsed(AMDGPU::SCC));
2260 buildSpillLoadStore(
2261 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2262 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2263 *MI->memoperands_begin(), RS);
2265 if (IsWWMRegSpill)
2266 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2268 MI->eraseFromParent();
2269 return true;
2272 default: {
2273 // Other access to frame index
2274 const DebugLoc &DL = MI->getDebugLoc();
2276 int64_t Offset = FrameInfo.getObjectOffset(Index);
2277 if (ST.enableFlatScratch()) {
2278 if (TII->isFLATScratch(*MI)) {
2279 assert((int16_t)FIOperandNum ==
2280 AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2281 AMDGPU::OpName::saddr));
2283 // The offset is always swizzled, just replace it
2284 if (FrameReg)
2285 FIOp.ChangeToRegister(FrameReg, false);
2287 MachineOperand *OffsetOp =
2288 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2289 int64_t NewOffset = Offset + OffsetOp->getImm();
// If the combined offset fits in the FLAT scratch immediate field we
// are done (when a frame register carries the base).
2290 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2291 SIInstrFlags::FlatScratch)) {
2292 OffsetOp->setImm(NewOffset);
2293 if (FrameReg)
2294 return false;
2295 Offset = 0;
// Offset folded entirely into the immediate: try switching to a form
// of the instruction that has no saddr operand at all.
2298 if (!Offset) {
2299 unsigned Opc = MI->getOpcode();
2300 int NewOpc = -1;
2301 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2302 NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
2303 } else if (ST.hasFlatScratchSTMode()) {
2304 // On GFX10 we have ST mode to use no registers for an address.
2305 // Otherwise we need to materialize 0 into an SGPR.
2306 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
2309 if (NewOpc != -1) {
2310 // removeOperand doesn't fixup tied operand indexes as it goes, so
2311 // it asserts. Untie vdst_in for now and retie them afterwards.
2312 int VDstIn = AMDGPU::getNamedOperandIdx(Opc,
2313 AMDGPU::OpName::vdst_in);
2314 bool TiedVDst = VDstIn != -1 &&
2315 MI->getOperand(VDstIn).isReg() &&
2316 MI->getOperand(VDstIn).isTied();
2317 if (TiedVDst)
2318 MI->untieRegOperand(VDstIn);
2320 MI->removeOperand(
2321 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2323 if (TiedVDst) {
2324 int NewVDst =
2325 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2326 int NewVDstIn =
2327 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2328 assert (NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2329 MI->tieOperands(NewVDst, NewVDstIn);
2331 MI->setDesc(TII->get(NewOpc));
2332 return false;
// Not a FLAT-scratch memory instruction (or folding failed): try the
// offset as a plain immediate operand first.
2337 if (!FrameReg) {
2338 FIOp.ChangeToImmediate(Offset);
2339 if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
2340 return false;
2343 // We need to use register here. Check if we can use an SGPR or need
2344 // a VGPR.
2345 FIOp.ChangeToRegister(AMDGPU::M0, false);
2346 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
2348 if (!Offset && FrameReg && UseSGPR) {
2349 FIOp.setReg(FrameReg);
2350 return false;
2353 const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
2354 : &AMDGPU::VGPR_32RegClass;
2356 Register TmpReg =
2357 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2358 FIOp.setReg(TmpReg);
2359 FIOp.setIsKill();
// Only one of base/offset is needed: a single MOV materializes it.
2361 if ((!FrameReg || !Offset) && TmpReg) {
2362 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2363 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
2364 if (FrameReg)
2365 MIB.addReg(FrameReg);
2366 else
2367 MIB.addImm(Offset);
2369 return false;
2372 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2373 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2375 Register TmpSReg =
2376 UseSGPR ? TmpReg
2377 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2378 MI, false, 0, !UseSGPR);
2380 // TODO: for flat scratch another attempt can be made with a VGPR index
2381 // if no SGPRs can be scavenged.
2382 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
2383 report_fatal_error("Cannot scavenge register in FI elimination!");
2385 if (!TmpSReg) {
2386 // Use frame register and restore it after.
2387 TmpSReg = FrameReg;
2388 FIOp.setReg(FrameReg);
2389 FIOp.setIsKill(false);
// SCC is live and the scalar add would clobber it: add with carry-in
// (S_ADDC_U32), then recover SCC from bit 0 of the sum and clear that
// bit — valid because the asserted alignment keeps the offset even.
2392 if (NeedSaveSCC) {
2393 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
2394 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
2395 .addReg(FrameReg)
2396 .addImm(Offset);
2397 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
2398 .addReg(TmpSReg)
2399 .addImm(0);
2400 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
2401 .addImm(0)
2402 .addReg(TmpSReg);
2403 } else {
2404 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
2405 .addReg(FrameReg)
2406 .addImm(Offset);
2409 if (!UseSGPR)
2410 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
2411 .addReg(TmpSReg, RegState::Kill);
2413 if (TmpSReg == FrameReg) {
2414 // Undo frame register modification.
2415 if (NeedSaveSCC &&
2416 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
2417 MachineBasicBlock::iterator I =
2418 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
2419 TmpSReg)
2420 .addReg(FrameReg)
2421 .addImm(-Offset);
2422 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
2423 .addReg(TmpSReg)
2424 .addImm(0);
2425 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
2426 TmpSReg)
2427 .addImm(0)
2428 .addReg(TmpSReg);
2429 } else {
2430 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
2431 FrameReg)
2432 .addReg(FrameReg)
2433 .addImm(-Offset);
2437 return false;
2440 bool IsMUBUF = TII->isMUBUF(*MI);
2442 if (!IsMUBUF && !MFI->isBottomOfStack()) {
2443 // Convert to a swizzled stack address by scaling by the wave size.
2444 // In an entry function/kernel the offset is already swizzled.
2445 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
2446 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2447 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2448 const TargetRegisterClass *RC = IsSALU && !LiveSCC
2449 ? &AMDGPU::SReg_32RegClass
2450 : &AMDGPU::VGPR_32RegClass;
2451 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
2452 MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
2453 Register ResultReg =
2454 IsCopy ? MI->getOperand(0).getReg()
2455 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
2457 int64_t Offset = FrameInfo.getObjectOffset(Index);
2458 if (Offset == 0) {
// Zero offset: the swizzled address is just FrameReg >> wave-size-log2.
2459 unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
2460 : AMDGPU::V_LSHRREV_B32_e64;
2461 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
2462 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
2463 // For V_LSHRREV, the operands are reversed (the shift count goes
2464 // first).
2465 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
2466 else
2467 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
2468 if (IsSALU && !LiveSCC)
2469 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
// SCC is live: shift happened in a VGPR; move the (uniform) result
// back to an SGPR with V_READFIRSTLANE_B32.
2470 if (IsSALU && LiveSCC) {
2471 Register NewDest = RS->scavengeRegisterBackwards(
2472 AMDGPU::SReg_32RegClass, Shift, false, 0);
2473 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
2474 NewDest)
2475 .addReg(ResultReg);
2476 ResultReg = NewDest;
2478 } else {
2479 MachineInstrBuilder MIB;
2480 if (!IsSALU) {
2481 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
2482 nullptr) {
2483 // Reuse ResultReg in intermediate step.
2484 Register ScaledReg = ResultReg;
2486 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
2487 ScaledReg)
2488 .addImm(ST.getWavefrontSizeLog2())
2489 .addReg(FrameReg);
2491 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
2493 // TODO: Fold if use instruction is another add of a constant.
2494 if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
2495 // FIXME: This can fail
2496 MIB.addImm(Offset);
2497 MIB.addReg(ScaledReg, RegState::Kill);
2498 if (!IsVOP2)
2499 MIB.addImm(0); // clamp bit
2500 } else {
2501 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
2502 "Need to reuse carry out register");
2504 // Use scavenged unused carry out as offset register.
2505 Register ConstOffsetReg;
2506 if (!isWave32)
2507 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
2508 else
2509 ConstOffsetReg = MIB.getReg(1);
2511 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
2512 .addImm(Offset);
2513 MIB.addReg(ConstOffsetReg, RegState::Kill);
2514 MIB.addReg(ScaledReg, RegState::Kill);
2515 MIB.addImm(0); // clamp bit
2519 if (!MIB || IsSALU) {
2520 // We have to produce a carry out, and there isn't a free SGPR pair
2521 // for it. We can keep the whole computation on the SALU to avoid
2522 // clobbering an additional register at the cost of an extra mov.
2524 // We may have 1 free scratch SGPR even though a carry out is
2525 // unavailable. Only one additional mov is needed.
2526 Register TmpScaledReg = RS->scavengeRegisterBackwards(
2527 AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
2528 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
2530 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
2531 .addReg(FrameReg)
2532 .addImm(ST.getWavefrontSizeLog2());
2533 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
2534 .addReg(ScaledReg, RegState::Kill)
2535 .addImm(Offset);
2536 if (!IsSALU)
2537 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
2538 .addReg(ScaledReg, RegState::Kill)
2539 else
2540 ResultReg = ScaledReg;
2542 // If there were truly no free SGPRs, we need to undo everything.
2543 if (!TmpScaledReg.isValid()) {
2544 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
2545 .addReg(ScaledReg, RegState::Kill)
2546 .addImm(-Offset);
2547 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
2548 .addReg(FrameReg)
2549 .addImm(ST.getWavefrontSizeLog2());
2554 // Don't introduce an extra copy if we're just materializing in a mov.
2555 if (IsCopy) {
2556 MI->eraseFromParent();
2557 return true;
2559 FIOp.ChangeToRegister(ResultReg, false, false, true);
2560 return false;
2563 if (IsMUBUF) {
2564 // Disable offen so we don't need a 0 vgpr base.
2565 assert(static_cast<int>(FIOperandNum) ==
2566 AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2567 AMDGPU::OpName::vaddr));
2569 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
2570 assert((SOffset.isImm() && SOffset.getImm() == 0));
2572 if (FrameReg != AMDGPU::NoRegister)
2573 SOffset.ChangeToRegister(FrameReg, false);
2575 int64_t Offset = FrameInfo.getObjectOffset(Index);
2576 int64_t OldImm
2577 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
2578 int64_t NewOffset = OldImm + Offset;
// If the combined MUBUF immediate is legal, fold the whole access.
2580 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
2581 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
2582 MI->eraseFromParent();
2583 return true;
2587 // If the offset is simply too big, don't convert to a scratch wave offset
2588 // relative index.
2590 FIOp.ChangeToImmediate(Offset);
2591 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
// Immediate not encodable: materialize it into a scavenged VGPR.
2592 Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
2593 MI, false, 0);
2594 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
2595 .addImm(Offset);
2596 FIOp.ChangeToRegister(TmpReg, false, false, true);
2600 return false;
// Delegate to the instruction printer so register asm names match the
// names the assembler emits.
2603 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
2604 return AMDGPUInstPrinter::getRegisterName(Reg);
// Width in bits of the registers in RC; forwards to the by-ID overload.
2607 unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) {
2608 return getRegBitWidth(RC.getID());
2611 static const TargetRegisterClass *
2612 getAnyVGPRClassForBitWidth(unsigned BitWidth) {
2613 if (BitWidth == 64)
2614 return &AMDGPU::VReg_64RegClass;
2615 if (BitWidth == 96)
2616 return &AMDGPU::VReg_96RegClass;
2617 if (BitWidth == 128)
2618 return &AMDGPU::VReg_128RegClass;
2619 if (BitWidth == 160)
2620 return &AMDGPU::VReg_160RegClass;
2621 if (BitWidth == 192)
2622 return &AMDGPU::VReg_192RegClass;
2623 if (BitWidth == 224)
2624 return &AMDGPU::VReg_224RegClass;
2625 if (BitWidth == 256)
2626 return &AMDGPU::VReg_256RegClass;
2627 if (BitWidth == 288)
2628 return &AMDGPU::VReg_288RegClass;
2629 if (BitWidth == 320)
2630 return &AMDGPU::VReg_320RegClass;
2631 if (BitWidth == 352)
2632 return &AMDGPU::VReg_352RegClass;
2633 if (BitWidth == 384)
2634 return &AMDGPU::VReg_384RegClass;
2635 if (BitWidth == 512)
2636 return &AMDGPU::VReg_512RegClass;
2637 if (BitWidth == 1024)
2638 return &AMDGPU::VReg_1024RegClass;
2640 return nullptr;
2643 static const TargetRegisterClass *
2644 getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
2645 if (BitWidth == 64)
2646 return &AMDGPU::VReg_64_Align2RegClass;
2647 if (BitWidth == 96)
2648 return &AMDGPU::VReg_96_Align2RegClass;
2649 if (BitWidth == 128)
2650 return &AMDGPU::VReg_128_Align2RegClass;
2651 if (BitWidth == 160)
2652 return &AMDGPU::VReg_160_Align2RegClass;
2653 if (BitWidth == 192)
2654 return &AMDGPU::VReg_192_Align2RegClass;
2655 if (BitWidth == 224)
2656 return &AMDGPU::VReg_224_Align2RegClass;
2657 if (BitWidth == 256)
2658 return &AMDGPU::VReg_256_Align2RegClass;
2659 if (BitWidth == 288)
2660 return &AMDGPU::VReg_288_Align2RegClass;
2661 if (BitWidth == 320)
2662 return &AMDGPU::VReg_320_Align2RegClass;
2663 if (BitWidth == 352)
2664 return &AMDGPU::VReg_352_Align2RegClass;
2665 if (BitWidth == 384)
2666 return &AMDGPU::VReg_384_Align2RegClass;
2667 if (BitWidth == 512)
2668 return &AMDGPU::VReg_512_Align2RegClass;
2669 if (BitWidth == 1024)
2670 return &AMDGPU::VReg_1024_Align2RegClass;
2672 return nullptr;
2675 const TargetRegisterClass *
2676 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
2677 if (BitWidth == 1)
2678 return &AMDGPU::VReg_1RegClass;
2679 if (BitWidth == 16)
2680 return &AMDGPU::VGPR_16RegClass;
2681 if (BitWidth == 32)
2682 return &AMDGPU::VGPR_32RegClass;
2683 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
2684 : getAnyVGPRClassForBitWidth(BitWidth);
2687 static const TargetRegisterClass *
2688 getAnyAGPRClassForBitWidth(unsigned BitWidth) {
2689 if (BitWidth == 64)
2690 return &AMDGPU::AReg_64RegClass;
2691 if (BitWidth == 96)
2692 return &AMDGPU::AReg_96RegClass;
2693 if (BitWidth == 128)
2694 return &AMDGPU::AReg_128RegClass;
2695 if (BitWidth == 160)
2696 return &AMDGPU::AReg_160RegClass;
2697 if (BitWidth == 192)
2698 return &AMDGPU::AReg_192RegClass;
2699 if (BitWidth == 224)
2700 return &AMDGPU::AReg_224RegClass;
2701 if (BitWidth == 256)
2702 return &AMDGPU::AReg_256RegClass;
2703 if (BitWidth == 288)
2704 return &AMDGPU::AReg_288RegClass;
2705 if (BitWidth == 320)
2706 return &AMDGPU::AReg_320RegClass;
2707 if (BitWidth == 352)
2708 return &AMDGPU::AReg_352RegClass;
2709 if (BitWidth == 384)
2710 return &AMDGPU::AReg_384RegClass;
2711 if (BitWidth == 512)
2712 return &AMDGPU::AReg_512RegClass;
2713 if (BitWidth == 1024)
2714 return &AMDGPU::AReg_1024RegClass;
2716 return nullptr;
2719 static const TargetRegisterClass *
2720 getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
2721 if (BitWidth == 64)
2722 return &AMDGPU::AReg_64_Align2RegClass;
2723 if (BitWidth == 96)
2724 return &AMDGPU::AReg_96_Align2RegClass;
2725 if (BitWidth == 128)
2726 return &AMDGPU::AReg_128_Align2RegClass;
2727 if (BitWidth == 160)
2728 return &AMDGPU::AReg_160_Align2RegClass;
2729 if (BitWidth == 192)
2730 return &AMDGPU::AReg_192_Align2RegClass;
2731 if (BitWidth == 224)
2732 return &AMDGPU::AReg_224_Align2RegClass;
2733 if (BitWidth == 256)
2734 return &AMDGPU::AReg_256_Align2RegClass;
2735 if (BitWidth == 288)
2736 return &AMDGPU::AReg_288_Align2RegClass;
2737 if (BitWidth == 320)
2738 return &AMDGPU::AReg_320_Align2RegClass;
2739 if (BitWidth == 352)
2740 return &AMDGPU::AReg_352_Align2RegClass;
2741 if (BitWidth == 384)
2742 return &AMDGPU::AReg_384_Align2RegClass;
2743 if (BitWidth == 512)
2744 return &AMDGPU::AReg_512_Align2RegClass;
2745 if (BitWidth == 1024)
2746 return &AMDGPU::AReg_1024_Align2RegClass;
2748 return nullptr;
2751 const TargetRegisterClass *
2752 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
2753 if (BitWidth == 16)
2754 return &AMDGPU::AGPR_LO16RegClass;
2755 if (BitWidth == 32)
2756 return &AMDGPU::AGPR_32RegClass;
2757 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
2758 : getAnyAGPRClassForBitWidth(BitWidth);
2761 static const TargetRegisterClass *
2762 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
2763 if (BitWidth == 64)
2764 return &AMDGPU::AV_64RegClass;
2765 if (BitWidth == 96)
2766 return &AMDGPU::AV_96RegClass;
2767 if (BitWidth == 128)
2768 return &AMDGPU::AV_128RegClass;
2769 if (BitWidth == 160)
2770 return &AMDGPU::AV_160RegClass;
2771 if (BitWidth == 192)
2772 return &AMDGPU::AV_192RegClass;
2773 if (BitWidth == 224)
2774 return &AMDGPU::AV_224RegClass;
2775 if (BitWidth == 256)
2776 return &AMDGPU::AV_256RegClass;
2777 if (BitWidth == 288)
2778 return &AMDGPU::AV_288RegClass;
2779 if (BitWidth == 320)
2780 return &AMDGPU::AV_320RegClass;
2781 if (BitWidth == 352)
2782 return &AMDGPU::AV_352RegClass;
2783 if (BitWidth == 384)
2784 return &AMDGPU::AV_384RegClass;
2785 if (BitWidth == 512)
2786 return &AMDGPU::AV_512RegClass;
2787 if (BitWidth == 1024)
2788 return &AMDGPU::AV_1024RegClass;
2790 return nullptr;
2793 static const TargetRegisterClass *
2794 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
2795 if (BitWidth == 64)
2796 return &AMDGPU::AV_64_Align2RegClass;
2797 if (BitWidth == 96)
2798 return &AMDGPU::AV_96_Align2RegClass;
2799 if (BitWidth == 128)
2800 return &AMDGPU::AV_128_Align2RegClass;
2801 if (BitWidth == 160)
2802 return &AMDGPU::AV_160_Align2RegClass;
2803 if (BitWidth == 192)
2804 return &AMDGPU::AV_192_Align2RegClass;
2805 if (BitWidth == 224)
2806 return &AMDGPU::AV_224_Align2RegClass;
2807 if (BitWidth == 256)
2808 return &AMDGPU::AV_256_Align2RegClass;
2809 if (BitWidth == 288)
2810 return &AMDGPU::AV_288_Align2RegClass;
2811 if (BitWidth == 320)
2812 return &AMDGPU::AV_320_Align2RegClass;
2813 if (BitWidth == 352)
2814 return &AMDGPU::AV_352_Align2RegClass;
2815 if (BitWidth == 384)
2816 return &AMDGPU::AV_384_Align2RegClass;
2817 if (BitWidth == 512)
2818 return &AMDGPU::AV_512_Align2RegClass;
2819 if (BitWidth == 1024)
2820 return &AMDGPU::AV_1024_Align2RegClass;
2822 return nullptr;
2825 const TargetRegisterClass *
2826 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
2827 if (BitWidth == 32)
2828 return &AMDGPU::AV_32RegClass;
2829 return ST.needsAlignedVGPRs()
2830 ? getAlignedVectorSuperClassForBitWidth(BitWidth)
2831 : getAnyVectorSuperClassForBitWidth(BitWidth);
2834 const TargetRegisterClass *
2835 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
2836 if (BitWidth == 16)
2837 return &AMDGPU::SGPR_LO16RegClass;
2838 if (BitWidth == 32)
2839 return &AMDGPU::SReg_32RegClass;
2840 if (BitWidth == 64)
2841 return &AMDGPU::SReg_64RegClass;
2842 if (BitWidth == 96)
2843 return &AMDGPU::SGPR_96RegClass;
2844 if (BitWidth == 128)
2845 return &AMDGPU::SGPR_128RegClass;
2846 if (BitWidth == 160)
2847 return &AMDGPU::SGPR_160RegClass;
2848 if (BitWidth == 192)
2849 return &AMDGPU::SGPR_192RegClass;
2850 if (BitWidth == 224)
2851 return &AMDGPU::SGPR_224RegClass;
2852 if (BitWidth == 256)
2853 return &AMDGPU::SGPR_256RegClass;
2854 if (BitWidth == 288)
2855 return &AMDGPU::SGPR_288RegClass;
2856 if (BitWidth == 320)
2857 return &AMDGPU::SGPR_320RegClass;
2858 if (BitWidth == 352)
2859 return &AMDGPU::SGPR_352RegClass;
2860 if (BitWidth == 384)
2861 return &AMDGPU::SGPR_384RegClass;
2862 if (BitWidth == 512)
2863 return &AMDGPU::SGPR_512RegClass;
2864 if (BitWidth == 1024)
2865 return &AMDGPU::SGPR_1024RegClass;
2867 return nullptr;
2870 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
2871 Register Reg) const {
2872 const TargetRegisterClass *RC;
2873 if (Reg.isVirtual())
2874 RC = MRI.getRegClass(Reg);
2875 else
2876 RC = getPhysRegBaseClass(Reg);
2877 return RC ? isSGPRClass(RC) : false;
2880 const TargetRegisterClass *
2881 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
2882 unsigned Size = getRegSizeInBits(*SRC);
2883 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
2884 assert(VRC && "Invalid register class size");
2885 return VRC;
2888 const TargetRegisterClass *
2889 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
2890 unsigned Size = getRegSizeInBits(*SRC);
2891 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
2892 assert(ARC && "Invalid register class size");
2893 return ARC;
2896 const TargetRegisterClass *
2897 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
2898 unsigned Size = getRegSizeInBits(*VRC);
2899 if (Size == 32)
2900 return &AMDGPU::SGPR_32RegClass;
2901 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
2902 assert(SRC && "Invalid register class size");
2903 return SRC;
2906 const TargetRegisterClass *
2907 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
2908 const TargetRegisterClass *SubRC,
2909 unsigned SubIdx) const {
2910 // Ensure this subregister index is aligned in the super register.
2911 const TargetRegisterClass *MatchRC =
2912 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
2913 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
2916 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
2917 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
2918 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
2919 return !ST.hasMFMAInlineLiteralBug();
2921 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
2922 OpType <= AMDGPU::OPERAND_SRC_LAST;
2925 bool SIRegisterInfo::shouldRewriteCopySrc(
2926 const TargetRegisterClass *DefRC,
2927 unsigned DefSubReg,
2928 const TargetRegisterClass *SrcRC,
2929 unsigned SrcSubReg) const {
2930 // We want to prefer the smallest register class possible, so we don't want to
2931 // stop and rewrite on anything that looks like a subregister
2932 // extract. Operations mostly don't care about the super register class, so we
2933 // only want to stop on the most basic of copies between the same register
2934 // class.
2936 // e.g. if we have something like
2937 // %0 = ...
2938 // %1 = ...
2939 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
2940 // %3 = COPY %2, sub0
2942 // We want to look through the COPY to find:
2943 // => %3 = COPY %0
2945 // Plain copy.
2946 return getCommonSubClass(DefRC, SrcRC) != nullptr;
2949 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
2950 // TODO: 64-bit operands have extending behavior from 32-bit literal.
2951 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
2952 OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
2955 /// Returns a lowest register that is not used at any point in the function.
2956 /// If all registers are used, then this function will return
2957 /// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return
2958 /// highest unused register.
2959 MCRegister SIRegisterInfo::findUnusedRegister(
2960 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
2961 const MachineFunction &MF, bool ReserveHighestRegister) const {
2962 if (ReserveHighestRegister) {
2963 for (MCRegister Reg : reverse(*RC))
2964 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2965 return Reg;
2966 } else {
2967 for (MCRegister Reg : *RC)
2968 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2969 return Reg;
2971 return MCRegister();
2974 bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
2975 const RegisterBankInfo &RBI,
2976 Register Reg) const {
2977 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
2978 if (!RB)
2979 return false;
2981 return !RBI.isDivergentRegBank(RB);
2984 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
2985 unsigned EltSize) const {
2986 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
2987 assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
2989 const unsigned RegDWORDs = RegBitWidth / 32;
2990 const unsigned EltDWORDs = EltSize / 4;
2991 assert(RegSplitParts.size() + 1 >= EltDWORDs);
2993 const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
2994 const unsigned NumParts = RegDWORDs / EltDWORDs;
2996 return ArrayRef(Parts.data(), NumParts);
2999 const TargetRegisterClass*
3000 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
3001 Register Reg) const {
3002 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3005 const TargetRegisterClass *
3006 SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
3007 const MachineOperand &MO) const {
3008 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3009 return getSubRegisterClass(SrcRC, MO.getSubReg());
3012 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
3013 Register Reg) const {
3014 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3015 // Registers without classes are unaddressable, SGPR-like registers.
3016 return RC && isVGPRClass(RC);
3019 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
3020 Register Reg) const {
3021 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3023 // Registers without classes are unaddressable, SGPR-like registers.
3024 return RC && isAGPRClass(RC);
3027 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
3028 const TargetRegisterClass *SrcRC,
3029 unsigned SubReg,
3030 const TargetRegisterClass *DstRC,
3031 unsigned DstSubReg,
3032 const TargetRegisterClass *NewRC,
3033 LiveIntervals &LIS) const {
3034 unsigned SrcSize = getRegSizeInBits(*SrcRC);
3035 unsigned DstSize = getRegSizeInBits(*DstRC);
3036 unsigned NewSize = getRegSizeInBits(*NewRC);
3038 // Do not increase size of registers beyond dword, we would need to allocate
3039 // adjacent registers and constraint regalloc more than needed.
3041 // Always allow dword coalescing.
3042 if (SrcSize <= 32 || DstSize <= 32)
3043 return true;
3045 return NewSize <= DstSize || NewSize <= SrcSize;
3048 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
3049 MachineFunction &MF) const {
3050 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3052 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
3053 MF.getFunction());
3054 switch (RC->getID()) {
3055 default:
3056 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3057 case AMDGPU::VGPR_32RegClassID:
3058 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
3059 case AMDGPU::SGPR_32RegClassID:
3060 case AMDGPU::SGPR_LO16RegClassID:
3061 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
3065 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
3066 unsigned Idx) const {
3067 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
3068 Idx == AMDGPU::RegisterPressureSets::AGPR_32)
3069 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3070 const_cast<MachineFunction &>(MF));
3072 if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
3073 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3074 const_cast<MachineFunction &>(MF));
3076 llvm_unreachable("Unexpected register pressure set!");
3079 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
3080 static const int Empty[] = { -1 };
3082 if (RegPressureIgnoredUnits[RegUnit])
3083 return Empty;
3085 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3088 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
3089 // Not a callee saved register.
3090 return AMDGPU::SGPR30_SGPR31;
3093 const TargetRegisterClass *
3094 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
3095 const RegisterBank &RB) const {
3096 switch (RB.getID()) {
3097 case AMDGPU::VGPRRegBankID:
3098 return getVGPRClassForBitWidth(
3099 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3100 case AMDGPU::VCCRegBankID:
3101 assert(Size == 1);
3102 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
3103 : &AMDGPU::SReg_64_XEXECRegClass;
3104 case AMDGPU::SGPRRegBankID:
3105 return getSGPRClassForBitWidth(std::max(32u, Size));
3106 case AMDGPU::AGPRRegBankID:
3107 return getAGPRClassForBitWidth(std::max(32u, Size));
3108 default:
3109 llvm_unreachable("unknown register bank");
3113 const TargetRegisterClass *
3114 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
3115 const MachineRegisterInfo &MRI) const {
3116 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3117 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
3118 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3120 if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>())
3121 return getAllocatableClass(RC);
3123 return nullptr;
3126 MCRegister SIRegisterInfo::getVCC() const {
3127 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3130 MCRegister SIRegisterInfo::getExec() const {
3131 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3134 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
3135 // VGPR tuples have an alignment requirement on gfx90a variants.
3136 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3137 : &AMDGPU::VReg_64RegClass;
3140 const TargetRegisterClass *
3141 SIRegisterInfo::getRegClass(unsigned RCID) const {
3142 switch ((int)RCID) {
3143 case AMDGPU::SReg_1RegClassID:
3144 return getBoolRC();
3145 case AMDGPU::SReg_1_XEXECRegClassID:
3146 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
3147 : &AMDGPU::SReg_64_XEXECRegClass;
3148 case -1:
3149 return nullptr;
3150 default:
3151 return AMDGPUGenRegisterInfo::getRegClass(RCID);
3155 // Find reaching register definition
3156 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
3157 MachineInstr &Use,
3158 MachineRegisterInfo &MRI,
3159 LiveIntervals *LIS) const {
3160 auto &MDT = LIS->getDomTree();
3161 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
3162 SlotIndex DefIdx;
3164 if (Reg.isVirtual()) {
3165 if (!LIS->hasInterval(Reg))
3166 return nullptr;
3167 LiveInterval &LI = LIS->getInterval(Reg);
3168 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3169 : MRI.getMaxLaneMaskForVReg(Reg);
3170 VNInfo *V = nullptr;
3171 if (LI.hasSubRanges()) {
3172 for (auto &S : LI.subranges()) {
3173 if ((S.LaneMask & SubLanes) == SubLanes) {
3174 V = S.getVNInfoAt(UseIdx);
3175 break;
3178 } else {
3179 V = LI.getVNInfoAt(UseIdx);
3181 if (!V)
3182 return nullptr;
3183 DefIdx = V->def;
3184 } else {
3185 // Find last def.
3186 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
3187 LiveRange &LR = LIS->getRegUnit(Unit);
3188 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
3189 if (!DefIdx.isValid() ||
3190 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
3191 LIS->getInstructionFromIndex(V->def)))
3192 DefIdx = V->def;
3193 } else {
3194 return nullptr;
3199 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
3201 if (!Def || !MDT.dominates(Def, &Use))
3202 return nullptr;
3204 assert(Def->modifiesRegister(Reg, this));
3206 return Def;
3209 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
3210 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
3212 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
3213 AMDGPU::SReg_32RegClass,
3214 AMDGPU::AGPR_32RegClass } ) {
3215 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
3216 return Super;
3218 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
3219 &AMDGPU::VGPR_32RegClass)) {
3220 return Super;
3223 return AMDGPU::NoRegister;
3226 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
3227 if (!ST.needsAlignedVGPRs())
3228 return true;
3230 if (isVGPRClass(&RC))
3231 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
3232 if (isAGPRClass(&RC))
3233 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
3234 if (isVectorSuperClass(&RC))
3235 return RC.hasSuperClassEq(
3236 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
3238 return true;
3241 const TargetRegisterClass *
3242 SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
3243 if (!RC || !ST.needsAlignedVGPRs())
3244 return RC;
3246 unsigned Size = getRegSizeInBits(*RC);
3247 if (Size <= 32)
3248 return RC;
3250 if (isVGPRClass(RC))
3251 return getAlignedVGPRClassForBitWidth(Size);
3252 if (isAGPRClass(RC))
3253 return getAlignedAGPRClassForBitWidth(Size);
3254 if (isVectorSuperClass(RC))
3255 return getAlignedVectorSuperClassForBitWidth(Size);
3257 return RC;
3260 ArrayRef<MCPhysReg>
3261 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
3262 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
3265 ArrayRef<MCPhysReg>
3266 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
3267 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
3270 ArrayRef<MCPhysReg>
3271 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
3272 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
3275 unsigned
3276 SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
3277 unsigned SubReg) const {
3278 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
3279 case SIRCFlags::HasSGPR:
3280 return std::min(128u, getSubRegIdxSize(SubReg));
3281 case SIRCFlags::HasAGPR:
3282 case SIRCFlags::HasVGPR:
3283 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR:
3284 return std::min(32u, getSubRegIdxSize(SubReg));
3285 default:
3286 break;
3288 return 0;