1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// SI implementation of the TargetRegisterInfo class.
12 //===----------------------------------------------------------------------===//
15 #include "AMDGPURegisterBankInfo.h"
16 #include "GCNSubtarget.h"
17 #include "MCTargetDesc/AMDGPUInstPrinter.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "SIRegisterInfo.h"
21 #include "llvm/CodeGen/LiveIntervals.h"
22 #include "llvm/CodeGen/LiveRegUnits.h"
23 #include "llvm/CodeGen/MachineDominators.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/RegisterScavenging.h"
29 #define GET_REGINFO_TARGET_DESC
30 #include "AMDGPUGenRegisterInfo.inc"
32 static cl::opt
<bool> EnableSpillSGPRToVGPR(
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
38 std::array
<std::vector
<int16_t>, 16> SIRegisterInfo::RegSplitParts
;
39 std::array
<std::array
<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable
;
41 // Map numbers of DWORDs to indexes in SubRegFromChannelTable.
42 // Valid indexes are shifted 1, such that a 0 mapping means unsupported.
43 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
44 // meaning index 7 in SubRegFromChannelTable.
45 static const std::array
<unsigned, 17> SubRegFromChannelTableWidthMap
= {
46 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
50 // A temporary struct to spill SGPRs.
51 // This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
52 // just v_writelane and v_readlane.
54 // When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
55 // is saved to scratch (or the other way around for loads).
56 // For this, a VGPR is required where the needed lanes can be clobbered. The
57 // RegScavenger can provide a VGPR where currently active lanes can be
58 // clobbered, but we still need to save inactive lanes.
59 // The high-level steps are:
60 // - Try to scavenge SGPR(s) to save exec
61 // - Try to scavenge VGPR
62 // - Save needed, all or inactive lanes of a TmpVGPR
63 // - Spill/Restore SGPRs using TmpVGPR
66 // To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
67 // cannot scavenge temporary SGPRs to save exec, we use the following code:
68 // buffer_store_dword TmpVGPR ; only if active lanes need to be saved
70 // buffer_store_dword TmpVGPR ; save inactive lanes
72 struct SGPRSpillBuilder
{
81 MachineBasicBlock::iterator MI
;
82 ArrayRef
<int16_t> SplitParts
;
87 /* When spilling to stack */
88 // The SGPRs are written into this VGPR, which is then written to scratch
89 // (or vice versa for loads).
90 Register TmpVGPR
= AMDGPU::NoRegister
;
91 // Temporary spill slot to save TmpVGPR to.
93 // If TmpVGPR is live before the spill or if it is scavenged.
94 bool TmpVGPRLive
= false;
95 // Scavenged SGPR to save EXEC.
96 Register SavedExecReg
= AMDGPU::NoRegister
;
97 // Stack index to write the SGPRs to.
102 MachineBasicBlock
*MBB
;
104 SIMachineFunctionInfo
&MFI
;
105 const SIInstrInfo
&TII
;
106 const SIRegisterInfo
&TRI
;
112 SGPRSpillBuilder(const SIRegisterInfo
&TRI
, const SIInstrInfo
&TII
,
113 bool IsWave32
, MachineBasicBlock::iterator MI
, int Index
,
115 : SGPRSpillBuilder(TRI
, TII
, IsWave32
, MI
, MI
->getOperand(0).getReg(),
116 MI
->getOperand(0).isKill(), Index
, RS
) {}
118 SGPRSpillBuilder(const SIRegisterInfo
&TRI
, const SIInstrInfo
&TII
,
119 bool IsWave32
, MachineBasicBlock::iterator MI
, Register Reg
,
120 bool IsKill
, int Index
, RegScavenger
*RS
)
121 : SuperReg(Reg
), MI(MI
), IsKill(IsKill
), DL(MI
->getDebugLoc()),
122 Index(Index
), RS(RS
), MBB(MI
->getParent()), MF(*MBB
->getParent()),
123 MFI(*MF
.getInfo
<SIMachineFunctionInfo
>()), TII(TII
), TRI(TRI
),
125 const TargetRegisterClass
*RC
= TRI
.getPhysRegBaseClass(SuperReg
);
126 SplitParts
= TRI
.getRegSplitParts(RC
, EltSize
);
127 NumSubRegs
= SplitParts
.empty() ? 1 : SplitParts
.size();
130 ExecReg
= AMDGPU::EXEC_LO
;
131 MovOpc
= AMDGPU::S_MOV_B32
;
132 NotOpc
= AMDGPU::S_NOT_B32
;
134 ExecReg
= AMDGPU::EXEC
;
135 MovOpc
= AMDGPU::S_MOV_B64
;
136 NotOpc
= AMDGPU::S_NOT_B64
;
139 assert(SuperReg
!= AMDGPU::M0
&& "m0 should never spill");
140 assert(SuperReg
!= AMDGPU::EXEC_LO
&& SuperReg
!= AMDGPU::EXEC_HI
&&
141 SuperReg
!= AMDGPU::EXEC
&& "exec should never spill");
144 PerVGPRData
getPerVGPRData() {
146 Data
.PerVGPR
= IsWave32
? 32 : 64;
147 Data
.NumVGPRs
= (NumSubRegs
+ (Data
.PerVGPR
- 1)) / Data
.PerVGPR
;
148 Data
.VGPRLanes
= (1LL << std::min(Data
.PerVGPR
, NumSubRegs
)) - 1LL;
152 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
154 // Writes these instructions if an SGPR can be scavenged:
155 // s_mov_b64 s[6:7], exec ; Save exec
156 // s_mov_b64 exec, 3 ; Wanted lanemask
157 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
159 // Writes these instructions if no SGPR can be scavenged:
160 // buffer_store_dword v0 ; Only if no free VGPR was found
161 // s_not_b64 exec, exec
162 // buffer_store_dword v0 ; Save inactive lanes
163 // ; exec stays inverted, it is flipped back in
166 // Scavenged temporary VGPR to use. It must be scavenged once for any number
167 // of spilled subregs.
168 // FIXME: The liveness analysis is limited and does not tell if a register
169 // is in use in lanes that are currently inactive. We can never be sure if
170 // a register as actually in use in another lane, so we need to save all
171 // used lanes of the chosen VGPR.
172 assert(RS
&& "Cannot spill SGPR to memory without RegScavenger");
173 TmpVGPR
= RS
->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass
, MI
, false,
176 // Reserve temporary stack slot
177 TmpVGPRIndex
= MFI
.getScavengeFI(MF
.getFrameInfo(), TRI
);
179 // Found a register that is dead in the currently active lanes, we only
180 // need to spill inactive lanes.
183 // Pick v0 because it doesn't make a difference.
184 TmpVGPR
= AMDGPU::VGPR0
;
189 // We need to inform the scavenger that this index is already in use until
190 // we're done with the custom emergency spill.
191 RS
->assignRegToScavengingIndex(TmpVGPRIndex
, TmpVGPR
);
194 // We may end up recursively calling the scavenger, and don't want to re-use
195 // the same register.
196 RS
->setRegUsed(TmpVGPR
);
198 // Try to scavenge SGPRs to save exec
199 assert(!SavedExecReg
&& "Exec is already saved, refuse to save again");
200 const TargetRegisterClass
&RC
=
201 IsWave32
? AMDGPU::SGPR_32RegClass
: AMDGPU::SGPR_64RegClass
;
202 RS
->setRegUsed(SuperReg
);
203 SavedExecReg
= RS
->scavengeRegisterBackwards(RC
, MI
, false, 0, false);
205 int64_t VGPRLanes
= getPerVGPRData().VGPRLanes
;
208 RS
->setRegUsed(SavedExecReg
);
209 // Set exec to needed lanes
210 BuildMI(*MBB
, MI
, DL
, TII
.get(MovOpc
), SavedExecReg
).addReg(ExecReg
);
212 BuildMI(*MBB
, MI
, DL
, TII
.get(MovOpc
), ExecReg
).addImm(VGPRLanes
);
214 I
.addReg(TmpVGPR
, RegState::ImplicitDefine
);
215 // Spill needed lanes
216 TRI
.buildVGPRSpillLoadStore(*this, TmpVGPRIndex
, 0, /*IsLoad*/ false);
218 // The modify and restore of exec clobber SCC, which we would have to save
219 // and restore. FIXME: We probably would need to reserve a register for
221 if (RS
->isRegUsed(AMDGPU::SCC
))
222 MI
->emitError("unhandled SGPR spill to memory");
224 // Spill active lanes
226 TRI
.buildVGPRSpillLoadStore(*this, TmpVGPRIndex
, 0, /*IsLoad*/ false,
228 // Spill inactive lanes
229 auto I
= BuildMI(*MBB
, MI
, DL
, TII
.get(NotOpc
), ExecReg
).addReg(ExecReg
);
231 I
.addReg(TmpVGPR
, RegState::ImplicitDefine
);
232 I
->getOperand(2).setIsDead(); // Mark SCC as dead.
233 TRI
.buildVGPRSpillLoadStore(*this, TmpVGPRIndex
, 0, /*IsLoad*/ false);
237 // Writes these instructions if an SGPR can be scavenged:
238 // buffer_load_dword v1 ; Write scavenged VGPR to emergency slot
239 // s_waitcnt vmcnt(0) ; If a free VGPR was found
240 // s_mov_b64 exec, s[6:7] ; Save exec
242 // Writes these instructions if no SGPR can be scavenged:
243 // buffer_load_dword v0 ; Restore inactive lanes
244 // s_waitcnt vmcnt(0) ; If a free VGPR was found
245 // s_not_b64 exec, exec
246 // buffer_load_dword v0 ; Only if no free VGPR was found
249 // Restore used lanes
250 TRI
.buildVGPRSpillLoadStore(*this, TmpVGPRIndex
, 0, /*IsLoad*/ true,
253 auto I
= BuildMI(*MBB
, MI
, DL
, TII
.get(MovOpc
), ExecReg
)
254 .addReg(SavedExecReg
, RegState::Kill
);
255 // Add an implicit use of the load so it is not dead.
256 // FIXME This inserts an unnecessary waitcnt
258 I
.addReg(TmpVGPR
, RegState::ImplicitKill
);
261 // Restore inactive lanes
262 TRI
.buildVGPRSpillLoadStore(*this, TmpVGPRIndex
, 0, /*IsLoad*/ true,
264 auto I
= BuildMI(*MBB
, MI
, DL
, TII
.get(NotOpc
), ExecReg
).addReg(ExecReg
);
266 I
.addReg(TmpVGPR
, RegState::ImplicitKill
);
267 I
->getOperand(2).setIsDead(); // Mark SCC as dead.
269 // Restore active lanes
271 TRI
.buildVGPRSpillLoadStore(*this, TmpVGPRIndex
, 0, /*IsLoad*/ true);
274 // Inform the scavenger where we're releasing our custom scavenged register.
276 MachineBasicBlock::iterator RestorePt
= std::prev(MI
);
277 RS
->assignRegToScavengingIndex(TmpVGPRIndex
, TmpVGPR
, &*RestorePt
);
281 // Write TmpVGPR to memory or read TmpVGPR from memory.
282 // Either using a single buffer_load/store if exec is set to the needed mask
288 void readWriteTmpVGPR(unsigned Offset
, bool IsLoad
) {
290 // Spill needed lanes
291 TRI
.buildVGPRSpillLoadStore(*this, Index
, Offset
, IsLoad
);
293 // The modify and restore of exec clobber SCC, which we would have to save
294 // and restore. FIXME: We probably would need to reserve a register for
296 if (RS
->isRegUsed(AMDGPU::SCC
))
297 MI
->emitError("unhandled SGPR spill to memory");
299 // Spill active lanes
300 TRI
.buildVGPRSpillLoadStore(*this, Index
, Offset
, IsLoad
,
302 // Spill inactive lanes
303 auto Not0
= BuildMI(*MBB
, MI
, DL
, TII
.get(NotOpc
), ExecReg
).addReg(ExecReg
);
304 Not0
->getOperand(2).setIsDead(); // Mark SCC as dead.
305 TRI
.buildVGPRSpillLoadStore(*this, Index
, Offset
, IsLoad
);
306 auto Not1
= BuildMI(*MBB
, MI
, DL
, TII
.get(NotOpc
), ExecReg
).addReg(ExecReg
);
307 Not1
->getOperand(2).setIsDead(); // Mark SCC as dead.
311 void setMI(MachineBasicBlock
*NewMBB
, MachineBasicBlock::iterator NewMI
) {
312 assert(MBB
->getParent() == &MF
);
320 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget
&ST
)
321 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG
, ST
.getAMDGPUDwarfFlavour(),
322 ST
.getAMDGPUDwarfFlavour()),
323 ST(ST
), SpillSGPRToVGPR(EnableSpillSGPRToVGPR
), isWave32(ST
.isWave32()) {
325 assert(getSubRegIndexLaneMask(AMDGPU::sub0
).getAsInteger() == 3 &&
326 getSubRegIndexLaneMask(AMDGPU::sub31
).getAsInteger() == (3ULL << 62) &&
327 (getSubRegIndexLaneMask(AMDGPU::lo16
) |
328 getSubRegIndexLaneMask(AMDGPU::hi16
)).getAsInteger() ==
329 getSubRegIndexLaneMask(AMDGPU::sub0
).getAsInteger() &&
330 "getNumCoveredRegs() will not work with generated subreg masks!");
332 RegPressureIgnoredUnits
.resize(getNumRegUnits());
333 RegPressureIgnoredUnits
.set(*regunits(MCRegister::from(AMDGPU::M0
)).begin());
334 for (auto Reg
: AMDGPU::VGPR_16RegClass
) {
335 if (AMDGPU::isHi(Reg
, *this))
336 RegPressureIgnoredUnits
.set(*regunits(Reg
).begin());
339 // HACK: Until this is fully tablegen'd.
340 static llvm::once_flag InitializeRegSplitPartsFlag
;
342 static auto InitializeRegSplitPartsOnce
= [this]() {
343 for (unsigned Idx
= 1, E
= getNumSubRegIndices() - 1; Idx
< E
; ++Idx
) {
344 unsigned Size
= getSubRegIdxSize(Idx
);
347 std::vector
<int16_t> &Vec
= RegSplitParts
[Size
/ 32 - 1];
348 unsigned Pos
= getSubRegIdxOffset(Idx
);
353 unsigned MaxNumParts
= 1024 / Size
; // Maximum register is 1024 bits.
354 Vec
.resize(MaxNumParts
);
360 static llvm::once_flag InitializeSubRegFromChannelTableFlag
;
362 static auto InitializeSubRegFromChannelTableOnce
= [this]() {
363 for (auto &Row
: SubRegFromChannelTable
)
364 Row
.fill(AMDGPU::NoSubRegister
);
365 for (unsigned Idx
= 1; Idx
< getNumSubRegIndices(); ++Idx
) {
366 unsigned Width
= getSubRegIdxSize(Idx
) / 32;
367 unsigned Offset
= getSubRegIdxOffset(Idx
) / 32;
368 assert(Width
< SubRegFromChannelTableWidthMap
.size());
369 Width
= SubRegFromChannelTableWidthMap
[Width
];
372 unsigned TableIdx
= Width
- 1;
373 assert(TableIdx
< SubRegFromChannelTable
.size());
374 assert(Offset
< SubRegFromChannelTable
[TableIdx
].size());
375 SubRegFromChannelTable
[TableIdx
][Offset
] = Idx
;
379 llvm::call_once(InitializeRegSplitPartsFlag
, InitializeRegSplitPartsOnce
);
380 llvm::call_once(InitializeSubRegFromChannelTableFlag
,
381 InitializeSubRegFromChannelTableOnce
);
384 void SIRegisterInfo::reserveRegisterTuples(BitVector
&Reserved
,
385 MCRegister Reg
) const {
386 for (MCRegAliasIterator
R(Reg
, this, true); R
.isValid(); ++R
)
390 // Forced to be here by one .inc
391 const MCPhysReg
*SIRegisterInfo::getCalleeSavedRegs(
392 const MachineFunction
*MF
) const {
393 CallingConv::ID CC
= MF
->getFunction().getCallingConv();
396 case CallingConv::Fast
:
397 case CallingConv::Cold
:
398 return ST
.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
399 : CSR_AMDGPU_SaveList
;
400 case CallingConv::AMDGPU_Gfx
:
401 return ST
.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
402 : CSR_AMDGPU_SI_Gfx_SaveList
;
403 case CallingConv::AMDGPU_CS_ChainPreserve
:
404 return CSR_AMDGPU_CS_ChainPreserve_SaveList
;
406 // Dummy to not crash RegisterClassInfo.
407 static const MCPhysReg NoCalleeSavedReg
= AMDGPU::NoRegister
;
408 return &NoCalleeSavedReg
;
414 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction
*MF
) const {
418 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction
&MF
,
419 CallingConv::ID CC
) const {
422 case CallingConv::Fast
:
423 case CallingConv::Cold
:
424 return ST
.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
425 : CSR_AMDGPU_RegMask
;
426 case CallingConv::AMDGPU_Gfx
:
427 return ST
.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
428 : CSR_AMDGPU_SI_Gfx_RegMask
;
429 case CallingConv::AMDGPU_CS_Chain
:
430 case CallingConv::AMDGPU_CS_ChainPreserve
:
431 // Calls to these functions never return, so we can pretend everything is
433 return AMDGPU_AllVGPRs_RegMask
;
439 const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
440 return CSR_AMDGPU_NoRegs_RegMask
;
443 bool SIRegisterInfo::isChainScratchRegister(Register VGPR
) {
444 return VGPR
>= AMDGPU::VGPR0
&& VGPR
< AMDGPU::VGPR8
;
447 const TargetRegisterClass
*
448 SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass
*RC
,
449 const MachineFunction
&MF
) const {
450 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
451 // equivalent AV class. If used one, the verifier will crash after
452 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given
453 // until Instruction selection.
454 if (ST
.hasMAIInsts() && (isVGPRClass(RC
) || isAGPRClass(RC
))) {
455 if (RC
== &AMDGPU::VGPR_32RegClass
|| RC
== &AMDGPU::AGPR_32RegClass
)
456 return &AMDGPU::AV_32RegClass
;
457 if (RC
== &AMDGPU::VReg_64RegClass
|| RC
== &AMDGPU::AReg_64RegClass
)
458 return &AMDGPU::AV_64RegClass
;
459 if (RC
== &AMDGPU::VReg_64_Align2RegClass
||
460 RC
== &AMDGPU::AReg_64_Align2RegClass
)
461 return &AMDGPU::AV_64_Align2RegClass
;
462 if (RC
== &AMDGPU::VReg_96RegClass
|| RC
== &AMDGPU::AReg_96RegClass
)
463 return &AMDGPU::AV_96RegClass
;
464 if (RC
== &AMDGPU::VReg_96_Align2RegClass
||
465 RC
== &AMDGPU::AReg_96_Align2RegClass
)
466 return &AMDGPU::AV_96_Align2RegClass
;
467 if (RC
== &AMDGPU::VReg_128RegClass
|| RC
== &AMDGPU::AReg_128RegClass
)
468 return &AMDGPU::AV_128RegClass
;
469 if (RC
== &AMDGPU::VReg_128_Align2RegClass
||
470 RC
== &AMDGPU::AReg_128_Align2RegClass
)
471 return &AMDGPU::AV_128_Align2RegClass
;
472 if (RC
== &AMDGPU::VReg_160RegClass
|| RC
== &AMDGPU::AReg_160RegClass
)
473 return &AMDGPU::AV_160RegClass
;
474 if (RC
== &AMDGPU::VReg_160_Align2RegClass
||
475 RC
== &AMDGPU::AReg_160_Align2RegClass
)
476 return &AMDGPU::AV_160_Align2RegClass
;
477 if (RC
== &AMDGPU::VReg_192RegClass
|| RC
== &AMDGPU::AReg_192RegClass
)
478 return &AMDGPU::AV_192RegClass
;
479 if (RC
== &AMDGPU::VReg_192_Align2RegClass
||
480 RC
== &AMDGPU::AReg_192_Align2RegClass
)
481 return &AMDGPU::AV_192_Align2RegClass
;
482 if (RC
== &AMDGPU::VReg_256RegClass
|| RC
== &AMDGPU::AReg_256RegClass
)
483 return &AMDGPU::AV_256RegClass
;
484 if (RC
== &AMDGPU::VReg_256_Align2RegClass
||
485 RC
== &AMDGPU::AReg_256_Align2RegClass
)
486 return &AMDGPU::AV_256_Align2RegClass
;
487 if (RC
== &AMDGPU::VReg_512RegClass
|| RC
== &AMDGPU::AReg_512RegClass
)
488 return &AMDGPU::AV_512RegClass
;
489 if (RC
== &AMDGPU::VReg_512_Align2RegClass
||
490 RC
== &AMDGPU::AReg_512_Align2RegClass
)
491 return &AMDGPU::AV_512_Align2RegClass
;
492 if (RC
== &AMDGPU::VReg_1024RegClass
|| RC
== &AMDGPU::AReg_1024RegClass
)
493 return &AMDGPU::AV_1024RegClass
;
494 if (RC
== &AMDGPU::VReg_1024_Align2RegClass
||
495 RC
== &AMDGPU::AReg_1024_Align2RegClass
)
496 return &AMDGPU::AV_1024_Align2RegClass
;
499 return TargetRegisterInfo::getLargestLegalSuperClass(RC
, MF
);
502 Register
SIRegisterInfo::getFrameRegister(const MachineFunction
&MF
) const {
503 const SIFrameLowering
*TFI
= ST
.getFrameLowering();
504 const SIMachineFunctionInfo
*FuncInfo
= MF
.getInfo
<SIMachineFunctionInfo
>();
505 // During ISel lowering we always reserve the stack pointer in entry and chain
506 // functions, but never actually want to reference it when accessing our own
507 // frame. If we need a frame pointer we use it, but otherwise we can just use
508 // an immediate "0" which we represent by returning NoRegister.
509 if (FuncInfo
->isBottomOfStack()) {
510 return TFI
->hasFP(MF
) ? FuncInfo
->getFrameOffsetReg() : Register();
512 return TFI
->hasFP(MF
) ? FuncInfo
->getFrameOffsetReg()
513 : FuncInfo
->getStackPtrOffsetReg();
516 bool SIRegisterInfo::hasBasePointer(const MachineFunction
&MF
) const {
517 // When we need stack realignment, we can't reference off of the
518 // stack pointer, so we reserve a base pointer.
519 const MachineFrameInfo
&MFI
= MF
.getFrameInfo();
520 return MFI
.getNumFixedObjects() && shouldRealignStack(MF
);
523 Register
SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34
; }
525 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
526 return AMDGPU_AllVGPRs_RegMask
;
529 const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
530 return AMDGPU_AllAGPRs_RegMask
;
533 const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
534 return AMDGPU_AllVectorRegs_RegMask
;
537 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
538 return AMDGPU_AllAllocatableSRegs_RegMask
;
541 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel
,
543 assert(NumRegs
< SubRegFromChannelTableWidthMap
.size());
544 unsigned NumRegIndex
= SubRegFromChannelTableWidthMap
[NumRegs
];
545 assert(NumRegIndex
&& "Not implemented");
546 assert(Channel
< SubRegFromChannelTable
[NumRegIndex
- 1].size());
547 return SubRegFromChannelTable
[NumRegIndex
- 1][Channel
];
551 SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction
&MF
,
552 const unsigned Align
,
553 const TargetRegisterClass
*RC
) const {
554 unsigned BaseIdx
= alignDown(ST
.getMaxNumSGPRs(MF
), Align
) - Align
;
555 MCRegister
BaseReg(AMDGPU::SGPR_32RegClass
.getRegister(BaseIdx
));
556 return getMatchingSuperReg(BaseReg
, AMDGPU::sub0
, RC
);
559 MCRegister
SIRegisterInfo::reservedPrivateSegmentBufferReg(
560 const MachineFunction
&MF
) const {
561 return getAlignedHighSGPRForRC(MF
, /*Align=*/4, &AMDGPU::SGPR_128RegClass
);
564 BitVector
SIRegisterInfo::getReservedRegs(const MachineFunction
&MF
) const {
565 BitVector
Reserved(getNumRegs());
566 Reserved
.set(AMDGPU::MODE
);
568 const SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
570 // Reserve special purpose registers.
572 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
573 // this seems likely to result in bugs, so I'm marking them as reserved.
574 reserveRegisterTuples(Reserved
, AMDGPU::EXEC
);
575 reserveRegisterTuples(Reserved
, AMDGPU::FLAT_SCR
);
577 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
578 reserveRegisterTuples(Reserved
, AMDGPU::M0
);
580 // Reserve src_vccz, src_execz, src_scc.
581 reserveRegisterTuples(Reserved
, AMDGPU::SRC_VCCZ
);
582 reserveRegisterTuples(Reserved
, AMDGPU::SRC_EXECZ
);
583 reserveRegisterTuples(Reserved
, AMDGPU::SRC_SCC
);
585 // Reserve the memory aperture registers
586 reserveRegisterTuples(Reserved
, AMDGPU::SRC_SHARED_BASE
);
587 reserveRegisterTuples(Reserved
, AMDGPU::SRC_SHARED_LIMIT
);
588 reserveRegisterTuples(Reserved
, AMDGPU::SRC_PRIVATE_BASE
);
589 reserveRegisterTuples(Reserved
, AMDGPU::SRC_PRIVATE_LIMIT
);
591 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
592 reserveRegisterTuples(Reserved
, AMDGPU::SRC_POPS_EXITING_WAVE_ID
);
594 // Reserve xnack_mask registers - support is not implemented in Codegen.
595 reserveRegisterTuples(Reserved
, AMDGPU::XNACK_MASK
);
597 // Reserve lds_direct register - support is not implemented in Codegen.
598 reserveRegisterTuples(Reserved
, AMDGPU::LDS_DIRECT
);
600 // Reserve Trap Handler registers - support is not implemented in Codegen.
601 reserveRegisterTuples(Reserved
, AMDGPU::TBA
);
602 reserveRegisterTuples(Reserved
, AMDGPU::TMA
);
603 reserveRegisterTuples(Reserved
, AMDGPU::TTMP0_TTMP1
);
604 reserveRegisterTuples(Reserved
, AMDGPU::TTMP2_TTMP3
);
605 reserveRegisterTuples(Reserved
, AMDGPU::TTMP4_TTMP5
);
606 reserveRegisterTuples(Reserved
, AMDGPU::TTMP6_TTMP7
);
607 reserveRegisterTuples(Reserved
, AMDGPU::TTMP8_TTMP9
);
608 reserveRegisterTuples(Reserved
, AMDGPU::TTMP10_TTMP11
);
609 reserveRegisterTuples(Reserved
, AMDGPU::TTMP12_TTMP13
);
610 reserveRegisterTuples(Reserved
, AMDGPU::TTMP14_TTMP15
);
612 // Reserve null register - it shall never be allocated
613 reserveRegisterTuples(Reserved
, AMDGPU::SGPR_NULL64
);
617 unsigned MaxNumSGPRs
= ST
.getMaxNumSGPRs(MF
);
618 unsigned TotalNumSGPRs
= AMDGPU::SGPR_32RegClass
.getNumRegs();
619 for (const TargetRegisterClass
*RC
: regclasses()) {
620 if (RC
->isBaseClass() && isSGPRClass(RC
)) {
621 unsigned NumRegs
= divideCeil(getRegSizeInBits(*RC
), 32);
622 for (MCPhysReg Reg
: *RC
) {
623 unsigned Index
= getHWRegIndex(Reg
);
624 if (Index
+ NumRegs
> MaxNumSGPRs
&& Index
< TotalNumSGPRs
)
630 Register ScratchRSrcReg
= MFI
->getScratchRSrcReg();
631 if (ScratchRSrcReg
!= AMDGPU::NoRegister
) {
632 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
634 // TODO: May need to reserve a VGPR if doing LDS spilling.
635 reserveRegisterTuples(Reserved
, ScratchRSrcReg
);
638 Register LongBranchReservedReg
= MFI
->getLongBranchReservedReg();
639 if (LongBranchReservedReg
)
640 reserveRegisterTuples(Reserved
, LongBranchReservedReg
);
642 // We have to assume the SP is needed in case there are calls in the function,
643 // which is detected after the function is lowered. If we aren't really going
644 // to need SP, don't bother reserving it.
645 MCRegister StackPtrReg
= MFI
->getStackPtrOffsetReg();
647 reserveRegisterTuples(Reserved
, StackPtrReg
);
648 assert(!isSubRegister(ScratchRSrcReg
, StackPtrReg
));
651 MCRegister FrameReg
= MFI
->getFrameOffsetReg();
653 reserveRegisterTuples(Reserved
, FrameReg
);
654 assert(!isSubRegister(ScratchRSrcReg
, FrameReg
));
657 if (hasBasePointer(MF
)) {
658 MCRegister BasePtrReg
= getBaseRegister();
659 reserveRegisterTuples(Reserved
, BasePtrReg
);
660 assert(!isSubRegister(ScratchRSrcReg
, BasePtrReg
));
663 // FIXME: Use same reserved register introduced in D149775
664 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
665 Register ExecCopyReg
= MFI
->getSGPRForEXECCopy();
667 reserveRegisterTuples(Reserved
, ExecCopyReg
);
669 // Reserve VGPRs/AGPRs.
671 unsigned MaxNumVGPRs
= ST
.getMaxNumVGPRs(MF
);
672 unsigned MaxNumAGPRs
= MaxNumVGPRs
;
673 unsigned TotalNumVGPRs
= AMDGPU::VGPR_32RegClass
.getNumRegs();
675 // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
676 // a wave may have up to 512 total vector registers combining together both
677 // VGPRs and AGPRs. Hence, in an entry function without calls and without
678 // AGPRs used within it, it is possible to use the whole vector register
681 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
682 // register file accordingly.
683 if (ST
.hasGFX90AInsts()) {
684 if (MFI
->usesAGPRs(MF
)) {
686 MaxNumAGPRs
= MaxNumVGPRs
;
688 if (MaxNumVGPRs
> TotalNumVGPRs
) {
689 MaxNumAGPRs
= MaxNumVGPRs
- TotalNumVGPRs
;
690 MaxNumVGPRs
= TotalNumVGPRs
;
696 for (const TargetRegisterClass
*RC
: regclasses()) {
697 if (RC
->isBaseClass() && isVGPRClass(RC
)) {
698 unsigned NumRegs
= divideCeil(getRegSizeInBits(*RC
), 32);
699 for (MCPhysReg Reg
: *RC
) {
700 unsigned Index
= getHWRegIndex(Reg
);
701 if (Index
+ NumRegs
> MaxNumVGPRs
)
707 // Reserve all the AGPRs if there are no instructions to use it.
708 if (!ST
.hasMAIInsts())
710 for (const TargetRegisterClass
*RC
: regclasses()) {
711 if (RC
->isBaseClass() && isAGPRClass(RC
)) {
712 unsigned NumRegs
= divideCeil(getRegSizeInBits(*RC
), 32);
713 for (MCPhysReg Reg
: *RC
) {
714 unsigned Index
= getHWRegIndex(Reg
);
715 if (Index
+ NumRegs
> MaxNumAGPRs
)
721 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
722 // VGPR available at all times.
723 if (ST
.hasMAIInsts() && !ST
.hasGFX90AInsts()) {
724 reserveRegisterTuples(Reserved
, MFI
->getVGPRForAGPRCopy());
727 for (Register Reg
: MFI
->getWWMReservedRegs())
728 reserveRegisterTuples(Reserved
, Reg
);
730 // FIXME: Stop using reserved registers for this.
731 for (MCPhysReg Reg
: MFI
->getAGPRSpillVGPRs())
732 reserveRegisterTuples(Reserved
, Reg
);
734 for (MCPhysReg Reg
: MFI
->getVGPRSpillAGPRs())
735 reserveRegisterTuples(Reserved
, Reg
);
740 bool SIRegisterInfo::isAsmClobberable(const MachineFunction
&MF
,
741 MCRegister PhysReg
) const {
742 return !MF
.getRegInfo().isReserved(PhysReg
);
745 bool SIRegisterInfo::shouldRealignStack(const MachineFunction
&MF
) const {
746 const SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
747 // On entry or in chain functions, the base address is 0, so it can't possibly
748 // need any more alignment.
750 // FIXME: Should be able to specify the entry frame alignment per calling
751 // convention instead.
752 if (Info
->isBottomOfStack())
755 return TargetRegisterInfo::shouldRealignStack(MF
);
758 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction
&Fn
) const {
759 const SIMachineFunctionInfo
*Info
= Fn
.getInfo
<SIMachineFunctionInfo
>();
760 if (Info
->isEntryFunction()) {
761 const MachineFrameInfo
&MFI
= Fn
.getFrameInfo();
762 return MFI
.hasStackObjects() || MFI
.hasCalls();
765 // May need scavenger for dealing with callee saved registers.
769 bool SIRegisterInfo::requiresFrameIndexScavenging(
770 const MachineFunction
&MF
) const {
771 // Do not use frame virtual registers. They used to be used for SGPRs, but
772 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
773 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
778 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
779 const MachineFunction
&MF
) const {
780 const MachineFrameInfo
&MFI
= MF
.getFrameInfo();
781 return MFI
.hasStackObjects();
784 bool SIRegisterInfo::requiresVirtualBaseRegisters(
785 const MachineFunction
&) const {
786 // There are no special dedicated stack or frame pointers.
790 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr
*MI
) const {
791 assert(SIInstrInfo::isMUBUF(*MI
) || SIInstrInfo::isFLATScratch(*MI
));
793 int OffIdx
= AMDGPU::getNamedOperandIdx(MI
->getOpcode(),
794 AMDGPU::OpName::offset
);
795 return MI
->getOperand(OffIdx
).getImm();
798 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr
*MI
,
800 if (!SIInstrInfo::isMUBUF(*MI
) && !SIInstrInfo::isFLATScratch(*MI
))
803 assert((Idx
== AMDGPU::getNamedOperandIdx(MI
->getOpcode(),
804 AMDGPU::OpName::vaddr
) ||
805 (Idx
== AMDGPU::getNamedOperandIdx(MI
->getOpcode(),
806 AMDGPU::OpName::saddr
))) &&
807 "Should never see frame index on non-address operand");
809 return getScratchInstrOffset(MI
);
812 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr
*MI
, int64_t Offset
) const {
813 if (!SIInstrInfo::isMUBUF(*MI
) && !SIInstrInfo::isFLATScratch(*MI
))
816 int64_t FullOffset
= Offset
+ getScratchInstrOffset(MI
);
818 const SIInstrInfo
*TII
= ST
.getInstrInfo();
819 if (SIInstrInfo::isMUBUF(*MI
))
820 return !TII
->isLegalMUBUFImmOffset(FullOffset
);
822 return !TII
->isLegalFLATOffset(FullOffset
, AMDGPUAS::PRIVATE_ADDRESS
,
823 SIInstrFlags::FlatScratch
);
826 Register
SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock
*MBB
,
828 int64_t Offset
) const {
829 MachineBasicBlock::iterator Ins
= MBB
->begin();
830 DebugLoc DL
; // Defaults to "unknown"
832 if (Ins
!= MBB
->end())
833 DL
= Ins
->getDebugLoc();
835 MachineFunction
*MF
= MBB
->getParent();
836 const SIInstrInfo
*TII
= ST
.getInstrInfo();
837 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
838 unsigned MovOpc
= ST
.enableFlatScratch() ? AMDGPU::S_MOV_B32
839 : AMDGPU::V_MOV_B32_e32
;
841 Register BaseReg
= MRI
.createVirtualRegister(
842 ST
.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
843 : &AMDGPU::VGPR_32RegClass
);
846 BuildMI(*MBB
, Ins
, DL
, TII
->get(MovOpc
), BaseReg
)
847 .addFrameIndex(FrameIdx
);
851 Register OffsetReg
= MRI
.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass
);
853 Register FIReg
= MRI
.createVirtualRegister(
854 ST
.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
855 : &AMDGPU::VGPR_32RegClass
);
857 BuildMI(*MBB
, Ins
, DL
, TII
->get(AMDGPU::S_MOV_B32
), OffsetReg
)
859 BuildMI(*MBB
, Ins
, DL
, TII
->get(MovOpc
), FIReg
)
860 .addFrameIndex(FrameIdx
);
862 if (ST
.enableFlatScratch() ) {
863 BuildMI(*MBB
, Ins
, DL
, TII
->get(AMDGPU::S_ADD_I32
), BaseReg
)
864 .addReg(OffsetReg
, RegState::Kill
)
869 TII
->getAddNoCarry(*MBB
, Ins
, DL
, BaseReg
)
870 .addReg(OffsetReg
, RegState::Kill
)
872 .addImm(0); // clamp bit
877 void SIRegisterInfo::resolveFrameIndex(MachineInstr
&MI
, Register BaseReg
,
878 int64_t Offset
) const {
879 const SIInstrInfo
*TII
= ST
.getInstrInfo();
880 bool IsFlat
= TII
->isFLATScratch(MI
);
883 // FIXME: Is it possible to be storing a frame index to itself?
885 for (const MachineOperand
&MO
: MI
.operands()) {
888 llvm_unreachable("should not see multiple frame indices");
895 MachineOperand
*FIOp
=
896 TII
->getNamedOperand(MI
, IsFlat
? AMDGPU::OpName::saddr
897 : AMDGPU::OpName::vaddr
);
899 MachineOperand
*OffsetOp
= TII
->getNamedOperand(MI
, AMDGPU::OpName::offset
);
900 int64_t NewOffset
= OffsetOp
->getImm() + Offset
;
902 assert(FIOp
&& FIOp
->isFI() && "frame index must be address operand");
903 assert(TII
->isMUBUF(MI
) || TII
->isFLATScratch(MI
));
906 assert(TII
->isLegalFLATOffset(NewOffset
, AMDGPUAS::PRIVATE_ADDRESS
,
907 SIInstrFlags::FlatScratch
) &&
908 "offset should be legal");
909 FIOp
->ChangeToRegister(BaseReg
, false);
910 OffsetOp
->setImm(NewOffset
);
915 MachineOperand
*SOffset
= TII
->getNamedOperand(MI
, AMDGPU::OpName::soffset
);
916 assert(SOffset
->isImm() && SOffset
->getImm() == 0);
919 assert(TII
->isLegalMUBUFImmOffset(NewOffset
) && "offset should be legal");
921 FIOp
->ChangeToRegister(BaseReg
, false);
922 OffsetOp
->setImm(NewOffset
);
925 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr
*MI
,
927 int64_t Offset
) const {
928 if (!SIInstrInfo::isMUBUF(*MI
) && !SIInstrInfo::isFLATScratch(*MI
))
931 int64_t NewOffset
= Offset
+ getScratchInstrOffset(MI
);
933 const SIInstrInfo
*TII
= ST
.getInstrInfo();
934 if (SIInstrInfo::isMUBUF(*MI
))
935 return TII
->isLegalMUBUFImmOffset(NewOffset
);
937 return TII
->isLegalFLATOffset(NewOffset
, AMDGPUAS::PRIVATE_ADDRESS
,
938 SIInstrFlags::FlatScratch
);
941 const TargetRegisterClass
*SIRegisterInfo::getPointerRegClass(
942 const MachineFunction
&MF
, unsigned Kind
) const {
943 // This is inaccurate. It depends on the instruction and address space. The
944 // only place where we should hit this is for dealing with frame indexes /
945 // private accesses, so this is correct in that case.
946 return &AMDGPU::VGPR_32RegClass
;
949 const TargetRegisterClass
*
950 SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass
*RC
) const {
951 if (isAGPRClass(RC
) && !ST
.hasGFX90AInsts())
952 return getEquivalentVGPRClass(RC
);
953 if (RC
== &AMDGPU::SCC_CLASSRegClass
)
954 return getWaveMaskRegClass();
959 static unsigned getNumSubRegsForSpillOp(unsigned Op
) {
962 case AMDGPU::SI_SPILL_S1024_SAVE
:
963 case AMDGPU::SI_SPILL_S1024_RESTORE
:
964 case AMDGPU::SI_SPILL_V1024_SAVE
:
965 case AMDGPU::SI_SPILL_V1024_RESTORE
:
966 case AMDGPU::SI_SPILL_A1024_SAVE
:
967 case AMDGPU::SI_SPILL_A1024_RESTORE
:
968 case AMDGPU::SI_SPILL_AV1024_SAVE
:
969 case AMDGPU::SI_SPILL_AV1024_RESTORE
:
971 case AMDGPU::SI_SPILL_S512_SAVE
:
972 case AMDGPU::SI_SPILL_S512_RESTORE
:
973 case AMDGPU::SI_SPILL_V512_SAVE
:
974 case AMDGPU::SI_SPILL_V512_RESTORE
:
975 case AMDGPU::SI_SPILL_A512_SAVE
:
976 case AMDGPU::SI_SPILL_A512_RESTORE
:
977 case AMDGPU::SI_SPILL_AV512_SAVE
:
978 case AMDGPU::SI_SPILL_AV512_RESTORE
:
980 case AMDGPU::SI_SPILL_S384_SAVE
:
981 case AMDGPU::SI_SPILL_S384_RESTORE
:
982 case AMDGPU::SI_SPILL_V384_SAVE
:
983 case AMDGPU::SI_SPILL_V384_RESTORE
:
984 case AMDGPU::SI_SPILL_A384_SAVE
:
985 case AMDGPU::SI_SPILL_A384_RESTORE
:
986 case AMDGPU::SI_SPILL_AV384_SAVE
:
987 case AMDGPU::SI_SPILL_AV384_RESTORE
:
989 case AMDGPU::SI_SPILL_S352_SAVE
:
990 case AMDGPU::SI_SPILL_S352_RESTORE
:
991 case AMDGPU::SI_SPILL_V352_SAVE
:
992 case AMDGPU::SI_SPILL_V352_RESTORE
:
993 case AMDGPU::SI_SPILL_A352_SAVE
:
994 case AMDGPU::SI_SPILL_A352_RESTORE
:
995 case AMDGPU::SI_SPILL_AV352_SAVE
:
996 case AMDGPU::SI_SPILL_AV352_RESTORE
:
998 case AMDGPU::SI_SPILL_S320_SAVE
:
999 case AMDGPU::SI_SPILL_S320_RESTORE
:
1000 case AMDGPU::SI_SPILL_V320_SAVE
:
1001 case AMDGPU::SI_SPILL_V320_RESTORE
:
1002 case AMDGPU::SI_SPILL_A320_SAVE
:
1003 case AMDGPU::SI_SPILL_A320_RESTORE
:
1004 case AMDGPU::SI_SPILL_AV320_SAVE
:
1005 case AMDGPU::SI_SPILL_AV320_RESTORE
:
1007 case AMDGPU::SI_SPILL_S288_SAVE
:
1008 case AMDGPU::SI_SPILL_S288_RESTORE
:
1009 case AMDGPU::SI_SPILL_V288_SAVE
:
1010 case AMDGPU::SI_SPILL_V288_RESTORE
:
1011 case AMDGPU::SI_SPILL_A288_SAVE
:
1012 case AMDGPU::SI_SPILL_A288_RESTORE
:
1013 case AMDGPU::SI_SPILL_AV288_SAVE
:
1014 case AMDGPU::SI_SPILL_AV288_RESTORE
:
1016 case AMDGPU::SI_SPILL_S256_SAVE
:
1017 case AMDGPU::SI_SPILL_S256_RESTORE
:
1018 case AMDGPU::SI_SPILL_V256_SAVE
:
1019 case AMDGPU::SI_SPILL_V256_RESTORE
:
1020 case AMDGPU::SI_SPILL_A256_SAVE
:
1021 case AMDGPU::SI_SPILL_A256_RESTORE
:
1022 case AMDGPU::SI_SPILL_AV256_SAVE
:
1023 case AMDGPU::SI_SPILL_AV256_RESTORE
:
1025 case AMDGPU::SI_SPILL_S224_SAVE
:
1026 case AMDGPU::SI_SPILL_S224_RESTORE
:
1027 case AMDGPU::SI_SPILL_V224_SAVE
:
1028 case AMDGPU::SI_SPILL_V224_RESTORE
:
1029 case AMDGPU::SI_SPILL_A224_SAVE
:
1030 case AMDGPU::SI_SPILL_A224_RESTORE
:
1031 case AMDGPU::SI_SPILL_AV224_SAVE
:
1032 case AMDGPU::SI_SPILL_AV224_RESTORE
:
1034 case AMDGPU::SI_SPILL_S192_SAVE
:
1035 case AMDGPU::SI_SPILL_S192_RESTORE
:
1036 case AMDGPU::SI_SPILL_V192_SAVE
:
1037 case AMDGPU::SI_SPILL_V192_RESTORE
:
1038 case AMDGPU::SI_SPILL_A192_SAVE
:
1039 case AMDGPU::SI_SPILL_A192_RESTORE
:
1040 case AMDGPU::SI_SPILL_AV192_SAVE
:
1041 case AMDGPU::SI_SPILL_AV192_RESTORE
:
1043 case AMDGPU::SI_SPILL_S160_SAVE
:
1044 case AMDGPU::SI_SPILL_S160_RESTORE
:
1045 case AMDGPU::SI_SPILL_V160_SAVE
:
1046 case AMDGPU::SI_SPILL_V160_RESTORE
:
1047 case AMDGPU::SI_SPILL_A160_SAVE
:
1048 case AMDGPU::SI_SPILL_A160_RESTORE
:
1049 case AMDGPU::SI_SPILL_AV160_SAVE
:
1050 case AMDGPU::SI_SPILL_AV160_RESTORE
:
1052 case AMDGPU::SI_SPILL_S128_SAVE
:
1053 case AMDGPU::SI_SPILL_S128_RESTORE
:
1054 case AMDGPU::SI_SPILL_V128_SAVE
:
1055 case AMDGPU::SI_SPILL_V128_RESTORE
:
1056 case AMDGPU::SI_SPILL_A128_SAVE
:
1057 case AMDGPU::SI_SPILL_A128_RESTORE
:
1058 case AMDGPU::SI_SPILL_AV128_SAVE
:
1059 case AMDGPU::SI_SPILL_AV128_RESTORE
:
1061 case AMDGPU::SI_SPILL_S96_SAVE
:
1062 case AMDGPU::SI_SPILL_S96_RESTORE
:
1063 case AMDGPU::SI_SPILL_V96_SAVE
:
1064 case AMDGPU::SI_SPILL_V96_RESTORE
:
1065 case AMDGPU::SI_SPILL_A96_SAVE
:
1066 case AMDGPU::SI_SPILL_A96_RESTORE
:
1067 case AMDGPU::SI_SPILL_AV96_SAVE
:
1068 case AMDGPU::SI_SPILL_AV96_RESTORE
:
1070 case AMDGPU::SI_SPILL_S64_SAVE
:
1071 case AMDGPU::SI_SPILL_S64_RESTORE
:
1072 case AMDGPU::SI_SPILL_V64_SAVE
:
1073 case AMDGPU::SI_SPILL_V64_RESTORE
:
1074 case AMDGPU::SI_SPILL_A64_SAVE
:
1075 case AMDGPU::SI_SPILL_A64_RESTORE
:
1076 case AMDGPU::SI_SPILL_AV64_SAVE
:
1077 case AMDGPU::SI_SPILL_AV64_RESTORE
:
1079 case AMDGPU::SI_SPILL_S32_SAVE
:
1080 case AMDGPU::SI_SPILL_S32_RESTORE
:
1081 case AMDGPU::SI_SPILL_V32_SAVE
:
1082 case AMDGPU::SI_SPILL_V32_RESTORE
:
1083 case AMDGPU::SI_SPILL_A32_SAVE
:
1084 case AMDGPU::SI_SPILL_A32_RESTORE
:
1085 case AMDGPU::SI_SPILL_AV32_SAVE
:
1086 case AMDGPU::SI_SPILL_AV32_RESTORE
:
1087 case AMDGPU::SI_SPILL_WWM_V32_SAVE
:
1088 case AMDGPU::SI_SPILL_WWM_V32_RESTORE
:
1089 case AMDGPU::SI_SPILL_WWM_AV32_SAVE
:
1090 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE
:
1092 default: llvm_unreachable("Invalid spill opcode");
1096 static int getOffsetMUBUFStore(unsigned Opc
) {
1098 case AMDGPU::BUFFER_STORE_DWORD_OFFEN
:
1099 return AMDGPU::BUFFER_STORE_DWORD_OFFSET
;
1100 case AMDGPU::BUFFER_STORE_BYTE_OFFEN
:
1101 return AMDGPU::BUFFER_STORE_BYTE_OFFSET
;
1102 case AMDGPU::BUFFER_STORE_SHORT_OFFEN
:
1103 return AMDGPU::BUFFER_STORE_SHORT_OFFSET
;
1104 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN
:
1105 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET
;
1106 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN
:
1107 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET
;
1108 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN
:
1109 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET
;
1110 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN
:
1111 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET
;
1112 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN
:
1113 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET
;
1119 static int getOffsetMUBUFLoad(unsigned Opc
) {
1121 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN
:
1122 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET
;
1123 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN
:
1124 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET
;
1125 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN
:
1126 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET
;
1127 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN
:
1128 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET
;
1129 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN
:
1130 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET
;
1131 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN
:
1132 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET
;
1133 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN
:
1134 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET
;
1135 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN
:
1136 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET
;
1137 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN
:
1138 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET
;
1139 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN
:
1140 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET
;
1141 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN
:
1142 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET
;
1143 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN
:
1144 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET
;
1145 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN
:
1146 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET
;
1147 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN
:
1148 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET
;
1154 static int getOffenMUBUFStore(unsigned Opc
) {
1156 case AMDGPU::BUFFER_STORE_DWORD_OFFSET
:
1157 return AMDGPU::BUFFER_STORE_DWORD_OFFEN
;
1158 case AMDGPU::BUFFER_STORE_BYTE_OFFSET
:
1159 return AMDGPU::BUFFER_STORE_BYTE_OFFEN
;
1160 case AMDGPU::BUFFER_STORE_SHORT_OFFSET
:
1161 return AMDGPU::BUFFER_STORE_SHORT_OFFEN
;
1162 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET
:
1163 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN
;
1164 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET
:
1165 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN
;
1166 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET
:
1167 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN
;
1168 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET
:
1169 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN
;
1170 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET
:
1171 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN
;
1177 static int getOffenMUBUFLoad(unsigned Opc
) {
1179 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET
:
1180 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN
;
1181 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET
:
1182 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN
;
1183 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET
:
1184 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN
;
1185 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET
:
1186 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN
;
1187 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET
:
1188 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN
;
1189 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET
:
1190 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN
;
1191 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET
:
1192 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN
;
1193 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET
:
1194 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN
;
1195 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET
:
1196 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN
;
1197 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET
:
1198 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN
;
1199 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET
:
1200 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN
;
1201 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET
:
1202 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN
;
1203 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET
:
1204 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN
;
1205 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET
:
1206 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN
;
1212 static MachineInstrBuilder
spillVGPRtoAGPR(const GCNSubtarget
&ST
,
1213 MachineBasicBlock
&MBB
,
1214 MachineBasicBlock::iterator MI
,
1215 int Index
, unsigned Lane
,
1216 unsigned ValueReg
, bool IsKill
) {
1217 MachineFunction
*MF
= MBB
.getParent();
1218 SIMachineFunctionInfo
*MFI
= MF
->getInfo
<SIMachineFunctionInfo
>();
1219 const SIInstrInfo
*TII
= ST
.getInstrInfo();
1221 MCPhysReg Reg
= MFI
->getVGPRToAGPRSpill(Index
, Lane
);
1223 if (Reg
== AMDGPU::NoRegister
)
1224 return MachineInstrBuilder();
1226 bool IsStore
= MI
->mayStore();
1227 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
1228 auto *TRI
= static_cast<const SIRegisterInfo
*>(MRI
.getTargetRegisterInfo());
1230 unsigned Dst
= IsStore
? Reg
: ValueReg
;
1231 unsigned Src
= IsStore
? ValueReg
: Reg
;
1232 bool IsVGPR
= TRI
->isVGPR(MRI
, Reg
);
1233 DebugLoc DL
= MI
->getDebugLoc();
1234 if (IsVGPR
== TRI
->isVGPR(MRI
, ValueReg
)) {
1235 // Spiller during regalloc may restore a spilled register to its superclass.
1236 // It could result in AGPR spills restored to VGPRs or the other way around,
1237 // making the src and dst with identical regclasses at this point. It just
1238 // needs a copy in such cases.
1239 auto CopyMIB
= BuildMI(MBB
, MI
, DL
, TII
->get(AMDGPU::COPY
), Dst
)
1240 .addReg(Src
, getKillRegState(IsKill
));
1241 CopyMIB
->setAsmPrinterFlag(MachineInstr::ReloadReuse
);
1244 unsigned Opc
= (IsStore
^ IsVGPR
) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1245 : AMDGPU::V_ACCVGPR_READ_B32_e64
;
1247 auto MIB
= BuildMI(MBB
, MI
, DL
, TII
->get(Opc
), Dst
)
1248 .addReg(Src
, getKillRegState(IsKill
));
1249 MIB
->setAsmPrinterFlag(MachineInstr::ReloadReuse
);
1253 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1254 // need to handle the case where an SGPR may need to be spilled while spilling.
1255 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget
&ST
,
1256 MachineFrameInfo
&MFI
,
1257 MachineBasicBlock::iterator MI
,
1260 const SIInstrInfo
*TII
= ST
.getInstrInfo();
1261 MachineBasicBlock
*MBB
= MI
->getParent();
1262 const DebugLoc
&DL
= MI
->getDebugLoc();
1263 bool IsStore
= MI
->mayStore();
1265 unsigned Opc
= MI
->getOpcode();
1266 int LoadStoreOp
= IsStore
?
1267 getOffsetMUBUFStore(Opc
) : getOffsetMUBUFLoad(Opc
);
1268 if (LoadStoreOp
== -1)
1271 const MachineOperand
*Reg
= TII
->getNamedOperand(*MI
, AMDGPU::OpName::vdata
);
1272 if (spillVGPRtoAGPR(ST
, *MBB
, MI
, Index
, 0, Reg
->getReg(), false).getInstr())
1275 MachineInstrBuilder NewMI
=
1276 BuildMI(*MBB
, MI
, DL
, TII
->get(LoadStoreOp
))
1278 .add(*TII
->getNamedOperand(*MI
, AMDGPU::OpName::srsrc
))
1279 .add(*TII
->getNamedOperand(*MI
, AMDGPU::OpName::soffset
))
1285 const MachineOperand
*VDataIn
= TII
->getNamedOperand(*MI
,
1286 AMDGPU::OpName::vdata_in
);
1288 NewMI
.add(*VDataIn
);
1292 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo
*TII
,
1293 unsigned LoadStoreOp
,
1295 bool IsStore
= TII
->get(LoadStoreOp
).mayStore();
1296 bool HasVAddr
= AMDGPU::hasNamedOperand(LoadStoreOp
, AMDGPU::OpName::vaddr
);
1298 !HasVAddr
&& !AMDGPU::hasNamedOperand(LoadStoreOp
, AMDGPU::OpName::saddr
);
1302 LoadStoreOp
= IsStore
? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1303 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR
;
1306 LoadStoreOp
= IsStore
? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1307 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR
;
1310 LoadStoreOp
= IsStore
? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1311 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR
;
1314 LoadStoreOp
= IsStore
? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1315 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR
;
1318 llvm_unreachable("Unexpected spill load/store size!");
1322 LoadStoreOp
= AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp
);
1324 LoadStoreOp
= AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp
);
1329 void SIRegisterInfo::buildSpillLoadStore(
1330 MachineBasicBlock
&MBB
, MachineBasicBlock::iterator MI
, const DebugLoc
&DL
,
1331 unsigned LoadStoreOp
, int Index
, Register ValueReg
, bool IsKill
,
1332 MCRegister ScratchOffsetReg
, int64_t InstOffset
, MachineMemOperand
*MMO
,
1333 RegScavenger
*RS
, LiveRegUnits
*LiveUnits
) const {
1334 assert((!RS
|| !LiveUnits
) && "Only RS or LiveUnits can be set but not both");
1336 MachineFunction
*MF
= MBB
.getParent();
1337 const SIInstrInfo
*TII
= ST
.getInstrInfo();
1338 const MachineFrameInfo
&MFI
= MF
->getFrameInfo();
1339 const SIMachineFunctionInfo
*FuncInfo
= MF
->getInfo
<SIMachineFunctionInfo
>();
1341 const MCInstrDesc
*Desc
= &TII
->get(LoadStoreOp
);
1342 bool IsStore
= Desc
->mayStore();
1343 bool IsFlat
= TII
->isFLATScratch(LoadStoreOp
);
1345 bool CanClobberSCC
= false;
1346 bool Scavenged
= false;
1347 MCRegister SOffset
= ScratchOffsetReg
;
1349 const TargetRegisterClass
*RC
= getRegClassForReg(MF
->getRegInfo(), ValueReg
);
1350 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1351 const bool IsAGPR
= !ST
.hasGFX90AInsts() && isAGPRClass(RC
);
1352 const unsigned RegWidth
= AMDGPU::getRegBitWidth(*RC
) / 8;
1354 // Always use 4 byte operations for AGPRs because we need to scavenge
1355 // a temporary VGPR.
1356 unsigned EltSize
= (IsFlat
&& !IsAGPR
) ? std::min(RegWidth
, 16u) : 4u;
1357 unsigned NumSubRegs
= RegWidth
/ EltSize
;
1358 unsigned Size
= NumSubRegs
* EltSize
;
1359 unsigned RemSize
= RegWidth
- Size
;
1360 unsigned NumRemSubRegs
= RemSize
? 1 : 0;
1361 int64_t Offset
= InstOffset
+ MFI
.getObjectOffset(Index
);
1362 int64_t MaterializedOffset
= Offset
;
1364 int64_t MaxOffset
= Offset
+ Size
+ RemSize
- EltSize
;
1365 int64_t ScratchOffsetRegDelta
= 0;
1367 if (IsFlat
&& EltSize
> 4) {
1368 LoadStoreOp
= getFlatScratchSpillOpcode(TII
, LoadStoreOp
, EltSize
);
1369 Desc
= &TII
->get(LoadStoreOp
);
1372 Align Alignment
= MFI
.getObjectAlign(Index
);
1373 const MachinePointerInfo
&BasePtrInfo
= MMO
->getPointerInfo();
1375 assert((IsFlat
|| ((Offset
% EltSize
) == 0)) &&
1376 "unexpected VGPR spill offset");
1378 // Track a VGPR to use for a constant offset we need to materialize.
1379 Register TmpOffsetVGPR
;
1381 // Track a VGPR to use as an intermediate value.
1382 Register TmpIntermediateVGPR
;
1383 bool UseVGPROffset
= false;
1385 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1387 auto MaterializeVOffset
= [&](Register SGPRBase
, Register TmpVGPR
,
1389 // We are using a VGPR offset
1390 if (IsFlat
&& SGPRBase
) {
1391 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1392 // SGPR, so perform the add as vector.
1393 // We don't need a base SGPR in the kernel.
1395 if (ST
.getConstantBusLimit(AMDGPU::V_ADD_U32_e64
) >= 2) {
1396 BuildMI(MBB
, MI
, DL
, TII
->get(AMDGPU::V_ADD_U32_e64
), TmpVGPR
)
1399 .addImm(0); // clamp
1401 BuildMI(MBB
, MI
, DL
, TII
->get(AMDGPU::V_MOV_B32_e32
), TmpVGPR
)
1403 BuildMI(MBB
, MI
, DL
, TII
->get(AMDGPU::V_ADD_U32_e32
), TmpVGPR
)
1405 .addReg(TmpOffsetVGPR
);
1408 assert(TmpOffsetVGPR
);
1409 BuildMI(MBB
, MI
, DL
, TII
->get(AMDGPU::V_MOV_B32_e32
), TmpVGPR
)
1414 bool IsOffsetLegal
=
1415 IsFlat
? TII
->isLegalFLATOffset(MaxOffset
, AMDGPUAS::PRIVATE_ADDRESS
,
1416 SIInstrFlags::FlatScratch
)
1417 : TII
->isLegalMUBUFImmOffset(MaxOffset
);
1418 if (!IsOffsetLegal
|| (IsFlat
&& !SOffset
&& !ST
.hasFlatScratchSTMode())) {
1419 SOffset
= MCRegister();
1421 // We don't have access to the register scavenger if this function is called
1422 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1423 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1426 SOffset
= RS
->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass
, MI
, false, 0, false);
1428 // Piggy back on the liveness scan we just did see if SCC is dead.
1429 CanClobberSCC
= !RS
->isRegUsed(AMDGPU::SCC
);
1430 } else if (LiveUnits
) {
1431 CanClobberSCC
= LiveUnits
->available(AMDGPU::SCC
);
1432 for (MCRegister Reg
: AMDGPU::SGPR_32RegClass
) {
1433 if (LiveUnits
->available(Reg
) && !MF
->getRegInfo().isReserved(Reg
)) {
1440 if (ScratchOffsetReg
!= AMDGPU::NoRegister
&& !CanClobberSCC
)
1441 SOffset
= Register();
1444 UseVGPROffset
= true;
1447 TmpOffsetVGPR
= RS
->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass
, MI
, false, 0);
1450 for (MCRegister Reg
: AMDGPU::VGPR_32RegClass
) {
1451 if (LiveUnits
->available(Reg
) && !MF
->getRegInfo().isReserved(Reg
)) {
1452 TmpOffsetVGPR
= Reg
;
1458 assert(TmpOffsetVGPR
);
1459 } else if (!SOffset
&& CanClobberSCC
) {
1460 // There are no free SGPRs, and since we are in the process of spilling
1461 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
1462 // on SI/CI and on VI it is true until we implement spilling using scalar
1463 // stores), we have no way to free up an SGPR. Our solution here is to
1464 // add the offset directly to the ScratchOffset or StackPtrOffset
1465 // register, and then subtract the offset after the spill to return the
1466 // register to it's original value.
1468 // TODO: If we don't have to do an emergency stack slot spill, converting
1469 // to use the VGPR offset is fewer instructions.
1470 if (!ScratchOffsetReg
)
1471 ScratchOffsetReg
= FuncInfo
->getStackPtrOffsetReg();
1472 SOffset
= ScratchOffsetReg
;
1473 ScratchOffsetRegDelta
= Offset
;
1478 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1479 // we can simplify the adjustment of Offset here to just scale with
1481 if (!IsFlat
&& !UseVGPROffset
)
1482 Offset
*= ST
.getWavefrontSize();
1484 if (!UseVGPROffset
&& !SOffset
)
1485 report_fatal_error("could not scavenge SGPR to spill in entry function");
1487 if (UseVGPROffset
) {
1488 // We are using a VGPR offset
1489 MaterializeVOffset(ScratchOffsetReg
, TmpOffsetVGPR
, Offset
);
1490 } else if (ScratchOffsetReg
== AMDGPU::NoRegister
) {
1491 BuildMI(MBB
, MI
, DL
, TII
->get(AMDGPU::S_MOV_B32
), SOffset
).addImm(Offset
);
1493 assert(Offset
!= 0);
1494 auto Add
= BuildMI(MBB
, MI
, DL
, TII
->get(AMDGPU::S_ADD_I32
), SOffset
)
1495 .addReg(ScratchOffsetReg
)
1497 Add
->getOperand(3).setIsDead(); // Mark SCC as dead.
1503 if (IsFlat
&& SOffset
== AMDGPU::NoRegister
) {
1504 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp
, AMDGPU::OpName::vaddr
) < 0
1505 && "Unexpected vaddr for flat scratch with a FI operand");
1507 if (UseVGPROffset
) {
1508 LoadStoreOp
= AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp
);
1510 assert(ST
.hasFlatScratchSTMode());
1511 LoadStoreOp
= AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp
);
1514 Desc
= &TII
->get(LoadStoreOp
);
1517 for (unsigned i
= 0, e
= NumSubRegs
+ NumRemSubRegs
, RegOffset
= 0; i
!= e
;
1518 ++i
, RegOffset
+= EltSize
) {
1519 if (i
== NumSubRegs
) {
1521 LoadStoreOp
= getFlatScratchSpillOpcode(TII
, LoadStoreOp
, EltSize
);
1523 Desc
= &TII
->get(LoadStoreOp
);
1525 if (!IsFlat
&& UseVGPROffset
) {
1526 int NewLoadStoreOp
= IsStore
? getOffenMUBUFStore(LoadStoreOp
)
1527 : getOffenMUBUFLoad(LoadStoreOp
);
1528 Desc
= &TII
->get(NewLoadStoreOp
);
1531 if (UseVGPROffset
&& TmpOffsetVGPR
== TmpIntermediateVGPR
) {
    // If we are spilling an AGPR beyond the range of the memory instruction
    // offset and need to use a VGPR offset, we ideally have at least 2
    // scratch VGPRs. If we don't have a second free VGPR without spilling,
    // recycle the VGPR used for the offset which requires resetting after
    // each subregister.
    if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR)
      MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);

    unsigned NumRegs = EltSize / 4;
    Register SubReg = e == 1
        ? ValueReg
        : Register(getSubReg(ValueReg,
                             getSubRegFromChannel(RegOffset / 4, NumRegs)));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    const bool IsLastSubReg = i + 1 == e;
    const bool IsFirstSubReg = i == 0;
    if (IsLastSubReg) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    // Make sure the whole register is defined if there are undef components by
    // adding an implicit def of the super-reg on the first instruction.
    bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
    bool NeedSuperRegImpOperand = e > 1;

    // Remaining element size to spill into memory after some parts of it
    // spilled into either AGPRs or VGPRs.
    unsigned RemEltSize = EltSize;

    // AGPRs to spill VGPRs and vice versa are allocated in a reverse order,
    // starting from the last lane. In case if a register cannot be completely
    // spilled into another register that will ensure its alignment does not
    // change. For targets with VGPR alignment requirement this is important
    // in case of flat scratch usage as we might get a scratch_load or
    // scratch_store of an unaligned register otherwise.
    for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
             LaneE = RegOffset / 4;
         Lane >= LaneE; --Lane) {
      bool IsSubReg = e > 1 || EltSize > 4;
      Register Sub = IsSubReg
          ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
          : ValueReg;
      auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
      if (!MIB.getInstr())
        break;
      if (NeedSuperRegDef ||
          (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
        MIB.addReg(ValueReg, RegState::ImplicitDefine);
        NeedSuperRegDef = false;
      }
      if ((IsSubReg || NeedSuperRegImpOperand) &&
          (IsFirstSubReg || IsLastSubReg)) {
        NeedSuperRegImpOperand = true;
        unsigned State = SrcDstRegState;
        if (!IsLastSubReg || (Lane != LaneE))
          State &= ~RegState::Kill;
        if (!IsFirstSubReg || (Lane != LaneS))
          State &= ~RegState::Define;
        MIB.addReg(ValueReg, RegState::Implicit | State);
      }
      RemEltSize -= 4;
    }

    if (!RemEltSize) // Fully spilled into AGPRs.
      continue;

    if (RemEltSize != EltSize) { // Partially spilled to AGPRs
      assert(IsFlat && EltSize > 4);

      unsigned NumRegs = RemEltSize / 4;
      SubReg = Register(getSubReg(ValueReg,
                                  getSubRegFromChannel(RegOffset / 4, NumRegs)));
      unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
      Desc = &TII->get(Opc);
    }

    unsigned FinalReg = SubReg;

    if (IsAGPR) {
      assert(EltSize == 4);

      if (!TmpIntermediateVGPR) {
        TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
        assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
      }
      if (IsStore) {
        auto AccRead = BuildMI(MBB, MI, DL,
                               TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
                               TmpIntermediateVGPR)
                           .addReg(SubReg, getKillRegState(IsKill));
        if (NeedSuperRegDef)
          AccRead.addReg(ValueReg, RegState::ImplicitDefine);
        AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
      }
      SubReg = TmpIntermediateVGPR;
    } else if (UseVGPROffset) {
      if (!TmpOffsetVGPR) {
        TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                      MI, false, 0);
        RS->setRegUsed(TmpOffsetVGPR);
      }
    }

    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
    MachineMemOperand *NewMMO =
        MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
                                 commonAlignment(Alignment, RegOffset));

    auto MIB =
        BuildMI(MBB, MI, DL, *Desc)
            .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));

    if (UseVGPROffset) {
      // For an AGPR spill, we reuse the same temp VGPR for the offset and the
      // intermediate accvgpr_write.
      MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
    }

    if (!IsFlat)
      MIB.addReg(FuncInfo->getScratchRSrcReg());

    if (SOffset == AMDGPU::NoRegister) {
      if (!IsFlat) {
        if (UseVGPROffset && ScratchOffsetReg) {
          MIB.addReg(ScratchOffsetReg);
        } else {
          assert(FuncInfo->isBottomOfStack());
          MIB.addImm(0);
        }
      }
    } else {
      MIB.addReg(SOffset, SOffsetRegState);
    }

    MIB.addImm(Offset + RegOffset);

    bool LastUse = MMO->getFlags() & MOLastUse;
    MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol

    if (!IsFlat)
      MIB.addImm(0); // swz
    MIB.addMemOperand(NewMMO);

    if (!IsAGPR && NeedSuperRegDef)
      MIB.addReg(ValueReg, RegState::ImplicitDefine);

    if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
      MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
                    FinalReg)
                .addReg(TmpIntermediateVGPR, RegState::Kill);
      MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    }

    if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);

    // The epilog restore of a wwm-scratch register can cause undesired
    // optimization during machine-cp post PrologEpilogInserter if the same
    // register was assigned for return value ABI lowering with a COPY
    // instruction. As given below, with the epilog reload, the earlier COPY
    // appeared to be dead during machine-cp.
    //
    // v0 in WWM operation, needs the WWM spill at prolog/epilog.
    // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
    //
    // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
    //
    // WWM spill restore to preserve the inactive lanes of v0.
    // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
    // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
    // $exec = S_MOV_B64 killed $sgpr4_sgpr5
    //
    // SI_RETURN implicit $vgpr0
    //
    // To fix it, mark the same reg as a tied op for such restore instructions
    // so that it marks a usage for the preceding COPY.
    if (!IsStore && MI != MBB.end() && MI->isReturn() &&
        MI->readsRegister(SubReg, this)) {
      MIB.addReg(SubReg, RegState::Implicit);
      MIB->tieOperands(0, MIB->getNumOperands() - 1);
    }
  }

  if (ScratchOffsetRegDelta != 0) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
        .addReg(SOffset)
        .addImm(-ScratchOffsetRegDelta);
  }
}
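
// Helper for the memory path of SGPR spilling: stores SB.TmpVGPR to, or
// reloads it from, the temporary stack slot at Index. Offset is counted in
// elements of SB.EltSize bytes.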
void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
                                             int Offset, bool IsLoad,
                                             bool IsKill) const {
  MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
  assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);

  Register FrameReg =
      FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
          ? getBaseRegister()
          : getFrameRegister(SB.MF);

  Align Alignment = FrameInfo.getObjectAlign(Index);
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
  MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
      PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
      SB.EltSize, Alignment);

  if (IsLoad) {
    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                          : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
    buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
                        FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
  } else {
    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                          : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
    buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
                        FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
    // This only ever adds one VGPR spill
    SB.MFI.addToSpilledVGPRs(1);
  }
}
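
// Spill the SGPR (or SGPR tuple) referenced by the spill pseudo at MI. If VGPR
// lanes were assigned for this frame index, each 32-bit component is written
// into its lane with SI_SPILL_S32_TO_VGPR; otherwise the components are packed
// into SB.TmpVGPR and the VGPR is written out to scratch memory.
// For a two-register tuple spilled to lanes 0-1 this emits, roughly:
//   $vgprN = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgprN
//   $vgprN = SI_SPILL_S32_TO_VGPR $sgpr5, 1, $vgprN
// plus implicit super-register operands to keep liveness correct.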
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
                               RegScavenger *RS, SlotIndexes *Indexes,
                               LiveIntervals *LIS, bool OnlyToVGPR,
                               bool SpillToPhysVGPRLane) const {
  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);

  ArrayRef<SpilledReg> VGPRSpills =
      SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
                          : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
                         SB.SuperReg != SB.MFI.getFrameOffsetReg()));

  if (SpillToVGPR) {

    assert(SB.NumSubRegs == VGPRSpills.size() &&
           "Num of VGPR lanes should be equal to num of SGPRs spilled");

    for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
      SpilledReg Spill = VGPRSpills[i];

      bool IsFirstSubreg = i == 0;
      bool IsLastSubreg = i == SB.NumSubRegs - 1;
      bool UseKill = SB.IsKill && IsLastSubreg;

      // Mark the "old value of vgpr" input undef only if this is the first sgpr
      // spill to this specific vgpr in the first basic block.
      auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
                         SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
                     .addReg(SubReg, getKillRegState(UseKill))
                     .addImm(Spill.Lane)
                     .addReg(Spill.VGPR);
      if (Indexes) {
        if (IsFirstSubreg)
          Indexes->replaceMachineInstrInMaps(*MI, *MIB);
        else
          Indexes->insertMachineInstrInMaps(*MIB);
      }

      if (IsFirstSubreg && SB.NumSubRegs > 1) {
        // We may be spilling a super-register which is only partially defined,
        // and need to ensure later spills think the value is defined.
        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
      }

      if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
        MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);

      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
    }
  } else {
    SB.prepare();

    // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
    unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);

    // Per VGPR helper data
    auto PVD = SB.getPerVGPRData();

    for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
      unsigned TmpVGPRFlags = RegState::Undef;

      // Write sub registers into the VGPR
      for (unsigned i = Offset * PVD.PerVGPR,
                    e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
           i < e; ++i) {
        Register SubReg =
            SB.NumSubRegs == 1
                ? SB.SuperReg
                : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

        MachineInstrBuilder WriteLane =
            BuildMI(*SB.MBB, MI, SB.DL,
                    SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
                .addReg(SubReg, SubKillState)
                .addImm(i % PVD.PerVGPR)
                .addReg(SB.TmpVGPR, TmpVGPRFlags);
        TmpVGPRFlags = 0;

        if (Indexes) {
          if (i == 0)
            Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
          else
            Indexes->insertMachineInstrInMaps(*WriteLane);
        }

        // There could be undef components of a spilled super register.
        // TODO: Can we detect this and skip the spill?
        if (SB.NumSubRegs > 1) {
          // The last implicit use of the SB.SuperReg carries the "Kill" flag.
          unsigned SuperKillState = 0;
          if (i + 1 == SB.NumSubRegs)
            SuperKillState |= getKillRegState(SB.IsKill);
          WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
        }
      }

      // Write out VGPR
      SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
    }

    SB.restore();
  }

  MI->eraseFromParent();
  SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);

  if (LIS)
    LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);

  return true;
}
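
// Reload the SGPR (or SGPR tuple) referenced by the restore pseudo at MI,
// either directly from the assigned VGPR lanes with SI_RESTORE_S32_FROM_VGPR
// or, on the memory path, by reloading SB.TmpVGPR from scratch and unpacking
// its lanes.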
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
                                 RegScavenger *RS, SlotIndexes *Indexes,
                                 LiveIntervals *LIS, bool OnlyToVGPR,
                                 bool SpillToPhysVGPRLane) const {
  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);

  ArrayRef<SpilledReg> VGPRSpills =
      SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
                          : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  if (SpillToVGPR) {
    for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

      SpilledReg Spill = VGPRSpills[i];
      auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
                         SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
                     .addReg(Spill.VGPR)
                     .addImm(Spill.Lane);
      if (SB.NumSubRegs > 1 && i == 0)
        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
      if (Indexes) {
        if (i == e - 1)
          Indexes->replaceMachineInstrInMaps(*MI, *MIB);
        else
          Indexes->insertMachineInstrInMaps(*MIB);
      }
    }
  } else {
    SB.prepare();

    // Per VGPR helper data
    auto PVD = SB.getPerVGPRData();

    for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
      // Load in VGPR data
      SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);

      // Unpack lanes
      for (unsigned i = Offset * PVD.PerVGPR,
                    e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
           i < e; ++i) {
        Register SubReg =
            SB.NumSubRegs == 1
                ? SB.SuperReg
                : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

        bool LastSubReg = (i + 1 == e);
        auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
                           SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
                       .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
                       .addImm(i);
        if (SB.NumSubRegs > 1 && i == 0)
          MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
        if (Indexes) {
          if (i == e - 1)
            Indexes->replaceMachineInstrInMaps(*MI, *MIB);
          else
            Indexes->insertMachineInstrInMaps(*MIB);
        }
      }
    }

    SB.restore();
  }

  MI->eraseFromParent();

  if (LIS)
    LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);

  return true;
}
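
// Spill an SGPR across a control-flow edge without using a stack slot: the
// value is written into lanes of SB.TmpVGPR here and read back at the end of
// RestoreMBB, so the VGPR itself is never written to memory.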
bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
                                        MachineBasicBlock &RestoreMBB,
                                        Register SGPR, RegScavenger *RS) const {
  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
                      RS);
  SB.prepare();
  // Generate the spill of SGPR to SB.TmpVGPR.
  unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
  auto PVD = SB.getPerVGPRData();
  for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
    unsigned TmpVGPRFlags = RegState::Undef;
    // Write sub registers into the VGPR
    for (unsigned i = Offset * PVD.PerVGPR,
                  e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
         i < e; ++i) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

      MachineInstrBuilder WriteLane =
          BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
                  SB.TmpVGPR)
              .addReg(SubReg, SubKillState)
              .addImm(i % PVD.PerVGPR)
              .addReg(SB.TmpVGPR, TmpVGPRFlags);
      TmpVGPRFlags = 0;

      // There could be undef components of a spilled super register.
      // TODO: Can we detect this and skip the spill?
      if (SB.NumSubRegs > 1) {
        // The last implicit use of the SB.SuperReg carries the "Kill" flag.
        unsigned SuperKillState = 0;
        if (i + 1 == SB.NumSubRegs)
          SuperKillState |= getKillRegState(SB.IsKill);
        WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
      }
    }
    // Don't need to write VGPR out.
  }

  // Restore clobbered registers in the specified restore block.
  MI = RestoreMBB.end();
  SB.setMI(&RestoreMBB, MI);
  // Generate the restore of SGPR from SB.TmpVGPR.
  for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
    // Don't need to load VGPR in.
    // Unpack lanes
    for (unsigned i = Offset * PVD.PerVGPR,
                  e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
         i < e; ++i) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
      bool LastSubReg = (i + 1 == e);
      auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
                         SubReg)
                     .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
                     .addImm(i);
      if (SB.NumSubRegs > 1 && i == 0)
        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
    }
  }
  SB.restore();

  SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
  return false;
}
/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
/// a VGPR and the stack slot can be safely eliminated when all other users are
/// handled.
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
    MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
    SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
  switch (MI->getOpcode()) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S384_SAVE:
  case AMDGPU::SI_SPILL_S352_SAVE:
  case AMDGPU::SI_SPILL_S320_SAVE:
  case AMDGPU::SI_SPILL_S288_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE:
    return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S384_RESTORE:
  case AMDGPU::SI_SPILL_S352_RESTORE:
  case AMDGPU::SI_SPILL_S320_RESTORE:
  case AMDGPU::SI_SPILL_S288_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
    return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
  default:
    llvm_unreachable("not an SGPR spill instruction");
  }
}
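
// Rewrite the frame index operand of MI into a real base register and/or
// immediate offset. Spill and restore pseudos are expanded here as well; any
// other frame-index use is folded into the instruction when the resulting
// offset is legal, or materialized into a scavenged register otherwise.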
bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                         int SPAdj, unsigned FIOperandNum,
                                         RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");

  assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
         "unreserved scratch RSRC register");

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
                          ? getBaseRegister()
                          : getFrameRegister(*MF);

  switch (MI->getOpcode()) {
  // SGPR register spill
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S384_SAVE:
  case AMDGPU::SI_SPILL_S352_SAVE:
  case AMDGPU::SI_SPILL_S320_SAVE:
  case AMDGPU::SI_SPILL_S288_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE: {
    return spillSGPR(MI, Index, RS);
  }

  // SGPR register restore
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S384_RESTORE:
  case AMDGPU::SI_SPILL_S352_RESTORE:
  case AMDGPU::SI_SPILL_S320_RESTORE:
  case AMDGPU::SI_SPILL_S288_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE: {
    return restoreSGPR(MI, Index, RS);
  }

  // VGPR register spill
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V384_SAVE:
  case AMDGPU::SI_SPILL_V352_SAVE:
  case AMDGPU::SI_SPILL_V320_SAVE:
  case AMDGPU::SI_SPILL_V288_SAVE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V224_SAVE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A384_SAVE:
  case AMDGPU::SI_SPILL_A352_SAVE:
  case AMDGPU::SI_SPILL_A320_SAVE:
  case AMDGPU::SI_SPILL_A288_SAVE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A224_SAVE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_AV1024_SAVE:
  case AMDGPU::SI_SPILL_AV512_SAVE:
  case AMDGPU::SI_SPILL_AV384_SAVE:
  case AMDGPU::SI_SPILL_AV352_SAVE:
  case AMDGPU::SI_SPILL_AV320_SAVE:
  case AMDGPU::SI_SPILL_AV288_SAVE:
  case AMDGPU::SI_SPILL_AV256_SAVE:
  case AMDGPU::SI_SPILL_AV224_SAVE:
  case AMDGPU::SI_SPILL_AV192_SAVE:
  case AMDGPU::SI_SPILL_AV160_SAVE:
  case AMDGPU::SI_SPILL_AV128_SAVE:
  case AMDGPU::SI_SPILL_AV96_SAVE:
  case AMDGPU::SI_SPILL_AV64_SAVE:
  case AMDGPU::SI_SPILL_AV32_SAVE:
  case AMDGPU::SI_SPILL_WWM_V32_SAVE:
  case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
    const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata);
    assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
           MFI->getStackPtrOffsetReg());

    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                          : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
    auto *MBB = MI->getParent();
    bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
    if (IsWWMRegSpill) {
      TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
                                 RS->isRegUsed(AMDGPU::SCC));
    }
    buildSpillLoadStore(
        *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
        TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
        *MI->memoperands_begin(), RS);
    MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
    if (IsWWMRegSpill)
      TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());

    MI->eraseFromParent();
    return true;
  }
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_V224_RESTORE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_V288_RESTORE:
  case AMDGPU::SI_SPILL_V320_RESTORE:
  case AMDGPU::SI_SPILL_V352_RESTORE:
  case AMDGPU::SI_SPILL_V384_RESTORE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
  case AMDGPU::SI_SPILL_A224_RESTORE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
  case AMDGPU::SI_SPILL_A288_RESTORE:
  case AMDGPU::SI_SPILL_A320_RESTORE:
  case AMDGPU::SI_SPILL_A352_RESTORE:
  case AMDGPU::SI_SPILL_A384_RESTORE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
  case AMDGPU::SI_SPILL_AV32_RESTORE:
  case AMDGPU::SI_SPILL_AV64_RESTORE:
  case AMDGPU::SI_SPILL_AV96_RESTORE:
  case AMDGPU::SI_SPILL_AV128_RESTORE:
  case AMDGPU::SI_SPILL_AV160_RESTORE:
  case AMDGPU::SI_SPILL_AV192_RESTORE:
  case AMDGPU::SI_SPILL_AV224_RESTORE:
  case AMDGPU::SI_SPILL_AV256_RESTORE:
  case AMDGPU::SI_SPILL_AV288_RESTORE:
  case AMDGPU::SI_SPILL_AV320_RESTORE:
  case AMDGPU::SI_SPILL_AV352_RESTORE:
  case AMDGPU::SI_SPILL_AV384_RESTORE:
  case AMDGPU::SI_SPILL_AV512_RESTORE:
  case AMDGPU::SI_SPILL_AV1024_RESTORE:
  case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
    const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata);
    assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
           MFI->getStackPtrOffsetReg());

    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                          : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
    auto *MBB = MI->getParent();
    bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
    if (IsWWMRegSpill) {
      TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
                                 RS->isRegUsed(AMDGPU::SCC));
    }

    buildSpillLoadStore(
        *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
        TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
        *MI->memoperands_begin(), RS);

    if (IsWWMRegSpill)
      TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());

    MI->eraseFromParent();
    return true;
  }
  default: {
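    // Roadmap for the default case: with flat scratch the offset can often be
    // folded straight into the saddr/offset operands; MUBUF accesses can take
    // the frame register in soffset; anything else needs the address
    // materialized in a scavenged register.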
    // Other access to frame index
    const DebugLoc &DL = MI->getDebugLoc();

    int64_t Offset = FrameInfo.getObjectOffset(Index);
    if (ST.enableFlatScratch()) {
      if (TII->isFLATScratch(*MI)) {
        assert((int16_t)FIOperandNum ==
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::saddr));

        // The offset is always swizzled, just replace it
        if (FrameReg)
          FIOp.ChangeToRegister(FrameReg, false);

        MachineOperand *OffsetOp =
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
        int64_t NewOffset = Offset + OffsetOp->getImm();
        if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                   SIInstrFlags::FlatScratch)) {
          OffsetOp->setImm(NewOffset);
          if (FrameReg)
            return false;
          Offset = 0;
        }

        if (!Offset) {
          unsigned Opc = MI->getOpcode();
          int NewOpc = -1;
          if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
            NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
          } else if (ST.hasFlatScratchSTMode()) {
            // On GFX10 we have ST mode to use no registers for an address.
            // Otherwise we need to materialize 0 into an SGPR.
            NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
          }

          if (NewOpc != -1) {
            // removeOperand doesn't fixup tied operand indexes as it goes, so
            // it asserts. Untie vdst_in for now and retie them afterwards.
            int VDstIn = AMDGPU::getNamedOperandIdx(Opc,
                                                    AMDGPU::OpName::vdst_in);
            bool TiedVDst = VDstIn != -1 &&
                            MI->getOperand(VDstIn).isReg() &&
                            MI->getOperand(VDstIn).isTied();
            if (TiedVDst)
              MI->untieRegOperand(VDstIn);

            MI->removeOperand(
                AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));

            if (TiedVDst) {
              int NewVDst =
                  AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
              int NewVDstIn =
                  AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
              assert (NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
              MI->tieOperands(NewVDst, NewVDstIn);
            }
            MI->setDesc(TII->get(NewOpc));
            return false;
          }
        }
      }

      if (!FrameReg) {
        FIOp.ChangeToImmediate(Offset);
        if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
          return false;
      }

      // We need to use register here. Check if we can use an SGPR or need
      // a VGPR.
      FIOp.ChangeToRegister(AMDGPU::M0, false);
      bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);

      if (!Offset && FrameReg && UseSGPR) {
        FIOp.setReg(FrameReg);
        return false;
      }

      const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
                                              : &AMDGPU::VGPR_32RegClass;

      Register TmpReg =
          RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
      FIOp.setReg(TmpReg);

      if ((!FrameReg || !Offset) && TmpReg) {
        unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
        auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
        if (FrameReg)
          MIB.addReg(FrameReg);
        else
          MIB.addImm(Offset);

        return false;
      }

      bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
                         !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);

      Register TmpSReg =
          UseSGPR ? TmpReg
                  : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
                                                  MI, false, 0, !UseSGPR);

      // TODO: for flat scratch another attempt can be made with a VGPR index
      //       if no SGPRs can be scavenged.
      if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
        report_fatal_error("Cannot scavenge register in FI elimination!");

      if (!TmpSReg) {
        // Use frame register and restore it after.
        TmpSReg = FrameReg;
        FIOp.setReg(FrameReg);
        FIOp.setIsKill(false);
      }

      if (NeedSaveSCC) {
        assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
            .addReg(FrameReg)
            .addImm(Offset);
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
            .addReg(TmpSReg)
            .addImm(0);
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
            .addReg(TmpSReg);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
            .addReg(FrameReg)
            .addImm(Offset);
      }

      if (!UseSGPR)
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
            .addReg(TmpSReg, RegState::Kill);

      if (TmpSReg == FrameReg) {
        // Undo frame register modification.
        if (NeedSaveSCC &&
            !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
          MachineBasicBlock::iterator I =
              BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
                      TmpSReg)
                  .addReg(FrameReg)
                  .addImm(-Offset);
          I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
                  .addReg(TmpSReg)
                  .addImm(0);
          BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
                  TmpSReg)
              .addReg(TmpSReg);
        } else {
          BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
                  FrameReg)
              .addReg(FrameReg)
              .addImm(-Offset);
        }
      }

      return false;
    }
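
    // Non flat-scratch path. Outside of entry functions the frame register
    // still holds an unswizzled offset, so for anything that is not a MUBUF
    // access it has to be scaled down by the wave size (see the comment below)
    // before it can be used as a per-lane stack address.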
    bool IsMUBUF = TII->isMUBUF(*MI);

    if (!IsMUBUF && !MFI->isBottomOfStack()) {
      // Convert to a swizzled stack address by scaling by the wave size.
      // In an entry function/kernel the offset is already swizzled.
      bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
      bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
                     !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
      const TargetRegisterClass *RC = IsSALU && !LiveSCC
                                          ? &AMDGPU::SReg_32RegClass
                                          : &AMDGPU::VGPR_32RegClass;
      bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                    MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
      Register ResultReg =
          IsCopy ? MI->getOperand(0).getReg()
                 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      if (Offset == 0) {
        unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
                                             : AMDGPU::V_LSHRREV_B32_e64;
        auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
        if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
          // For V_LSHRREV, the operands are reversed (the shift count goes
          // first).
          Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
        else
          Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
        if (IsSALU && !LiveSCC)
          Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
        if (IsSALU && LiveSCC) {
          Register NewDest = RS->scavengeRegisterBackwards(
              AMDGPU::SReg_32RegClass, Shift, false, 0);
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                  NewDest)
              .addReg(ResultReg);
          ResultReg = NewDest;
        }
      } else {
        MachineInstrBuilder MIB;
        if (!IsSALU) {
          if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
              nullptr) {
            // Reuse ResultReg in intermediate step.
            Register ScaledReg = ResultReg;

            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
                    ScaledReg)
                .addImm(ST.getWavefrontSizeLog2())
                .addReg(FrameReg);

            const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;

            // TODO: Fold if use instruction is another add of a constant.
            if (IsVOP2 ||
                AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
              // FIXME: This can fail
              MIB.addImm(Offset);
              MIB.addReg(ScaledReg, RegState::Kill);
              if (!IsVOP2)
                MIB.addImm(0); // clamp bit
            } else {
              assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
                     "Need to reuse carry out register");

              // Use scavenged unused carry out as offset register.
              Register ConstOffsetReg;
              if (!isWave32)
                ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
              else
                ConstOffsetReg = MIB.getReg(1);

              BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
                      ConstOffsetReg)
                  .addImm(Offset);
              MIB.addReg(ConstOffsetReg, RegState::Kill);
              MIB.addReg(ScaledReg, RegState::Kill);
              MIB.addImm(0); // clamp bit
            }
          }
        }
        if (!MIB || IsSALU) {
          // We have to produce a carry out, and there isn't a free SGPR pair
          // for it. We can keep the whole computation on the SALU to avoid
          // clobbering an additional register at the cost of an extra mov.

          // We may have 1 free scratch SGPR even though a carry out is
          // unavailable. Only one additional mov is needed.
          Register TmpScaledReg = RS->scavengeRegisterBackwards(
              AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
          Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;

          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
              .addReg(FrameReg)
              .addImm(ST.getWavefrontSizeLog2());
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
              .addReg(ScaledReg, RegState::Kill)
              .addImm(Offset);
          if (!IsSALU)
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
                .addReg(ScaledReg, RegState::Kill);
          else
            ResultReg = ScaledReg;

          // If there were truly no free SGPRs, we need to undo everything.
          if (!TmpScaledReg.isValid()) {
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
                .addReg(ScaledReg, RegState::Kill)
                .addImm(-Offset);
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
                .addReg(FrameReg)
                .addImm(ST.getWavefrontSizeLog2());
          }
        }
      }

      // Don't introduce an extra copy if we're just materializing in a mov.
      if (IsCopy) {
        MI->eraseFromParent();
        return true;
      }
      FIOp.ChangeToRegister(ResultReg, false, false, true);
      return false;
    }
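
    // MUBUF case: switch to the OFFSET form (no VGPR address), put the frame
    // register into soffset, and fold the object offset into the immediate
    // offset field when it is legal to do so.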
    if (IsMUBUF) {
      // Disable offen so we don't need a 0 vgpr base.
      assert(static_cast<int>(FIOperandNum) ==
             AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                        AMDGPU::OpName::vaddr));

      auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
      assert((SOffset.isImm() && SOffset.getImm() == 0));

      if (FrameReg != AMDGPU::NoRegister)
        SOffset.ChangeToRegister(FrameReg, false);

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      int64_t OldImm
          = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
      int64_t NewOffset = OldImm + Offset;

      if (TII->isLegalMUBUFImmOffset(NewOffset) &&
          buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
        MI->eraseFromParent();
        return true;
      }
    }

    // If the offset is simply too big, don't convert to a scratch wave offset
    // relative index.

    FIOp.ChangeToImmediate(Offset);
    if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
      Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                      MI, false, 0);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
      FIOp.ChangeToRegister(TmpReg, false, false, true);
    }
  }
  }

  return false;
}
StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
  return AMDGPUInstPrinter::getRegisterName(Reg);
}

unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) {
  return getRegBitWidth(RC.getID());
}
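
// The *ClassForBitWidth helpers below come in two flavours: the "Any" variants
// place no alignment constraint on the register tuple, while the "Aligned"
// variants return the Align2 classes required when the subtarget needs aligned
// VGPR tuples (e.g. gfx90a, see getVGPR64Class); callers choose between them
// via ST.needsAlignedVGPRs().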
static const TargetRegisterClass *
getAnyVGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth == 64)
    return &AMDGPU::VReg_64RegClass;
  if (BitWidth == 96)
    return &AMDGPU::VReg_96RegClass;
  if (BitWidth == 128)
    return &AMDGPU::VReg_128RegClass;
  if (BitWidth == 160)
    return &AMDGPU::VReg_160RegClass;
  if (BitWidth == 192)
    return &AMDGPU::VReg_192RegClass;
  if (BitWidth == 224)
    return &AMDGPU::VReg_224RegClass;
  if (BitWidth == 256)
    return &AMDGPU::VReg_256RegClass;
  if (BitWidth == 288)
    return &AMDGPU::VReg_288RegClass;
  if (BitWidth == 320)
    return &AMDGPU::VReg_320RegClass;
  if (BitWidth == 352)
    return &AMDGPU::VReg_352RegClass;
  if (BitWidth == 384)
    return &AMDGPU::VReg_384RegClass;
  if (BitWidth == 512)
    return &AMDGPU::VReg_512RegClass;
  if (BitWidth == 1024)
    return &AMDGPU::VReg_1024RegClass;

  return nullptr;
}

static const TargetRegisterClass *
getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth == 64)
    return &AMDGPU::VReg_64_Align2RegClass;
  if (BitWidth == 96)
    return &AMDGPU::VReg_96_Align2RegClass;
  if (BitWidth == 128)
    return &AMDGPU::VReg_128_Align2RegClass;
  if (BitWidth == 160)
    return &AMDGPU::VReg_160_Align2RegClass;
  if (BitWidth == 192)
    return &AMDGPU::VReg_192_Align2RegClass;
  if (BitWidth == 224)
    return &AMDGPU::VReg_224_Align2RegClass;
  if (BitWidth == 256)
    return &AMDGPU::VReg_256_Align2RegClass;
  if (BitWidth == 288)
    return &AMDGPU::VReg_288_Align2RegClass;
  if (BitWidth == 320)
    return &AMDGPU::VReg_320_Align2RegClass;
  if (BitWidth == 352)
    return &AMDGPU::VReg_352_Align2RegClass;
  if (BitWidth == 384)
    return &AMDGPU::VReg_384_Align2RegClass;
  if (BitWidth == 512)
    return &AMDGPU::VReg_512_Align2RegClass;
  if (BitWidth == 1024)
    return &AMDGPU::VReg_1024_Align2RegClass;

  return nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth == 1)
    return &AMDGPU::VReg_1RegClass;
  if (BitWidth == 16)
    return &AMDGPU::VGPR_16RegClass;
  if (BitWidth == 32)
    return &AMDGPU::VGPR_32RegClass;
  return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
                                : getAnyVGPRClassForBitWidth(BitWidth);
}

static const TargetRegisterClass *
getAnyAGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth == 64)
    return &AMDGPU::AReg_64RegClass;
  if (BitWidth == 96)
    return &AMDGPU::AReg_96RegClass;
  if (BitWidth == 128)
    return &AMDGPU::AReg_128RegClass;
  if (BitWidth == 160)
    return &AMDGPU::AReg_160RegClass;
  if (BitWidth == 192)
    return &AMDGPU::AReg_192RegClass;
  if (BitWidth == 224)
    return &AMDGPU::AReg_224RegClass;
  if (BitWidth == 256)
    return &AMDGPU::AReg_256RegClass;
  if (BitWidth == 288)
    return &AMDGPU::AReg_288RegClass;
  if (BitWidth == 320)
    return &AMDGPU::AReg_320RegClass;
  if (BitWidth == 352)
    return &AMDGPU::AReg_352RegClass;
  if (BitWidth == 384)
    return &AMDGPU::AReg_384RegClass;
  if (BitWidth == 512)
    return &AMDGPU::AReg_512RegClass;
  if (BitWidth == 1024)
    return &AMDGPU::AReg_1024RegClass;

  return nullptr;
}

static const TargetRegisterClass *
getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth == 64)
    return &AMDGPU::AReg_64_Align2RegClass;
  if (BitWidth == 96)
    return &AMDGPU::AReg_96_Align2RegClass;
  if (BitWidth == 128)
    return &AMDGPU::AReg_128_Align2RegClass;
  if (BitWidth == 160)
    return &AMDGPU::AReg_160_Align2RegClass;
  if (BitWidth == 192)
    return &AMDGPU::AReg_192_Align2RegClass;
  if (BitWidth == 224)
    return &AMDGPU::AReg_224_Align2RegClass;
  if (BitWidth == 256)
    return &AMDGPU::AReg_256_Align2RegClass;
  if (BitWidth == 288)
    return &AMDGPU::AReg_288_Align2RegClass;
  if (BitWidth == 320)
    return &AMDGPU::AReg_320_Align2RegClass;
  if (BitWidth == 352)
    return &AMDGPU::AReg_352_Align2RegClass;
  if (BitWidth == 384)
    return &AMDGPU::AReg_384_Align2RegClass;
  if (BitWidth == 512)
    return &AMDGPU::AReg_512_Align2RegClass;
  if (BitWidth == 1024)
    return &AMDGPU::AReg_1024_Align2RegClass;

  return nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth == 16)
    return &AMDGPU::AGPR_LO16RegClass;
  if (BitWidth == 32)
    return &AMDGPU::AGPR_32RegClass;
  return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
                                : getAnyAGPRClassForBitWidth(BitWidth);
}

static const TargetRegisterClass *
getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
  if (BitWidth == 64)
    return &AMDGPU::AV_64RegClass;
  if (BitWidth == 96)
    return &AMDGPU::AV_96RegClass;
  if (BitWidth == 128)
    return &AMDGPU::AV_128RegClass;
  if (BitWidth == 160)
    return &AMDGPU::AV_160RegClass;
  if (BitWidth == 192)
    return &AMDGPU::AV_192RegClass;
  if (BitWidth == 224)
    return &AMDGPU::AV_224RegClass;
  if (BitWidth == 256)
    return &AMDGPU::AV_256RegClass;
  if (BitWidth == 288)
    return &AMDGPU::AV_288RegClass;
  if (BitWidth == 320)
    return &AMDGPU::AV_320RegClass;
  if (BitWidth == 352)
    return &AMDGPU::AV_352RegClass;
  if (BitWidth == 384)
    return &AMDGPU::AV_384RegClass;
  if (BitWidth == 512)
    return &AMDGPU::AV_512RegClass;
  if (BitWidth == 1024)
    return &AMDGPU::AV_1024RegClass;

  return nullptr;
}

static const TargetRegisterClass *
getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
  if (BitWidth == 64)
    return &AMDGPU::AV_64_Align2RegClass;
  if (BitWidth == 96)
    return &AMDGPU::AV_96_Align2RegClass;
  if (BitWidth == 128)
    return &AMDGPU::AV_128_Align2RegClass;
  if (BitWidth == 160)
    return &AMDGPU::AV_160_Align2RegClass;
  if (BitWidth == 192)
    return &AMDGPU::AV_192_Align2RegClass;
  if (BitWidth == 224)
    return &AMDGPU::AV_224_Align2RegClass;
  if (BitWidth == 256)
    return &AMDGPU::AV_256_Align2RegClass;
  if (BitWidth == 288)
    return &AMDGPU::AV_288_Align2RegClass;
  if (BitWidth == 320)
    return &AMDGPU::AV_320_Align2RegClass;
  if (BitWidth == 352)
    return &AMDGPU::AV_352_Align2RegClass;
  if (BitWidth == 384)
    return &AMDGPU::AV_384_Align2RegClass;
  if (BitWidth == 512)
    return &AMDGPU::AV_512_Align2RegClass;
  if (BitWidth == 1024)
    return &AMDGPU::AV_1024_Align2RegClass;

  return nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth == 32)
    return &AMDGPU::AV_32RegClass;
  return ST.needsAlignedVGPRs()
             ? getAlignedVectorSuperClassForBitWidth(BitWidth)
             : getAnyVectorSuperClassForBitWidth(BitWidth);
}

const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth == 16)
    return &AMDGPU::SGPR_LO16RegClass;
  if (BitWidth == 32)
    return &AMDGPU::SReg_32RegClass;
  if (BitWidth == 64)
    return &AMDGPU::SReg_64RegClass;
  if (BitWidth == 96)
    return &AMDGPU::SGPR_96RegClass;
  if (BitWidth == 128)
    return &AMDGPU::SGPR_128RegClass;
  if (BitWidth == 160)
    return &AMDGPU::SGPR_160RegClass;
  if (BitWidth == 192)
    return &AMDGPU::SGPR_192RegClass;
  if (BitWidth == 224)
    return &AMDGPU::SGPR_224RegClass;
  if (BitWidth == 256)
    return &AMDGPU::SGPR_256RegClass;
  if (BitWidth == 288)
    return &AMDGPU::SGPR_288RegClass;
  if (BitWidth == 320)
    return &AMDGPU::SGPR_320RegClass;
  if (BitWidth == 352)
    return &AMDGPU::SGPR_352RegClass;
  if (BitWidth == 384)
    return &AMDGPU::SGPR_384RegClass;
  if (BitWidth == 512)
    return &AMDGPU::SGPR_512RegClass;
  if (BitWidth == 1024)
    return &AMDGPU::SGPR_1024RegClass;

  return nullptr;
}
bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
                               Register Reg) const {
  const TargetRegisterClass *RC;
  if (Reg.isVirtual())
    RC = MRI.getRegClass(Reg);
  else
    RC = getPhysRegBaseClass(Reg);
  return RC ? isSGPRClass(RC) : false;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
  assert(VRC && "Invalid register class size");
  return VRC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
  assert(ARC && "Invalid register class size");
  return ARC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
  unsigned Size = getRegSizeInBits(*VRC);
  if (Size == 32)
    return &AMDGPU::SGPR_32RegClass;
  const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
  assert(SRC && "Invalid register class size");
  return SRC;
}

const TargetRegisterClass *
SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
                                         const TargetRegisterClass *SubRC,
                                         unsigned SubIdx) const {
  // Ensure this subregister index is aligned in the super register.
  const TargetRegisterClass *MatchRC =
      getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
  return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
}

bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
  if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
      OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
    return !ST.hasMFMAInlineLiteralBug();

  return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
         OpType <= AMDGPU::OPERAND_SRC_LAST;
}

bool SIRegisterInfo::shouldRewriteCopySrc(
  const TargetRegisterClass *DefRC,
  unsigned DefSubReg,
  const TargetRegisterClass *SrcRC,
  unsigned SrcSubReg) const {
  // We want to prefer the smallest register class possible, so we don't want to
  // stop and rewrite on anything that looks like a subregister
  // extract. Operations mostly don't care about the super register class, so we
  // only want to stop on the most basic of copies between the same register
  // class.
  //
  // e.g. if we have something like
  // %0 = ...
  // %1 = ...
  // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
  // %3 = COPY %2, sub0
  //
  // We want to look through the COPY to find:
  //  => %3 = COPY %0

  // Plain copy.
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
}

bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
  // TODO: 64-bit operands have extending behavior from 32-bit literal.
  return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
         OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
}

/// Returns a lowest register that is not used at any point in the function.
///        If all registers are used, then this function will return
///        AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return
///        highest unused register.
MCRegister SIRegisterInfo::findUnusedRegister(
    const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
    const MachineFunction &MF, bool ReserveHighestRegister) const {
  if (ReserveHighestRegister) {
    for (MCRegister Reg : reverse(*RC))
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  } else {
    for (MCRegister Reg : *RC)
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  }
  return MCRegister();
}

bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
                                  const RegisterBankInfo &RBI,
                                  Register Reg) const {
  auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
  if (!RB)
    return false;

  return !RBI.isDivergentRegBank(RB);
}
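
// Return the sub-register indices that split RC into contiguous pieces of
// EltSize bytes; e.g. an EltSize of 8 on a 128-bit class yields the two
// 64-bit sub-register indices.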
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                   unsigned EltSize) const {
  const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
  assert(RegBitWidth >= 32 && RegBitWidth <= 1024);

  const unsigned RegDWORDs = RegBitWidth / 32;
  const unsigned EltDWORDs = EltSize / 4;
  assert(RegSplitParts.size() + 1 >= EltDWORDs);

  const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
  const unsigned NumParts = RegDWORDs / EltDWORDs;

  return ArrayRef(Parts.data(), NumParts);
}

const TargetRegisterClass *
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
                                  Register Reg) const {
  return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
}

const TargetRegisterClass *
SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
                                         const MachineOperand &MO) const {
  const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
  return getSubRegisterClass(SrcRC, MO.getSubReg());
}

bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && isVGPRClass(RC);
}

bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);

  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && isAGPRClass(RC);
}

bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
                                    const TargetRegisterClass *SrcRC,
                                    unsigned SubReg,
                                    const TargetRegisterClass *DstRC,
                                    unsigned DstSubReg,
                                    const TargetRegisterClass *NewRC,
                                    LiveIntervals &LIS) const {
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
  unsigned DstSize = getRegSizeInBits(*DstRC);
  unsigned NewSize = getRegSizeInBits(*NewRC);

  // Do not increase size of registers beyond dword, we would need to allocate
  // adjacent registers and constraint regalloc more than needed.

  // Always allow dword coalescing.
  if (SrcSize <= 32 || DstSize <= 32)
    return true;

  return NewSize <= DstSize || NewSize <= SrcSize;
}

unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                             MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
                                                       MF.getFunction());
  switch (RC->getID()) {
  default:
    return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
  case AMDGPU::VGPR_32RegClassID:
    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
  case AMDGPU::SGPR_32RegClassID:
  case AMDGPU::SGPR_LO16RegClassID:
    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
  }
}

unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                unsigned Idx) const {
  if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
      Idx == AMDGPU::RegisterPressureSets::AGPR_32)
    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  llvm_unreachable("Unexpected register pressure set!");
}

const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
  static const int Empty[] = { -1 };

  if (RegPressureIgnoredUnits[RegUnit])
    return Empty;

  return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}

MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
  // Not a callee saved register.
  return AMDGPU::SGPR30_SGPR31;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
                                         const RegisterBank &RB) const {
  switch (RB.getID()) {
  case AMDGPU::VGPRRegBankID:
    return getVGPRClassForBitWidth(
        std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
  case AMDGPU::VCCRegBankID:
    assert(Size == 1);
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case AMDGPU::SGPRRegBankID:
    return getSGPRClassForBitWidth(std::max(32u, Size));
  case AMDGPU::AGPRRegBankID:
    return getAGPRClassForBitWidth(std::max(32u, Size));
  default:
    llvm_unreachable("unknown register bank");
  }
}

const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                         const MachineRegisterInfo &MRI) const {
  const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
  if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank *>())
    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);

  if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>())
    return getAllocatableClass(RC);

  return nullptr;
}

MCRegister SIRegisterInfo::getVCC() const {
  return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}

MCRegister SIRegisterInfo::getExec() const {
  return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
}

const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
  // VGPR tuples have an alignment requirement on gfx90a variants.
  return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
                                : &AMDGPU::VReg_64RegClass;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClass(unsigned RCID) const {
  switch ((int)RCID) {
  case AMDGPU::SReg_1RegClassID:
    return getBoolRC();
  case AMDGPU::SReg_1_XEXECRegClassID:
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case -1:
    return nullptr;
  default:
    return AMDGPUGenRegisterInfo::getRegClass(RCID);
  }
}

// Find reaching register definition
MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
                                              MachineInstr &Use,
                                              MachineRegisterInfo &MRI,
                                              LiveIntervals *LIS) const {
  auto &MDT = LIS->getDomTree();
  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
  SlotIndex DefIdx;

  if (Reg.isVirtual()) {
    if (!LIS->hasInterval(Reg))
      return nullptr;
    LiveInterval &LI = LIS->getInterval(Reg);
    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
                                  : MRI.getMaxLaneMaskForVReg(Reg);
    VNInfo *V = nullptr;
    if (LI.hasSubRanges()) {
      for (auto &S : LI.subranges()) {
        if ((S.LaneMask & SubLanes) == SubLanes) {
          V = S.getVNInfoAt(UseIdx);
          break;
        }
      }
    } else {
      V = LI.getVNInfoAt(UseIdx);
    }
    if (!V)
      return nullptr;
    DefIdx = V->def;
  } else {
    // Find last def.
    for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
      LiveRange &LR = LIS->getRegUnit(Unit);
      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
        if (!DefIdx.isValid() ||
            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
                          LIS->getInstructionFromIndex(V->def)))
          DefIdx = V->def;
      } else {
        return nullptr;
      }
    }
  }

  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);

  if (!Def || !MDT.dominates(Def, &Use))
    return nullptr;

  assert(Def->modifiesRegister(Reg, this));

  return Def;
}

MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
  assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);

  for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
                                         AMDGPU::SReg_32RegClass,
                                         AMDGPU::AGPR_32RegClass } ) {
    if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
      return Super;
  }
  if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
                                            &AMDGPU::VGPR_32RegClass)) {
    return Super;
  }

  return AMDGPU::NoRegister;
}

bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
  if (!ST.needsAlignedVGPRs())
    return true;

  if (isVGPRClass(&RC))
    return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
  if (isAGPRClass(&RC))
    return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
  if (isVectorSuperClass(&RC))
    return RC.hasSuperClassEq(
        getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));

  return true;
}

const TargetRegisterClass *
SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
  if (!RC || !ST.needsAlignedVGPRs())
    return RC;

  unsigned Size = getRegSizeInBits(*RC);
  if (Size <= 32)
    return RC;

  if (isVGPRClass(RC))
    return getAlignedVGPRClassForBitWidth(Size);
  if (isAGPRClass(RC))
    return getAlignedAGPRClassForBitWidth(Size);
  if (isVectorSuperClass(RC))
    return getAlignedVectorSuperClassForBitWidth(Size);

  return RC;
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
  return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
  return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
  return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}

unsigned
SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
                                          unsigned SubReg) const {
  switch (RC->TSFlags & SIRCFlags::RegKindMask) {
  case SIRCFlags::HasSGPR:
    return std::min(128u, getSubRegIdxSize(SubReg));
  case SIRCFlags::HasAGPR:
  case SIRCFlags::HasVGPR:
  case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR:
    return std::min(32u, getSubRegIdxSize(SubReg));