//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "SIRegisterInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/RegisterScavenging.h"

#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"
static cl::opt<bool> EnableSpillSGPRToVGPR(
    "amdgpu-spill-sgpr-to-vgpr",
    cl::desc("Enable spilling SGPRs to VGPRs"),
    cl::ReallyHidden,
    cl::init(true));
std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;

// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
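
// Worked example (illustrative, not part of the generated tables): a 2-DWORD
// (64-bit) slice starting at channel 2 looks up
// SubRegFromChannelTableWidthMap[2] == 2, selects row 1 of
// SubRegFromChannelTable, and reads entry [1][2], which holds the 64-bit
// sub-register index covering lanes 2-3 (sub2_sub3 with the usual generated
// names). getSubRegFromChannel() below performs exactly this two-step lookup.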
// A temporary struct to spill SGPRs.
// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
// just v_writelane and v_readlane.
//
// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
// is saved to scratch (or the other way around for loads).
// For this, a VGPR is required where the needed lanes can be clobbered. The
// RegScavenger can provide a VGPR where currently active lanes can be
// clobbered, but we still need to save inactive lanes.
// The high-level steps are:
// - Try to scavenge SGPR(s) to save exec
// - Try to scavenge VGPR
// - Save needed, all or inactive lanes of a TmpVGPR
// - Spill/Restore SGPRs using TmpVGPR
//
// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
// cannot scavenge temporary SGPRs to save exec, we use the following code:
// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
// s_not exec, exec
// buffer_store_dword TmpVGPR ; save inactive lanes
struct SGPRSpillBuilder {
  MachineBasicBlock::iterator MI;
  ArrayRef<int16_t> SplitParts;

  /* When spilling to stack */
  // The SGPRs are written into this VGPR, which is then written to scratch
  // (or vice versa for loads).
  Register TmpVGPR = AMDGPU::NoRegister;
  // Temporary spill slot to save TmpVGPR to.
  int TmpVGPRIndex = 0;
  // If TmpVGPR is live before the spill or if it is scavenged.
  bool TmpVGPRLive = false;
  // Scavenged SGPR to save EXEC.
  Register SavedExecReg = AMDGPU::NoRegister;
  // Stack index to write the SGPRs to.
  int Index;

  MachineBasicBlock &MBB;
  SIMachineFunctionInfo &MFI;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, int Index,
                   RegScavenger *RS)
      : SuperReg(MI->getOperand(0).getReg()), MI(MI),
        IsKill(MI->getOperand(0).isKill()), DL(MI->getDebugLoc()), Index(Index),
        RS(RS), MBB(*MI->getParent()), MF(*MBB.getParent()),
        MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        IsWave32(IsWave32) {
    const TargetRegisterClass *RC = TRI.getPhysRegClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    if (IsWave32) {
      ExecReg = AMDGPU::EXEC_LO;
      MovOpc = AMDGPU::S_MOV_B32;
      NotOpc = AMDGPU::S_NOT_B32;
    } else {
      ExecReg = AMDGPU::EXEC;
      MovOpc = AMDGPU::S_MOV_B64;
      NotOpc = AMDGPU::S_NOT_B64;
    }

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
    assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
           SuperReg != AMDGPU::EXEC && "exec should never spill");
  }
  PerVGPRData getPerVGPRData() {
    PerVGPRData Data;
    Data.PerVGPR = IsWave32 ? 32 : 64;
    Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
    Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
    return Data;
  }
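
  // Worked example for getPerVGPRData() above (illustrative values): spilling
  // a 128-bit SGPR tuple (NumSubRegs == 4) in wave32 yields PerVGPR == 32,
  // NumVGPRs == 1 and VGPRLanes == 0xF, i.e. one temporary VGPR whose lanes
  // 0-3 receive the four 32-bit parts via v_writelane.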
  // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
  // free.
  // Writes these instructions if an SGPR can be scavenged:
  // s_mov_b64 s[6:7], exec   ; Save exec
  // s_mov_b64 exec, 3        ; Wanted lanemask
  // buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_store_dword v0    ; Only if no free VGPR was found
  // s_not_b64 exec, exec
  // buffer_store_dword v0    ; Save inactive lanes
  //                          ; exec stays inverted, it is flipped back in
  //                          ; restore.

    // Scavenged temporary VGPR to use. It must be scavenged once for any number
    // of spilled subregs.
    // FIXME: The liveness analysis is limited and does not tell if a register
    // is in use in lanes that are currently inactive. We can never be sure if
    // a register is actually in use in another lane, so we need to save all
    // used lanes of the chosen VGPR.
    assert(RS && "Cannot spill SGPR to memory without RegScavenger");
    TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false);

    // Reserve temporary stack slot
    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);

      // Found a register that is dead in the currently active lanes, we only
      // need to spill inactive lanes.
      // Pick v0 because it doesn't make a difference.
      TmpVGPR = AMDGPU::VGPR0;

      // Try to scavenge SGPRs to save exec
      assert(!SavedExecReg && "Exec is already saved, refuse to save again");
      const TargetRegisterClass &RC =
          IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
      RS->setRegUsed(SuperReg);
      SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false);

      int64_t VGPRLanes = getPerVGPRData().VGPRLanes;

        RS->setRegUsed(SavedExecReg);
        // Set exec to needed lanes
        BuildMI(MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
        auto I =
            BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
          I.addReg(TmpVGPR, RegState::ImplicitDefine);
        // Spill needed lanes
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);

        // Spill active lanes
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
        // Spill inactive lanes
        auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
          I.addReg(TmpVGPR, RegState::ImplicitDefine);
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
  // Writes these instructions if an SGPR can be scavenged:
  // buffer_load_dword v1    ; Write scavenged VGPR to emergency slot
  // s_waitcnt vmcnt(0)      ; If a free VGPR was found
  // s_mov_b64 exec, s[6:7]  ; Restore exec
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_load_dword v0    ; Restore inactive lanes
  // s_waitcnt vmcnt(0)      ; If a free VGPR was found
  // s_not_b64 exec, exec
  // buffer_load_dword v0    ; Only if no free VGPR was found

      // Restore used lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
        auto I = BuildMI(MBB, MI, DL, TII.get(MovOpc), ExecReg)
                     .addReg(SavedExecReg, RegState::Kill);
        // Add an implicit use of the load so it is not dead.
        // FIXME This inserts an unnecessary waitcnt
          I.addReg(TmpVGPR, RegState::ImplicitKill);

      // Restore inactive lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
        auto I = BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
          I.addReg(TmpVGPR, RegState::ImplicitKill);

      // Restore active lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
  // Write TmpVGPR to memory or read TmpVGPR from memory.
  // Either using a single buffer_load/store if exec is set to the needed mask
  void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);

      // Spill active lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
      // Spill inactive lanes
      BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
      BuildMI(MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
      SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {

  assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
         getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
         (getSubRegIndexLaneMask(AMDGPU::lo16) |
          getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
             getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
         "getNumCoveredRegs() will not work with generated subreg masks!");

  RegPressureIgnoredUnits.resize(getNumRegUnits());
  RegPressureIgnoredUnits.set(
      *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this));
  for (auto Reg : AMDGPU::VGPR_HI16RegClass)
    RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegSplitPartsFlag;

  static auto InitializeRegSplitPartsOnce = [this]() {
    for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
      unsigned Size = getSubRegIdxSize(Idx);
      std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
      unsigned Pos = getSubRegIdxOffset(Idx);
      unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
      Vec.resize(MaxNumParts);

  static llvm::once_flag InitializeSubRegFromChannelTableFlag;

  static auto InitializeSubRegFromChannelTableOnce = [this]() {
    for (auto &Row : SubRegFromChannelTable)
      Row.fill(AMDGPU::NoSubRegister);
    for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
      unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
      unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
      assert(Width < SubRegFromChannelTableWidthMap.size());
      Width = SubRegFromChannelTableWidthMap[Width];
      unsigned TableIdx = Width - 1;
      assert(TableIdx < SubRegFromChannelTable.size());
      assert(Offset < SubRegFromChannelTable[TableIdx].size());
      SubRegFromChannelTable[TableIdx][Offset] = Idx;

  llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
  llvm::call_once(InitializeSubRegFromChannelTableFlag,
                  InitializeSubRegFromChannelTableOnce);
void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
                                           MCRegister Reg) const {
  MCRegAliasIterator R(Reg, this, true);
  for (; R.isValid(); ++R)
    Reserved.set(*R);
}
// Forced to be here by one .inc
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
    const MachineFunction *MF) const {
  CallingConv::ID CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::Fast:
  case CallingConv::Cold:
  case CallingConv::AMDGPU_Gfx:
    return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
               ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList
               : CSR_AMDGPU_HighRegs_SaveList;
  default: {
    // Dummy to not crash RegisterClassInfo.
    static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
    return &NoCalleeSavedReg;
  }
  }
}
const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {

const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                     CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::Fast:
  case CallingConv::Cold:
  case CallingConv::AMDGPU_Gfx:
    return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
               ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask
               : CSR_AMDGPU_HighRegs_RegMask;

const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
  return CSR_AMDGPU_NoRegs_RegMask;
}
Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
  const SIFrameLowering *TFI =
      MF.getSubtarget<GCNSubtarget>().getFrameLowering();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // During ISel lowering we always reserve the stack pointer in entry
  // functions, but never actually want to reference it when accessing our own
  // frame. If we need a frame pointer we use it, but otherwise we can just use
  // an immediate "0" which we represent by returning NoRegister.
  if (FuncInfo->isEntryFunction()) {
    return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
  }
  return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
                        : FuncInfo->getStackPtrOffsetReg();
}
bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
  // When we need stack realignment, we can't reference off of the
  // stack pointer, so we reserve a base pointer.
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.getNumFixedObjects() && shouldRealignStack(MF);
}

Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
  return CSR_AMDGPU_AllVGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
  return CSR_AMDGPU_AllAGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
  return CSR_AMDGPU_AllVectorRegs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
  return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
}
unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
                                              unsigned NumRegs) {
  assert(NumRegs < SubRegFromChannelTableWidthMap.size());
  unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
  assert(NumRegIndex && "Not implemented");
  assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
  return SubRegFromChannelTable[NumRegIndex - 1][Channel];
}
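
// Usage sketch (illustrative): getSubRegFromChannel(2, 2) maps NumRegs == 2
// through SubRegFromChannelTableWidthMap to row 1 of SubRegFromChannelTable
// and returns the sub-register index of the 64-bit slice that starts at
// channel 2, i.e. lanes 2-3 of the wide register.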
MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
    const MachineFunction &MF) const {
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
  MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
}
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::MODE);

  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);
  // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
  // will result in bugs.
  if (isWave32) {
    Reserved.set(AMDGPU::VCC);
    Reserved.set(AMDGPU::VCC_HI);
  }

  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
  // TODO: In an entry function without calls and AGPRs used it is possible
  //       to use the whole register budget for VGPRs. Even more it shall
  //       be possible to estimate maximum AGPR/VGPR pressure and split
  //       register file accordingly.
  if (ST.hasGFX90AInsts())

  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
    Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }
  for (auto Reg : AMDGPU::SReg_32RegClass) {
    Reserved.set(getSubReg(Reg, AMDGPU::hi16));
    Register Low = getSubReg(Reg, AMDGPU::lo16);
    // This is to prevent BB vcc liveness errors.
    if (!AMDGPU::SGPR_LO16RegClass.contains(Low))
      Reserved.set(Low);
  }

  for (auto Reg : AMDGPU::AGPR_32RegClass) {
    Reserved.set(getSubReg(Reg, AMDGPU::hi16));
  }

  // Reserve all remaining AGPRs if there are no instructions to use them.
  if (!ST.hasMAIInsts()) {
    for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
      unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
      reserveRegisterTuples(Reserved, Reg);
    }
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  Register ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
  }

  // We have to assume the SP is needed in case there are calls in the function,
  // which is detected after the function is lowered. If we aren't really going
  // to need SP, don't bother reserving it.
  MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));

  MCRegister FrameReg = MFI->getFrameOffsetReg();
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));

  if (hasBasePointer(MF)) {
    MCRegister BasePtrReg = getBaseRegister();
    reserveRegisterTuples(Reserved, BasePtrReg);
    assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
  }

  for (auto Reg : MFI->WWMReservedRegs) {
    reserveRegisterTuples(Reserved, Reg.first);
  }

  // Reserve VGPRs used for SGPR spilling.
  // Note we treat freezeReservedRegs unusually because we run register
  // allocation in two phases. It's OK to re-freeze with new registers for the
  // second run.
  for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) {
    for (auto &SpilledVGPR : SpilledFI.second)
      reserveRegisterTuples(Reserved, SpilledVGPR.VGPR);
  }

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (auto SSpill : MFI->getSGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, SSpill.VGPR);

  return Reserved;
}
bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry, the base address is 0, so it can't possibly need any more
  // alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isEntryFunction())
    return false;

  return TargetRegisterInfo::shouldRealignStack(MF);
}

bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}
bool SIRegisterInfo::requiresFrameIndexScavenging(
    const MachineFunction &MF) const {
  // Do not use frame virtual registers. They used to be used for SGPRs, but
  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
  // spill.
  return false;
}

bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
    const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasStackObjects();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
    const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}
int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return 0;

  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::vaddr) ||
          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                             AMDGPU::OpName::saddr))) &&
         "Should never see frame index on non-address operand");

  return getScratchInstrOffset(MI);
}
bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t FullOffset = Offset + getScratchInstrOffset(MI);

  if (SIInstrInfo::isMUBUF(*MI))
    return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);

  const SIInstrInfo *TII = ST.getInstrInfo();
  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                 SIInstrFlags::FlatScratch);
}
Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                      int FrameIdx,
                                                      int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
                                           : AMDGPU::V_MOV_B32_e32;

  Register BaseReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
                             : &AMDGPU::VGPR_32RegClass);

    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
        .addFrameIndex(FrameIdx);

  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register FIReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
                             : &AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
      .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
      .addFrameIndex(FrameIdx);

  if (ST.enableFlatScratch()) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
        .addReg(OffsetReg, RegState::Kill)

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
      .addReg(OffsetReg, RegState::Kill)
      .addImm(0); // clamp bit
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                       int64_t Offset) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool IsFlat = TII->isFLATScratch(MI);

  // FIXME: Is it possible to be storing a frame index to itself?
  for (const MachineOperand &MO : MI.operands()) {
      llvm_unreachable("should not see multiple frame indices");

  MachineOperand *FIOp =
      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
                                      : AMDGPU::OpName::vaddr);

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;

  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));

  if (IsFlat) {
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                  SIInstrFlags::FlatScratch) &&
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

  MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  assert(SOffset->isImm() && SOffset->getImm() == 0);

  assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
         "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t NewOffset = Offset + getScratchInstrOffset(MI);

  if (SIInstrInfo::isMUBUF(*MI))
    return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);

  const SIInstrInfo *TII = ST.getInstrInfo();
  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                SIInstrFlags::FlatScratch);
}
const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
    const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_V224_SAVE:
  case AMDGPU::SI_SPILL_V224_RESTORE:
  case AMDGPU::SI_SPILL_A224_SAVE:
  case AMDGPU::SI_SPILL_A224_RESTORE:
    return 7;
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
    return 6;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}
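
// For reference (the values follow directly from the register widths): the
// *_V256_* spill pseudos above cover 256 bits, i.e. 8 32-bit sub-registers,
// which is the value getNumSubRegsForSpillOp() returns for them and which
// feeds the addToSpilledVGPRs() bookkeeping in eliminateFrameIndex() below.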
static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}
static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           int Index, unsigned Lane,
                                           unsigned ValueReg, bool IsKill) {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg))
                     ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
                     : AMDGPU::V_ACCVGPR_READ_B32_e64;

  auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
                 .addReg(Src, getKillRegState(IsKill));
  MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
  return MIB;
}
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))

  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}
static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
                                          unsigned LoadStoreOp,
                                          unsigned EltSize) {
  bool IsStore = TII->get(LoadStoreOp).mayStore();
      AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
      AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0;

  switch (EltSize) {
  case 4:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
    break;
  case 8:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
    break;
  case 12:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
    break;
  case 16:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
    break;
  default:
    llvm_unreachable("Unexpected spill load/store size!");
  }

    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
void SIRegisterInfo::buildSpillLoadStore(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
    MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
    RegScavenger *RS, LivePhysRegs *LiveRegs) const {
  assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both");

  MachineFunction *MF = MBB.getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
  const DebugLoc &DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
  bool IsStore = Desc->mayStore();
  bool IsFlat = TII->isFLATScratch(LoadStoreOp);

  bool Scavenged = false;
  MCRegister SOffset = ScratchOffsetReg;

  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
  const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC);
  const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;

  // Always use 4 byte operations for AGPRs because we need to scavenge
  // a temporary VGPR.
  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  unsigned NumSubRegs = RegWidth / EltSize;
  unsigned Size = NumSubRegs * EltSize;
  unsigned RemSize = RegWidth - Size;
  unsigned NumRemSubRegs = RemSize ? 1 : 0;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  int64_t MaxOffset = Offset + Size + RemSize - EltSize;
  int64_t ScratchOffsetRegDelta = 0;
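
  // Worked example (illustrative): a 224-bit (28-byte) VGPR tuple spilled via
  // flat scratch gives EltSize == 16, NumSubRegs == 1, Size == 16,
  // RemSize == 12 and NumRemSubRegs == 1, i.e. one 16-byte access plus one
  // narrower remainder access; with MUBUF the same tuple uses EltSize == 4 and
  // seven 4-byte accesses.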
  if (IsFlat && EltSize > 4) {
    LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
    Desc = &TII->get(LoadStoreOp);
  }

  Align Alignment = MFI.getObjectAlign(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  assert((IsFlat || ((Offset % EltSize) == 0)) &&
         "unexpected VGPR spill offset");

  bool IsOffsetLegal =
      IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                      SIInstrFlags::FlatScratch)
             : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
  if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
    SOffset = MCRegister();

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    Offset *= ST.getWavefrontSize();
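
    // Illustrative numbers: on a wave64 subtarget an unswizzled object offset
    // of 16 bytes becomes 16 * 64 == 1024 bytes in the swizzled, per-lane
    // interleaved scratch layout that the generated instructions address.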
    // We don't have access to the register scavenger if this function is called
    // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case.
    if (RS) {
      SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
    } else if (LiveRegs) {
      for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
        if (LiveRegs->available(MF->getRegInfo(), Reg)) {

      // There are no free SGPRs, and we are in the process of spilling
      // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
      // on SI/CI and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset or StackPtrOffset
      // register, and then subtract the offset after the spill to return the
      // register to its original value.
      if (!ScratchOffsetReg)
        ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;

      report_fatal_error("could not scavenge SGPR to spill in entry function");

    if (ScratchOffsetReg == AMDGPU::NoRegister) {
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
    } else {
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
          .addReg(ScratchOffsetReg)
          .addImm(Offset);

  if (IsFlat && SOffset == AMDGPU::NoRegister) {
    assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
           && "Unexpected vaddr for flat scratch with a FI operand");

    assert(ST.hasFlatScratchSTMode());
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
    Desc = &TII->get(LoadStoreOp);
  }
  for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
       ++i, RegOffset += EltSize) {
    if (i == NumSubRegs) {
      EltSize = RemSize;
      LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
      Desc = &TII->get(LoadStoreOp);
    }

    unsigned NumRegs = EltSize / 4;
    Register SubReg = e == 1
        ? ValueReg
        : Register(getSubReg(ValueReg,
                             getSubRegFromChannel(RegOffset / 4, NumRegs)));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);

    // Make sure the whole register is defined if there are undef components by
    // adding an implicit def of the super-reg on the first instruction.
    bool NeedSuperRegDef = e > 1 && IsStore && i == 0;
    bool NeedSuperRegImpOperand = e > 1;

    unsigned Lane = RegOffset / 4;
    unsigned LaneE = (RegOffset + EltSize) / 4;
    for ( ; Lane != LaneE; ++Lane) {
      bool IsSubReg = e > 1 || EltSize > 4;
      Register Sub = IsSubReg
          ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
          : ValueReg;
      auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
      if (!MIB.getInstr())
        break;
      if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) {
        MIB.addReg(ValueReg, RegState::ImplicitDefine);
        NeedSuperRegDef = false;
      }
      if (IsSubReg || NeedSuperRegImpOperand) {
        NeedSuperRegImpOperand = true;
        unsigned State = SrcDstRegState;
        if (Lane + 1 != LaneE)
          State &= ~RegState::Kill;
        MIB.addReg(ValueReg, RegState::Implicit | State);
      }
    }

    if (Lane == LaneE) // Fully spilled into AGPRs.
      continue;

    // Offset in bytes from the beginning of the ValueReg to its portion we
    // still need to spill. It may differ from RegOffset if a portion of
    // current SubReg has been already spilled into AGPRs by the loop above.
    unsigned RemRegOffset = Lane * 4;
    unsigned RemEltSize = EltSize - (RemRegOffset - RegOffset);
    if (RemEltSize != EltSize) { // Partially spilled to AGPRs
      assert(IsFlat && EltSize > 4);

      unsigned NumRegs = RemEltSize / 4;
      SubReg = Register(getSubReg(ValueReg,
                        getSubRegFromChannel(RemRegOffset / 4, NumRegs)));
      unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
      Desc = &TII->get(Opc);
    }
    unsigned FinalReg = SubReg;

      assert(EltSize == 4);

        assert(RS && "Needs to have RegScavenger to spill an AGPR!");
        // FIXME: change to scavengeRegisterBackwards()
        TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
        RS->setRegUsed(TmpReg);

        auto AccRead = BuildMI(MBB, MI, DL,
                               TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg)
                           .addReg(SubReg, getKillRegState(IsKill));
        if (NeedSuperRegDef)
          AccRead.addReg(ValueReg, RegState::ImplicitDefine);
        AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);

    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RemRegOffset);
    MachineMemOperand *NewMMO =
        MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
                                 commonAlignment(Alignment, RemRegOffset));

    auto MIB = BuildMI(MBB, MI, DL, *Desc)
                   .addReg(SubReg,
                           getDefRegState(!IsStore) | getKillRegState(IsKill));
      MIB.addReg(FuncInfo->getScratchRSrcReg());

    if (SOffset == AMDGPU::NoRegister) {
      MIB.addReg(SOffset, SOffsetRegState);
    MIB.addImm(Offset + RemRegOffset)
    MIB.addImm(0) // tfe
    MIB.addMemOperand(NewMMO);

    if (!IsAGPR && NeedSuperRegDef)
      MIB.addReg(ValueReg, RegState::ImplicitDefine);

    if (!IsStore && TmpReg != AMDGPU::NoRegister) {
      MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
                    FinalReg)
                .addReg(TmpReg, RegState::Kill);
      MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);

    if (NeedSuperRegImpOperand)
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);

  if (ScratchOffsetRegDelta != 0) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
        .addImm(-ScratchOffsetRegDelta);
void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
                                             int Offset, bool IsLoad,
                                             bool IsKill) const {
  MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
  assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);

  Register FrameReg =
      FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
          ? getBaseRegister()
          : getFrameRegister(SB.MF);

  Align Alignment = FrameInfo.getObjectAlign(Index);
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
  MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
      PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
      SB.EltSize, Alignment);

  if (IsLoad) {
    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                          : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
    buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, false, FrameReg,
                        Offset * SB.EltSize, MMO, SB.RS);
  } else {
    unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                          : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
    buildSpillLoadStore(SB.MBB, SB.MI, Opc, Index, SB.TmpVGPR, IsKill, FrameReg,
                        Offset * SB.EltSize, MMO, SB.RS);
    // This only ever adds one VGPR spill
    SB.MFI.addToSpilledVGPRs(1);
  }
}
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
                               RegScavenger *RS, LiveIntervals *LIS,
                               bool OnlyToVGPR) const {
  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
      SB.MFI.getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
                         SB.SuperReg != SB.MFI.getFrameOffsetReg()));

  if (SpillToVGPR) {
    for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];

      bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1;

      // Mark the "old value of vgpr" input undef only if this is the first sgpr
      // spill to this specific vgpr in the first basic block.
      auto MIB = BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
                         Spill.VGPR)
                     .addReg(SubReg, getKillRegState(UseKill))
                     .addImm(Spill.Lane)
                     .addReg(Spill.VGPR);

        LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
        LIS->InsertMachineInstrInMaps(*MIB);

      if (i == 0 && SB.NumSubRegs > 1) {
        // We may be spilling a super-register which is only partially defined,
        // and need to ensure later spills think the value is defined.
        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
      }

      if (SB.NumSubRegs > 1)
        MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);

      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
  } else {
    // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
    unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);

    // Per VGPR helper data
    auto PVD = SB.getPerVGPRData();

    for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
      unsigned TmpVGPRFlags = RegState::Undef;

      // Write sub registers into the VGPR
      for (unsigned i = Offset * PVD.PerVGPR,
                    e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
           i < e; ++i) {
        Register SubReg =
            SB.NumSubRegs == 1
                ? SB.SuperReg
                : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

        MachineInstrBuilder WriteLane =
            BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
                    SB.TmpVGPR)
                .addReg(SubReg, SubKillState)
                .addImm(i % PVD.PerVGPR)
                .addReg(SB.TmpVGPR, TmpVGPRFlags);

          LIS->ReplaceMachineInstrInMaps(*MI, *WriteLane);
          LIS->InsertMachineInstrInMaps(*WriteLane);

        // There could be undef components of a spilled super register.
        // TODO: Can we detect this and skip the spill?
        if (SB.NumSubRegs > 1) {
          // The last implicit use of the SB.SuperReg carries the "Kill" flag.
          unsigned SuperKillState = 0;
          if (i + 1 == SB.NumSubRegs)
            SuperKillState |= getKillRegState(SB.IsKill);
          WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
        }

      SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);

  MI->eraseFromParent();
  SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);

    LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);

  return true;
}
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
                                 RegScavenger *RS, LiveIntervals *LIS,
                                 bool OnlyToVGPR) const {
  SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
      SB.MFI.getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  if (SpillToVGPR) {
    for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
      Register SubReg =
          SB.NumSubRegs == 1
              ? SB.SuperReg
              : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
      auto MIB =
          BuildMI(SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
              .addReg(Spill.VGPR)
              .addImm(Spill.Lane);
      if (SB.NumSubRegs > 1 && i == 0)
        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);

        LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
        LIS->InsertMachineInstrInMaps(*MIB);
  } else {
    // Per VGPR helper data
    auto PVD = SB.getPerVGPRData();

    for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
      // Load in VGPR data
      SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);

      for (unsigned i = Offset * PVD.PerVGPR,
                    e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
           i < e; ++i) {
        Register SubReg =
            SB.NumSubRegs == 1
                ? SB.SuperReg
                : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));

        bool LastSubReg = (i + 1 == e);
        auto MIB = BuildMI(SB.MBB, MI, SB.DL,
                           SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
                       .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
        if (SB.NumSubRegs > 1 && i == 0)
          MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);

          LIS->ReplaceMachineInstrInMaps(*MI, *MIB);
          LIS->InsertMachineInstrInMaps(*MIB);

  MI->eraseFromParent();

    LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);

  return true;
}
/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
/// a VGPR and the stack slot can be safely eliminated when all other users are
/// handled.
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
    MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
    LiveIntervals *LIS) const {
  switch (MI->getOpcode()) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE:
    return spillSGPR(MI, FI, RS, LIS, true);
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
    return restoreSGPR(MI, FI, RS, LIS, true);
  default:
    llvm_unreachable("not an SGPR spill instruction");
  }
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                         int SPAdj, unsigned FIOperandNum,
                                         RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
                          ? getBaseRegister()
                          : getFrameRegister(*MF);
->getOpcode()) {
1575 // SGPR register spill
1576 case AMDGPU::SI_SPILL_S1024_SAVE
:
1577 case AMDGPU::SI_SPILL_S512_SAVE
:
1578 case AMDGPU::SI_SPILL_S256_SAVE
:
1579 case AMDGPU::SI_SPILL_S224_SAVE
:
1580 case AMDGPU::SI_SPILL_S192_SAVE
:
1581 case AMDGPU::SI_SPILL_S160_SAVE
:
1582 case AMDGPU::SI_SPILL_S128_SAVE
:
1583 case AMDGPU::SI_SPILL_S96_SAVE
:
1584 case AMDGPU::SI_SPILL_S64_SAVE
:
1585 case AMDGPU::SI_SPILL_S32_SAVE
: {
1586 spillSGPR(MI
, Index
, RS
);
1590 // SGPR register restore
1591 case AMDGPU::SI_SPILL_S1024_RESTORE
:
1592 case AMDGPU::SI_SPILL_S512_RESTORE
:
1593 case AMDGPU::SI_SPILL_S256_RESTORE
:
1594 case AMDGPU::SI_SPILL_S224_RESTORE
:
1595 case AMDGPU::SI_SPILL_S192_RESTORE
:
1596 case AMDGPU::SI_SPILL_S160_RESTORE
:
1597 case AMDGPU::SI_SPILL_S128_RESTORE
:
1598 case AMDGPU::SI_SPILL_S96_RESTORE
:
1599 case AMDGPU::SI_SPILL_S64_RESTORE
:
1600 case AMDGPU::SI_SPILL_S32_RESTORE
: {
1601 restoreSGPR(MI
, Index
, RS
);
1605 // VGPR register spill
1606 case AMDGPU::SI_SPILL_V1024_SAVE
:
1607 case AMDGPU::SI_SPILL_V512_SAVE
:
1608 case AMDGPU::SI_SPILL_V256_SAVE
:
1609 case AMDGPU::SI_SPILL_V224_SAVE
:
1610 case AMDGPU::SI_SPILL_V192_SAVE
:
1611 case AMDGPU::SI_SPILL_V160_SAVE
:
1612 case AMDGPU::SI_SPILL_V128_SAVE
:
1613 case AMDGPU::SI_SPILL_V96_SAVE
:
1614 case AMDGPU::SI_SPILL_V64_SAVE
:
1615 case AMDGPU::SI_SPILL_V32_SAVE
:
1616 case AMDGPU::SI_SPILL_A1024_SAVE
:
1617 case AMDGPU::SI_SPILL_A512_SAVE
:
1618 case AMDGPU::SI_SPILL_A256_SAVE
:
1619 case AMDGPU::SI_SPILL_A224_SAVE
:
1620 case AMDGPU::SI_SPILL_A192_SAVE
:
1621 case AMDGPU::SI_SPILL_A160_SAVE
:
1622 case AMDGPU::SI_SPILL_A128_SAVE
:
1623 case AMDGPU::SI_SPILL_A96_SAVE
:
1624 case AMDGPU::SI_SPILL_A64_SAVE
:
1625 case AMDGPU::SI_SPILL_A32_SAVE
: {
1626 const MachineOperand
*VData
= TII
->getNamedOperand(*MI
,
1627 AMDGPU::OpName::vdata
);
1628 assert(TII
->getNamedOperand(*MI
, AMDGPU::OpName::soffset
)->getReg() ==
1629 MFI
->getStackPtrOffsetReg());
1631 unsigned Opc
= ST
.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1632 : AMDGPU::BUFFER_STORE_DWORD_OFFSET
;
1633 auto *MBB
= MI
->getParent();
1634 buildSpillLoadStore(
1635 *MBB
, MI
, Opc
, Index
, VData
->getReg(), VData
->isKill(), FrameReg
,
1636 TII
->getNamedOperand(*MI
, AMDGPU::OpName::offset
)->getImm(),
1637 *MI
->memoperands_begin(), RS
);
1638 MFI
->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI
->getOpcode()));
1639 MI
->eraseFromParent();
1642 case AMDGPU::SI_SPILL_V32_RESTORE
:
1643 case AMDGPU::SI_SPILL_V64_RESTORE
:
1644 case AMDGPU::SI_SPILL_V96_RESTORE
:
1645 case AMDGPU::SI_SPILL_V128_RESTORE
:
1646 case AMDGPU::SI_SPILL_V160_RESTORE
:
1647 case AMDGPU::SI_SPILL_V192_RESTORE
:
1648 case AMDGPU::SI_SPILL_V224_RESTORE
:
1649 case AMDGPU::SI_SPILL_V256_RESTORE
:
1650 case AMDGPU::SI_SPILL_V512_RESTORE
:
1651 case AMDGPU::SI_SPILL_V1024_RESTORE
:
1652 case AMDGPU::SI_SPILL_A32_RESTORE
:
1653 case AMDGPU::SI_SPILL_A64_RESTORE
:
1654 case AMDGPU::SI_SPILL_A96_RESTORE
:
1655 case AMDGPU::SI_SPILL_A128_RESTORE
:
1656 case AMDGPU::SI_SPILL_A160_RESTORE
:
1657 case AMDGPU::SI_SPILL_A192_RESTORE
:
1658 case AMDGPU::SI_SPILL_A224_RESTORE
:
1659 case AMDGPU::SI_SPILL_A256_RESTORE
:
1660 case AMDGPU::SI_SPILL_A512_RESTORE
:
1661 case AMDGPU::SI_SPILL_A1024_RESTORE
: {
1662 const MachineOperand
*VData
= TII
->getNamedOperand(*MI
,
1663 AMDGPU::OpName::vdata
);
1664 assert(TII
->getNamedOperand(*MI
, AMDGPU::OpName::soffset
)->getReg() ==
1665 MFI
->getStackPtrOffsetReg());
1667 unsigned Opc
= ST
.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1668 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET
;
1669 auto *MBB
= MI
->getParent();
1670 buildSpillLoadStore(
1671 *MBB
, MI
, Opc
, Index
, VData
->getReg(), VData
->isKill(), FrameReg
,
1672 TII
->getNamedOperand(*MI
, AMDGPU::OpName::offset
)->getImm(),
1673 *MI
->memoperands_begin(), RS
);
1674 MI
->eraseFromParent();
1679 // Other access to frame index
1680 const DebugLoc
&DL
= MI
->getDebugLoc();
1682 int64_t Offset
= FrameInfo
.getObjectOffset(Index
);
1683 if (ST
.enableFlatScratch()) {
1684 if (TII
->isFLATScratch(*MI
)) {
1685 assert((int16_t)FIOperandNum
==
1686 AMDGPU::getNamedOperandIdx(MI
->getOpcode(),
1687 AMDGPU::OpName::saddr
));
1689 // The offset is always swizzled, just replace it
1691 FIOp
.ChangeToRegister(FrameReg
, false);
1696 MachineOperand
*OffsetOp
=
1697 TII
->getNamedOperand(*MI
, AMDGPU::OpName::offset
);
1698 int64_t NewOffset
= Offset
+ OffsetOp
->getImm();
1699 if (TII
->isLegalFLATOffset(NewOffset
, AMDGPUAS::PRIVATE_ADDRESS
,
1700 SIInstrFlags::FlatScratch
)) {
1701 OffsetOp
->setImm(NewOffset
);
1707 assert(!TII
->getNamedOperand(*MI
, AMDGPU::OpName::vaddr
) &&
1708 "Unexpected vaddr for flat scratch with a FI operand");
1710 // On GFX10 we have ST mode to use no registers for an address.
1711 // Otherwise we need to materialize 0 into an SGPR.
1712 if (!Offset
&& ST
.hasFlatScratchSTMode()) {
1713 unsigned Opc
= MI
->getOpcode();
1714 unsigned NewOpc
= AMDGPU::getFlatScratchInstSTfromSS(Opc
);
1716 AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::saddr
));
1717 MI
->setDesc(TII
->get(NewOpc
));
1723 FIOp
.ChangeToImmediate(Offset
);
1724 if (TII
->isImmOperandLegal(*MI
, FIOperandNum
, FIOp
))
      // We need to use a register here. Check if we can use an SGPR or need
      // a VGPR.
      FIOp.ChangeToRegister(AMDGPU::M0, false);
      bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);

      if (!Offset && FrameReg && UseSGPR) {
        FIOp.setReg(FrameReg);
        return;
      }

      const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
                                              : &AMDGPU::VGPR_32RegClass;

      Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
      FIOp.setReg(TmpReg);
      FIOp.setIsKill(true);

      if ((!FrameReg || !Offset) && TmpReg) {
        unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
        auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
        MIB.addReg(FrameReg);
        return;
      }

      Register TmpSReg =
          UseSGPR ? TmpReg
                  : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
                                         !UseSGPR);

      // TODO: for flat scratch another attempt can be made with a VGPR index
      //       if no SGPRs can be scavenged.
      if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
        report_fatal_error("Cannot scavenge register in FI elimination!");

      if (!TmpSReg) {
        // Use frame register and restore it after.
        TmpSReg = FrameReg;
        FIOp.setReg(FrameReg);
        FIOp.setIsKill(false);
      }

      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
          .addReg(FrameReg)
          .addImm(Offset);

      if (!UseSGPR)
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
            .addReg(TmpSReg, RegState::Kill);

      if (TmpSReg == FrameReg) {
        // Undo frame register modification.
        BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
                FrameReg)
            .addReg(FrameReg)
            .addImm(-Offset);
      }

      return;
    }
    bool IsMUBUF = TII->isMUBUF(*MI);

    if (!IsMUBUF && !MFI->isEntryFunction()) {
      // Convert to a swizzled stack address by scaling by the wave size.
      //
      // In an entry function/kernel the offset is already swizzled.

      bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
      Register ResultReg =
          IsCopy ? MI->getOperand(0).getReg()
                 : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      if (!Offset) {
        // XXX - This never happens because of emergency scavenging slot at 0?
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
            .addImm(ST.getWavefrontSizeLog2())
            .addReg(FrameReg);
      } else {
        if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
          // Reuse ResultReg in intermediate step.
          Register ScaledReg = ResultReg;

          BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
                  ScaledReg)
              .addImm(ST.getWavefrontSizeLog2())
              .addReg(FrameReg);
          const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;

          // TODO: Fold if use instruction is another add of a constant.
          if (IsVOP2 ||
              AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
            // FIXME: This can fail
            MIB.addImm(Offset);
            MIB.addReg(ScaledReg, RegState::Kill);
            if (!IsVOP2)
              MIB.addImm(0); // clamp bit
          } else {
            assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
                   "Need to reuse carry out register");

            // Use scavenged unused carry out as offset register.
            Register ConstOffsetReg;
            if (isWave32)
              ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
            else
              ConstOffsetReg = MIB.getReg(1);

            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
                .addImm(Offset);
            MIB.addReg(ConstOffsetReg, RegState::Kill);
            MIB.addReg(ScaledReg, RegState::Kill);
            MIB.addImm(0); // clamp bit
          }
        } else {
          // We have to produce a carry out, and there isn't a free SGPR pair
          // for it. We can keep the whole computation on the SALU to avoid
          // clobbering an additional register at the cost of an extra mov.
          //
          // We may have 1 free scratch SGPR even though a carry out is
          // unavailable. Only one additional mov is needed.
          Register TmpScaledReg =
              RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
          Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;

          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
              .addReg(FrameReg)
              .addImm(ST.getWavefrontSizeLog2());
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
              .addReg(ScaledReg, RegState::Kill)
              .addImm(Offset);
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
              .addReg(ScaledReg, RegState::Kill);

          // If there were truly no free SGPRs, we need to undo everything.
          if (!TmpScaledReg.isValid()) {
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
                .addReg(ScaledReg, RegState::Kill)
                .addImm(-Offset);
            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
                .addReg(ScaledReg, RegState::Kill)
                .addImm(ST.getWavefrontSizeLog2());
          }
        }
      }

      // Don't introduce an extra copy if we're just materializing in a mov.
      if (IsCopy)
        MI->eraseFromParent();
      else
        FIOp.ChangeToRegister(ResultReg, false, false, true);
      return;
    }
    if (IsMUBUF) {
      // Disable offen so we don't need a 0 vgpr base.
      assert(static_cast<int>(FIOperandNum) ==
             AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                        AMDGPU::OpName::vaddr));

      auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
      assert((SOffset.isImm() && SOffset.getImm() == 0));

      if (FrameReg != AMDGPU::NoRegister)
        SOffset.ChangeToRegister(FrameReg, false);

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      int64_t OldImm =
          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
      int64_t NewOffset = OldImm + Offset;

      if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
          buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
        MI->eraseFromParent();
        return;
      }
    }

    // If the offset is simply too big, don't convert to a scratch wave offset
    // relative index.

    FIOp.ChangeToImmediate(Offset);
    if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
      Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
      FIOp.ChangeToRegister(TmpReg, false, false, true);
    }
  }
  }
}
StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
  return AMDGPUInstPrinter::getRegisterName(Reg);
}
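// The helpers below map a value's size in bits to the narrowest register
// class that can hold it; e.g. 96 bits maps to VReg_96 and anything from
// 97 to 128 bits maps to VReg_128. The *_Align2 variants are selected when
// the subtarget requires even-aligned VGPR tuples.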
static const TargetRegisterClass *
getAnyVGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::VReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::VReg_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::VReg_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::VReg_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::VReg_192RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::VReg_224RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::VReg_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::VReg_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::VReg_1024RegClass;

  return nullptr;
}

static const TargetRegisterClass *
getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::VReg_64_Align2RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::VReg_96_Align2RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::VReg_128_Align2RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::VReg_160_Align2RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::VReg_192_Align2RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::VReg_224_Align2RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::VReg_256_Align2RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::VReg_512_Align2RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::VReg_1024_Align2RegClass;

  return nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth == 1)
    return &AMDGPU::VReg_1RegClass;
  if (BitWidth == 16)
    return &AMDGPU::VGPR_LO16RegClass;
  if (BitWidth == 32)
    return &AMDGPU::VGPR_32RegClass;
  return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
                                : getAnyVGPRClassForBitWidth(BitWidth);
}
static const TargetRegisterClass *
getAnyAGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AReg_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AReg_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AReg_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AReg_192RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::AReg_224RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AReg_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AReg_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AReg_1024RegClass;

  return nullptr;
}

static const TargetRegisterClass *
getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth <= 64)
    return &AMDGPU::AReg_64_Align2RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::AReg_96_Align2RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::AReg_128_Align2RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::AReg_160_Align2RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::AReg_192_Align2RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::AReg_224_Align2RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::AReg_256_Align2RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::AReg_512_Align2RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::AReg_1024_Align2RegClass;

  return nullptr;
}

const TargetRegisterClass *
SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
  if (BitWidth == 16)
    return &AMDGPU::AGPR_LO16RegClass;
  if (BitWidth == 32)
    return &AMDGPU::AGPR_32RegClass;
  return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
                                : getAnyAGPRClassForBitWidth(BitWidth);
}
const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
  if (BitWidth == 16)
    return &AMDGPU::SGPR_LO16RegClass;
  if (BitWidth == 32)
    return &AMDGPU::SReg_32RegClass;
  if (BitWidth <= 64)
    return &AMDGPU::SReg_64RegClass;
  if (BitWidth <= 96)
    return &AMDGPU::SGPR_96RegClass;
  if (BitWidth <= 128)
    return &AMDGPU::SGPR_128RegClass;
  if (BitWidth <= 160)
    return &AMDGPU::SGPR_160RegClass;
  if (BitWidth <= 192)
    return &AMDGPU::SGPR_192RegClass;
  if (BitWidth <= 224)
    return &AMDGPU::SGPR_224RegClass;
  if (BitWidth <= 256)
    return &AMDGPU::SGPR_256RegClass;
  if (BitWidth <= 512)
    return &AMDGPU::SGPR_512RegClass;
  if (BitWidth <= 1024)
    return &AMDGPU::SGPR_1024RegClass;

  return nullptr;
}
// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
const TargetRegisterClass *
SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
  static const TargetRegisterClass *const BaseClasses[] = {
    &AMDGPU::VGPR_LO16RegClass,
    &AMDGPU::VGPR_HI16RegClass,
    &AMDGPU::SReg_LO16RegClass,
    &AMDGPU::AGPR_LO16RegClass,
    &AMDGPU::VGPR_32RegClass,
    &AMDGPU::SReg_32RegClass,
    &AMDGPU::AGPR_32RegClass,
    &AMDGPU::VReg_64_Align2RegClass,
    &AMDGPU::VReg_64RegClass,
    &AMDGPU::SReg_64RegClass,
    &AMDGPU::AReg_64_Align2RegClass,
    &AMDGPU::AReg_64RegClass,
    &AMDGPU::VReg_96_Align2RegClass,
    &AMDGPU::VReg_96RegClass,
    &AMDGPU::SReg_96RegClass,
    &AMDGPU::AReg_96_Align2RegClass,
    &AMDGPU::AReg_96RegClass,
    &AMDGPU::VReg_128_Align2RegClass,
    &AMDGPU::VReg_128RegClass,
    &AMDGPU::SReg_128RegClass,
    &AMDGPU::AReg_128_Align2RegClass,
    &AMDGPU::AReg_128RegClass,
    &AMDGPU::VReg_160_Align2RegClass,
    &AMDGPU::VReg_160RegClass,
    &AMDGPU::SReg_160RegClass,
    &AMDGPU::AReg_160_Align2RegClass,
    &AMDGPU::AReg_160RegClass,
    &AMDGPU::VReg_192_Align2RegClass,
    &AMDGPU::VReg_192RegClass,
    &AMDGPU::SReg_192RegClass,
    &AMDGPU::AReg_192_Align2RegClass,
    &AMDGPU::AReg_192RegClass,
    &AMDGPU::VReg_224_Align2RegClass,
    &AMDGPU::VReg_224RegClass,
    &AMDGPU::SReg_224RegClass,
    &AMDGPU::AReg_224_Align2RegClass,
    &AMDGPU::AReg_224RegClass,
    &AMDGPU::VReg_256_Align2RegClass,
    &AMDGPU::VReg_256RegClass,
    &AMDGPU::SReg_256RegClass,
    &AMDGPU::AReg_256_Align2RegClass,
    &AMDGPU::AReg_256RegClass,
    &AMDGPU::VReg_512_Align2RegClass,
    &AMDGPU::VReg_512RegClass,
    &AMDGPU::SReg_512RegClass,
    &AMDGPU::AReg_512_Align2RegClass,
    &AMDGPU::AReg_512RegClass,
    &AMDGPU::SReg_1024RegClass,
    &AMDGPU::VReg_1024_Align2RegClass,
    &AMDGPU::VReg_1024RegClass,
    &AMDGPU::AReg_1024_Align2RegClass,
    &AMDGPU::AReg_1024RegClass,
    &AMDGPU::SCC_CLASSRegClass,
    &AMDGPU::Pseudo_SReg_32RegClass,
    &AMDGPU::Pseudo_SReg_128RegClass,
  };

  for (const TargetRegisterClass *BaseClass : BaseClasses) {
    if (BaseClass->contains(Reg)) {
      return BaseClass;
    }
  }
  return nullptr;
}
bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
                               Register Reg) const {
  const TargetRegisterClass *RC;
  if (Reg.isVirtual())
    RC = MRI.getRegClass(Reg);
  else
    RC = getPhysRegClass(Reg);
  return isSGPRClass(RC);
}
// TODO: It might be helpful to have some target specific flags in
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
  if (Size == 16)
    return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
           getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;

  const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
  if (!VRC) {
    assert(Size < 32 && "Invalid register class size");
    return false;
  }
  return getCommonSubClass(VRC, RC) != nullptr;
}

bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);

  const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
  if (!ARC) {
    assert(getVGPRClassForBitWidth(Size) && "Invalid register class size");
    return false;
  }
  return getCommonSubClass(ARC, RC) != nullptr;
}
const TargetRegisterClass *
SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
  assert(VRC && "Invalid register class size");
  return VRC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
  unsigned Size = getRegSizeInBits(*SRC);
  const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
  assert(ARC && "Invalid register class size");
  return ARC;
}

const TargetRegisterClass *
SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
  unsigned Size = getRegSizeInBits(*VRC);
  if (Size == 32)
    return &AMDGPU::SGPR_32RegClass;
  const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
  assert(SRC && "Invalid register class size");
  return SRC;
}
const TargetRegisterClass *
SIRegisterInfo::getSubRegClass(const TargetRegisterClass *RC,
                               unsigned SubIdx) const {
  if (SubIdx == AMDGPU::NoSubRegister)
    return RC;

  // We can assume that each lane corresponds to one 32-bit register.
  unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32;
  if (isSGPRClass(RC)) {
    if (Size == 32)
      RC = &AMDGPU::SGPR_32RegClass;
    else
      RC = getSGPRClassForBitWidth(Size);
  } else if (hasAGPRs(RC)) {
    RC = getAGPRClassForBitWidth(Size);
  } else {
    RC = getVGPRClassForBitWidth(Size);
  }
  assert(RC && "Invalid sub-register class size");
  return RC;
}
const TargetRegisterClass *
SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
                                         const TargetRegisterClass *SubRC,
                                         unsigned SubIdx) const {
  // Ensure this subregister index is aligned in the super register.
  const TargetRegisterClass *MatchRC =
      getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
  return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
}

bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
  if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
      OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
    return !ST.hasMFMAInlineLiteralBug();

  return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
         OpType <= AMDGPU::OPERAND_SRC_LAST;
}
bool SIRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
                                          unsigned DefSubReg,
                                          const TargetRegisterClass *SrcRC,
                                          unsigned SrcSubReg) const {
  // We want to prefer the smallest register class possible, so we don't want
  // to stop and rewrite on anything that looks like a subregister
  // extract. Operations mostly don't care about the super register class, so
  // we only want to stop on the most basic of copies between the same
  // register class.
  //
  // e.g. if we have something like
  //   %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
  //   %3 = COPY %2, sub0
  //
  // We want to look through the COPY to find the original source of the
  // value.
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
}

bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
  // TODO: 64-bit operands have extending behavior from 32-bit literal.
  return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
         OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
}
/// Returns the lowest register that is not used at any point in the function.
/// If all registers are used, then this function will return
/// AMDGPU::NoRegister. If \p ReserveHighestVGPR is true, the highest unused
/// register is returned instead.
MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
                                              const TargetRegisterClass *RC,
                                              const MachineFunction &MF,
                                              bool ReserveHighestVGPR) const {
  if (ReserveHighestVGPR) {
    for (MCRegister Reg : reverse(*RC))
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  } else {
    for (MCRegister Reg : *RC)
      if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
        return Reg;
  }
  return MCRegister();
}
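// getRegSplitParts returns the sub-register indices that split a register
// class into EltSize-byte pieces; e.g. splitting a 128-bit class into 8-byte
// elements yields two parts, the 64-bit halves sub0_sub1 and sub2_sub3.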
ArrayRef<int16_t>
SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                 unsigned EltSize) const {
  const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
  assert(RegBitWidth >= 32 && RegBitWidth <= 1024);

  const unsigned RegDWORDs = RegBitWidth / 32;
  const unsigned EltDWORDs = EltSize / 4;
  assert(RegSplitParts.size() + 1 >= EltDWORDs);

  const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
  const unsigned NumParts = RegDWORDs / EltDWORDs;

  return makeArrayRef(Parts.data(), NumParts);
}
const TargetRegisterClass *
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
                                  Register Reg) const {
  return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
}

bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && hasVGPRs(RC);
}

bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && hasAGPRs(RC);
}
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
                                    const TargetRegisterClass *SrcRC,
                                    unsigned SubReg,
                                    const TargetRegisterClass *DstRC,
                                    unsigned DstSubReg,
                                    const TargetRegisterClass *NewRC,
                                    LiveIntervals &LIS) const {
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
  unsigned DstSize = getRegSizeInBits(*DstRC);
  unsigned NewSize = getRegSizeInBits(*NewRC);

  // Do not increase size of registers beyond dword, we would need to allocate
  // adjacent registers and constrain regalloc more than needed.

  // Always allow dword coalescing.
  if (SrcSize <= 32 || DstSize <= 32)
    return true;

  return NewSize <= DstSize || NewSize <= SrcSize;
}
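// The register pressure limit is tied to occupancy: the achievable occupancy
// for this function's LDS usage bounds how many VGPRs/SGPRs the scheduler
// may consider available.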
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                             MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
                                                       MF.getFunction());
  switch (RC->getID()) {
  default:
    return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
  case AMDGPU::VGPR_32RegClassID:
  case AMDGPU::VGPR_LO16RegClassID:
  case AMDGPU::VGPR_HI16RegClassID:
    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
  case AMDGPU::SGPR_32RegClassID:
  case AMDGPU::SGPR_LO16RegClassID:
    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
  }
}
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                unsigned Idx) const {
  if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
      Idx == AMDGPU::RegisterPressureSets::AGPR_32)
    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  llvm_unreachable("Unexpected register pressure set!");
}

const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
  static const int Empty[] = { -1 };

  if (RegPressureIgnoredUnits[RegUnit])
    return Empty;

  return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}

MCRegister
SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
  // Not a callee saved register.
  return AMDGPU::SGPR30_SGPR31;
}
const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
                                         const RegisterBank &RB,
                                         const MachineRegisterInfo &MRI) const {
  switch (RB.getID()) {
  case AMDGPU::VGPRRegBankID:
    return getVGPRClassForBitWidth(std::max(32u, Size));
  case AMDGPU::VCCRegBankID:
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case AMDGPU::SGPRRegBankID:
    return getSGPRClassForBitWidth(std::max(32u, Size));
  case AMDGPU::AGPRRegBankID:
    return getAGPRClassForBitWidth(std::max(32u, Size));
  default:
    llvm_unreachable("unknown register bank");
  }
}
const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                         const MachineRegisterInfo &MRI) const {
  const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
  if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank *>())
    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);

  const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass *>();
  return getAllocatableClass(RC);
}

MCRegister SIRegisterInfo::getVCC() const {
  return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}

const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
  // VGPR tuples have an alignment requirement on gfx90a variants.
  return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
                                : &AMDGPU::VReg_64RegClass;
}
const TargetRegisterClass *SIRegisterInfo::getRegClass(unsigned RCID) const {
  switch ((int)RCID) {
  case AMDGPU::SReg_1RegClassID:
  case AMDGPU::SReg_1_XEXECRegClassID:
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  default:
    return AMDGPUGenRegisterInfo::getRegClass(RCID);
  }
}
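// For virtual registers the reaching definition is taken from the live
// interval (or the matching subrange) at the use's slot index; for physical
// registers each register unit's live range is inspected and the dominating
// definition is kept.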
// Find reaching register definition
MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
                                              MachineInstr &Use,
                                              MachineRegisterInfo &MRI,
                                              LiveIntervals *LIS) const {
  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
  SlotIndex DefIdx;

  if (Reg.isVirtual()) {
    if (!LIS->hasInterval(Reg))
      return nullptr;
    LiveInterval &LI = LIS->getInterval(Reg);
    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
                                  : MRI.getMaxLaneMaskForVReg(Reg);
    VNInfo *V = nullptr;
    if (LI.hasSubRanges()) {
      for (auto &S : LI.subranges()) {
        if ((S.LaneMask & SubLanes) == SubLanes) {
          V = S.getVNInfoAt(UseIdx);
          break;
        }
      }
    } else {
      V = LI.getVNInfoAt(UseIdx);
    }
    if (!V)
      return nullptr;
    DefIdx = V->def;
  } else {
    // For a physical register, inspect each register unit's live range.
    for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid();
         ++Units) {
      LiveRange &LR = LIS->getRegUnit(*Units);
      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
        if (!DefIdx.isValid() ||
            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
                          LIS->getInstructionFromIndex(V->def)))
          DefIdx = V->def;
      }
    }
  }

  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);

  if (!Def || !MDT.dominates(Def, &Use))
    return nullptr;

  assert(Def->modifiesRegister(Reg, this));

  return Def;
}
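// Given a 16-bit register, get32BitRegister returns the 32-bit register that
// contains it by probing the lo16 sub-register index in the VGPR, SGPR and
// AGPR classes, and the hi16 index for VGPRs.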
MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
  assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32);

  for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
                                         AMDGPU::SReg_32RegClass,
                                         AMDGPU::AGPR_32RegClass } ) {
    if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
      return Super;
  }
  if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
                                            &AMDGPU::VGPR_32RegClass)) {
    return Super;
  }

  return AMDGPU::NoRegister;
}
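// On subtargets that require even-aligned VGPR tuples (gfx90a variants), a
// class is properly aligned only if the aligned class for its size is one of
// its superclasses; otherwise any class is acceptable.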
bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
  if (!ST.needsAlignedVGPRs())
    return true;

  if (hasVGPRs(&RC))
    return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
  if (hasAGPRs(&RC))
    return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));

  return true;
}
bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
  switch (PhysReg) {
  case AMDGPU::SGPR_NULL:
  case AMDGPU::SRC_SHARED_BASE:
  case AMDGPU::SRC_PRIVATE_BASE:
  case AMDGPU::SRC_SHARED_LIMIT:
  case AMDGPU::SRC_PRIVATE_LIMIT:
    return true;
  default:
    return false;
  }
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 2);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));