//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
/// in NSA image instructions. Later the SIShrinkInstructions pass will replace
/// NSA with sequential versions where possible.
///
//===----------------------------------------------------------------------===//
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-nsa-reassign"
STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");
namespace {

class GCNNSAReassign : public MachineFunctionPass {
public:
  static char ID;

  GCNNSAReassign() : MachineFunctionPass(ID) {
    initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN NSA Reassign"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervalsWrapperPass>();
    AU.addRequired<VirtRegMap>();
    AU.addRequired<LiveRegMatrix>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
private:
  using NSA_Status = enum {
    NOT_NSA,        // Not an NSA instruction
    FIXED,          // NSA which we cannot modify
    NON_CONTIGUOUS, // NSA with non-sequential address which we can try
                    // to optimize
    CONTIGUOUS      // NSA with all sequential address registers
  };

  const GCNSubtarget *ST;
  const MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  VirtRegMap *VRM;
  LiveRegMatrix *LRM;
  LiveIntervals *LIS;

  unsigned MaxNumVGPRs;

  const MCPhysReg *CSRegs;

  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};

} // End anonymous namespace.
INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                    false, false)

char GCNNSAReassign::ID = 0;

char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
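
// Unassign every interval, then probe the contiguous physical range
// [StartReg, StartReg + NumRegs) for interference. Only if no interval
// interferes are they all committed to that range; on failure the intervals
// are left unassigned and the caller restores the original allocation.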
bool
GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                                   unsigned StartReg) const {
  unsigned NumRegs = Intervals.size();

  for (unsigned N = 0; N < NumRegs; ++N)
    if (VRM->hasPhys(Intervals[N]->reg()))
      LRM->unassign(*Intervals[N]);

  for (unsigned N = 0; N < NumRegs; ++N)
    if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
      return false;

  for (unsigned N = 0; N < NumRegs; ++N)
    LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));

  return true;
}
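
// A candidate start register is usable only if every register in the range is
// allocatable and does not overlap an unused callee-saved register, since
// touching an unused CSR would force the prologue/epilogue to save it.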
bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
  for (unsigned N = 0; N < NumRegs; ++N) {
    unsigned Reg = StartReg + N;
    if (!MRI->isAllocatable(Reg))
      return false;

    for (unsigned I = 0; CSRegs[I]; ++I)
      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
          !LRM->isPhysRegUsed(CSRegs[I]))
        return false;
  }

  return true;
}
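
// Scan the VGPR file from VGPR0 upwards for the first contiguous window that
// passes canAssign() and for which tryAssignRegisters() succeeds. MaxNumVGPRs
// caps the search so the new allocation stays within the occupancy-derived
// VGPR budget.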
bool
GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
  unsigned NumRegs = Intervals.size();

  if (NumRegs > MaxNumVGPRs)
    return false;
  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;

  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
    if (!canAssign(Reg, NumRegs))
      continue;

    if (tryAssignRegisters(Intervals, Reg))
      return true;
  }

  return false;
}
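
// Classify an instruction: not an NSA MIMG instruction at all, an NSA whose
// address operands cannot be touched, an NSA that is already contiguous, or a
// non-contiguous NSA worth optimizing. With Fast set only the contiguity of
// the current physical assignment is checked and the more expensive legality
// checks are skipped.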
GCNNSAReassign::NSA_Status
GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return NSA_Status::NOT_NSA;

  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
  case AMDGPU::MIMGEncGfx11NSA:
    break;
  default:
    return NSA_Status::NOT_NSA;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

  unsigned VgprBase = 0;
  bool NSA = false;
  for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    Register Reg = Op.getReg();
    if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    Register PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // TODO: address the limitation below to handle GFX11 BVH instructions.
      // Bail if the address is not a VGPR32. It should be possible to extend
      // the optimization to work with subregs of wider register tuples, but
      // the logic to find free registers would be much more complicated, with
      // much lower chances of success. It seems reasonable to assume that in
      // most cases a tuple is used because a vector variable contains the
      // different parts of an address, which is then either already
      // consecutive or cannot be reassigned if it is not. If needed, it is
      // better to rely on the register coalescer to process such address
      // tuples.
      if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 ||
          Op.getSubReg())
        return NSA_Status::FIXED;

      // InlineSpiller does not call LRM::assign() after an LI split, leaving
      // it in an inconsistent state, so we cannot call LRM::unassign().
      // See llvm bug #48911.
      // Skip reassignment if a register has originated from such a split.
      // FIXME: Remove the workaround when bug #48911 is fixed.
      if (VRM->getPreSplitReg(Reg))
        return NSA_Status::FIXED;

      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      for (auto U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}
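
// Driver: gather NSA candidates, then for each non-contiguous one try to move
// its address registers into a free contiguous VGPR range, backing the change
// out if it cannot be completed or would break an already-contiguous candidate
// whose live ranges overlap the reassigned window.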
bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();
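
  // Collect every NSA instruction up front. The bool records whether the
  // instruction's address is already contiguous; it is consulted later so a
  // reassignment for one candidate does not break another that was already,
  // or has just been made, contiguous.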
  using Candidate = std::pair<const MachineInstr *, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // Already happens to be contiguous, e.g. fixed up by an earlier
      // reassignment.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
        AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<MCRegister, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::is_contained(Intervals, LI)) {
        // Same register used, unable to make sequential.
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      if (LI->empty()) {
        // The address input is undef, so it doesn't contribute to the relevant
        // range. Seed a reasonable index range if required.
        if (I == 0)
          MinInd = MaxInd = LIS->getInstructionIndex(*MI);
        continue;
      }
      MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI : Intervals)
                 dbgs() << " "
                        << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
                                [this](const Candidate &C, SlotIndex I) {
                                  return LIS->getInstructionIndex(*C.first) < I;
                                });
      for (auto E = Candidates.end();
           Success && I != E &&
           LIS->getInstructionIndex(*I->first) < MaxInd;
           ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

    if (!Success) {
      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        if (VRM->hasPhys(Intervals[I]->reg()))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(
        dbgs() << "\tNew allocation:\t\t ["
               << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
               << " : "
               << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
               << "]\n");
    Changed = true;
  }

  return Changed;
}