//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
/// in NSA image instructions. Later the SIShrinkInstructions pass will replace
/// NSA with sequential versions where possible.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-nsa-reassign"

STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");

namespace {

class GCNNSAReassign : public MachineFunctionPass {
public:
  static char ID;

  GCNNSAReassign() : MachineFunctionPass(ID) {
    initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN NSA Reassign"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addRequired<VirtRegMap>();
    AU.addRequired<LiveRegMatrix>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  typedef enum {
    NOT_NSA,        // Not an NSA instruction
    FIXED,          // NSA which we cannot modify
    NON_CONTIGUOUS, // NSA with non-sequential address which we can try
                    // to optimize
    CONTIGUOUS      // NSA with all sequential address registers
  } NSA_Status;

  const GCNSubtarget *ST;

  const MachineRegisterInfo *MRI;

  const SIRegisterInfo *TRI;

  VirtRegMap *VRM;

  LiveRegMatrix *LRM;

  LiveIntervals *LIS;

  unsigned MaxNumVGPRs;

  const MCPhysReg *CSRegs;

  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                    false, false)

char GCNNSAReassign::ID = 0;

char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
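
// Attempt to place all Intervals into the contiguous physical range starting
// at StartReg: unassign any physical registers currently held, bail out if any
// interval would interfere with its target register, then assign the intervals
// sequentially.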
bool
GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                                   unsigned StartReg) const {
  unsigned NumRegs = Intervals.size();

  for (unsigned N = 0; N < NumRegs; ++N)
    if (VRM->hasPhys(Intervals[N]->reg))
      LRM->unassign(*Intervals[N]);

  for (unsigned N = 0; N < NumRegs; ++N)
    if (LRM->checkInterference(*Intervals[N], StartReg + N))
      return false;

  for (unsigned N = 0; N < NumRegs; ++N)
    LRM->assign(*Intervals[N], StartReg + N);

  return true;
}
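
// Check whether a contiguous run of NumRegs physical registers starting at
// StartReg is usable: every register must be allocatable, and no otherwise
// unused callee-saved register may be touched, since this pass cannot arrange
// for it to be saved and restored.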
bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
  for (unsigned N = 0; N < NumRegs; ++N) {
    unsigned Reg = StartReg + N;
    if (!MRI->isAllocatable(Reg))
      return false;

    for (unsigned I = 0; CSRegs[I]; ++I)
      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
          !LRM->isPhysRegUsed(CSRegs[I]))
        return false;
  }

  return true;
}
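
// Linearly scan the VGPR file for a contiguous block large enough to hold all
// Intervals and try to reassign them there. Returns true on success; on
// failure the intervals may have been left unassigned by tryAssignRegisters.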
bool
GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
  unsigned NumRegs = Intervals.size();

  if (NumRegs > MaxNumVGPRs)
    return false;
  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;

  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
    if (!canAssign(Reg, NumRegs))
      continue;

    if (tryAssignRegisters(Intervals, Reg))
      return true;
  }

  return false;
}
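
// Classify an instruction: not an NSA encoding at all (NOT_NSA), an NSA whose
// address registers cannot be changed (FIXED), an NSA with a non-contiguous
// but potentially fixable address (NON_CONTIGUOUS), or one whose address
// registers are already sequential (CONTIGUOUS). With Fast set, only the
// current physical assignment is inspected and the more expensive legality
// checks are skipped.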
GCNNSAReassign::NSA_Status
GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return NSA_Status::NOT_NSA;

  int VAddr0Idx =
    AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

  unsigned VgprBase = 0;
  bool NSA = false;
  for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    Register Reg = Op.getReg();
    if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    Register PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // Bail if the address is not a VGPR32. It should be possible to extend
      // the optimization to work with subregs of wider register tuples, but
      // the logic to find free registers would be much more complicated with
      // much less chance of success. It seems reasonable to assume that in
      // most cases a tuple is used because a vector variable contains
      // different parts of an address and it is either already consecutive or
      // cannot be reassigned if not. If needed, it is better to rely on the
      // register coalescer to process such address tuples.
      if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
        return NSA_Status::FIXED;

      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      for (auto U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}
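
// Top-level driver: collect NSA candidates, then try to rewrite the register
// assignment of each non-contiguous candidate so its address operands land in
// consecutive VGPRs, rolling back whenever reallocation fails or would break
// an already-contiguous candidate.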
bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (ST->getGeneration() < GCNSubtarget::GFX10)
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervals>();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();
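
  // Collect every NSA instruction in the function, remembering whether its
  // address operands are already contiguous (true) or still need fixing
  // (false).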
  using Candidate = std::pair<const MachineInstr *, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }
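
  // Try to reallocate the address registers of each non-contiguous candidate.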
  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // Already happens to be fixed.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<unsigned, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::find(Intervals, LI) != Intervals.end()) {
        // Same register used, unable to make sequential
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI : Intervals)
                 dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
                                [this](const Candidate &C, SlotIndex I) {
                                  return LIS->getInstructionIndex(*C.first) < I;
                                });
      for (auto E = Candidates.end(); Success && I != E &&
           LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

    if (!Success) {
      // Restore the original allocation before moving on.
      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        if (VRM->hasPhys(Intervals[I]->reg))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t ["
                << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI)
                << " : "
                << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI)
                << "]\n");
    Changed = true;
  }

  return Changed;
}