1 //===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// \brief Try to reassign registers on GFX10+ to reduce register bank
13 /// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
14 /// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
15 /// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1,
16 /// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
18 /// The shader can read one dword from each of these banks once per cycle.
19 /// If an instruction has to read more register operands from the same bank
20 /// an additional cycle is needed. HW attempts to pre-load registers through
21 /// input operand gathering, but a stall cycle may occur if that fails. For
22 /// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
23 /// potentially incuring 2 stall cycles.
25 /// The pass tries to reassign registers to reduce bank conflicts.
27 /// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
28 /// that 4 has to be subtracted from an SGPR bank number to get the real value.
29 /// This also corresponds to bit numbers in bank masks used in the pass.
31 //===----------------------------------------------------------------------===//
34 #include "AMDGPUSubtarget.h"
35 #include "SIInstrInfo.h"
36 #include "SIMachineFunctionInfo.h"
37 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
38 #include "llvm/ADT/SmallSet.h"
39 #include "llvm/ADT/Statistic.h"
40 #include "llvm/CodeGen/LiveInterval.h"
41 #include "llvm/CodeGen/LiveIntervals.h"
42 #include "llvm/CodeGen/LiveRegMatrix.h"
43 #include "llvm/CodeGen/MachineFunctionPass.h"
44 #include "llvm/CodeGen/MachineLoopInfo.h"
45 #include "llvm/CodeGen/VirtRegMap.h"
46 #include "llvm/Support/MathExtras.h"
50 static cl::opt
<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
51 cl::desc("Verify stall cycles in the regbanks reassign pass"),
52 cl::value_desc("0|1|2"),
53 cl::init(0), cl::Hidden
);
55 #define DEBUG_TYPE "amdgpu-regbanks-reassign"
57 #define NUM_VGPR_BANKS 4
58 #define NUM_SGPR_BANKS 8
59 #define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
60 #define SGPR_BANK_OFFSET NUM_VGPR_BANKS
61 #define VGPR_BANK_MASK 0xf
62 #define SGPR_BANK_MASK 0xff0
63 #define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
65 STATISTIC(NumStallsDetected
,
66 "Number of operand read stalls detected");
67 STATISTIC(NumStallsRecovered
,
68 "Number of operand read stalls recovered");
72 class GCNRegBankReassign
: public MachineFunctionPass
{
76 OperandMask(unsigned r
, unsigned s
, unsigned m
)
77 : Reg(r
), SubReg(s
), Mask(m
) {}
85 Candidate(MachineInstr
*mi
, unsigned reg
, unsigned freebanks
,
87 : MI(mi
), Reg(reg
), FreeBanks(freebanks
), Weight(weight
) {}
89 bool operator< (const Candidate
& RHS
) const { return Weight
< RHS
.Weight
; }
91 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
92 void dump(const GCNRegBankReassign
*P
) const {
94 dbgs() << P
->printReg(Reg
) << " to banks ";
95 dumpFreeBanks(FreeBanks
);
96 dbgs() << " weight " << Weight
<< '\n';
106 class CandidateList
: public std::list
<Candidate
> {
108 // Speedup subsequent sort.
109 void push(const Candidate
&& C
) {
110 if (C
.Weight
) push_back(C
);
119 GCNRegBankReassign() : MachineFunctionPass(ID
) {
120 initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
123 bool runOnMachineFunction(MachineFunction
&MF
) override
;
125 StringRef
getPassName() const override
{ return "GCN RegBank Reassign"; }
127 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
128 AU
.addRequired
<MachineLoopInfo
>();
129 AU
.addRequired
<LiveIntervals
>();
130 AU
.addRequired
<VirtRegMap
>();
131 AU
.addRequired
<LiveRegMatrix
>();
132 AU
.setPreservesAll();
133 MachineFunctionPass::getAnalysisUsage(AU
);
137 const GCNSubtarget
*ST
;
139 const MachineRegisterInfo
*MRI
;
141 const SIRegisterInfo
*TRI
;
143 MachineLoopInfo
*MLI
;
151 unsigned MaxNumVGPRs
;
153 unsigned MaxNumSGPRs
;
157 SmallVector
<OperandMask
, 8> OperandMasks
;
159 CandidateList Candidates
;
161 const MCPhysReg
*CSRegs
;
163 // Returns bank for a phys reg.
164 unsigned getPhysRegBank(unsigned Reg
) const;
166 // Return a bit set for each register bank used. 4 banks for VGPRs and
167 // 8 banks for SGPRs.
168 // Registers already processed and recorded in RegsUsed are excluded.
169 // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
170 unsigned getRegBankMask(unsigned Reg
, unsigned SubReg
, int Bank
);
172 // Return number of stalls in the instructions.
173 // UsedBanks has bits set for the banks used by all operands.
174 // If Reg and Bank provided substitute the Reg with the Bank.
175 unsigned analyzeInst(const MachineInstr
& MI
, unsigned& UsedBanks
,
176 unsigned Reg
= AMDGPU::NoRegister
, int Bank
= -1);
178 // Return true if register is regular VGPR or SGPR or their tuples.
179 // Returns false for special registers like m0, vcc etc.
180 bool isReassignable(unsigned Reg
) const;
182 // Check if registers' defs are old and may be pre-loaded.
183 // Returns 0 if both registers are old enough, 1 or 2 if one or both
184 // registers will not likely be pre-loaded.
185 unsigned getOperandGatherWeight(const MachineInstr
& MI
,
188 unsigned StallCycles
) const;
191 // Find all bank bits in UsedBanks where Mask can be relocated to.
192 unsigned getFreeBanks(unsigned Mask
, unsigned UsedBanks
) const;
194 // Find all bank bits in UsedBanks where Mask can be relocated to.
195 // Bank is relative to the register and not its subregister component.
196 // Returns 0 is a register is not reassignable.
197 unsigned getFreeBanks(unsigned Reg
, unsigned SubReg
, unsigned Mask
,
198 unsigned UsedBanks
) const;
200 // Add cadidate instruction to the work list.
201 void collectCandidates(MachineInstr
& MI
, unsigned UsedBanks
,
202 unsigned StallCycles
);
204 // Collect cadidate instructions across function. Returns a number stall
205 // cycles detected. Only counts stalls if Collect is false.
206 unsigned collectCandidates(MachineFunction
&MF
, bool Collect
= true);
208 // Remove all candidates that read specified register.
209 void removeCandidates(unsigned Reg
);
211 // Compute stalls within the uses of SrcReg replaced by a register from
212 // Bank. If Bank is -1 does not perform substitution. If Collect is set
213 // candidates are collected and added to work list.
214 unsigned computeStallCycles(unsigned SrcReg
,
215 unsigned Reg
= AMDGPU::NoRegister
,
216 int Bank
= -1, bool Collect
= false);
218 // Search for a register in Bank unused within LI.
219 // Returns phys reg or NoRegister.
220 unsigned scavengeReg(LiveInterval
& LI
, unsigned Bank
) const;
222 // Try to reassign candidate. Returns number or stall cycles saved.
223 unsigned tryReassign(Candidate
&C
);
225 bool verifyCycles(MachineFunction
&MF
,
226 unsigned OriginalCycles
, unsigned CyclesSaved
);
229 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
231 Printable
printReg(unsigned Reg
, unsigned SubReg
= 0) const {
232 return Printable([Reg
, SubReg
, this](raw_ostream
&OS
) {
233 if (Register::isPhysicalRegister(Reg
)) {
234 OS
<< llvm::printReg(Reg
, TRI
);
237 if (!VRM
->isAssignedReg(Reg
))
238 OS
<< "<unassigned> " << llvm::printReg(Reg
, TRI
);
240 OS
<< llvm::printReg(Reg
, TRI
) << '('
241 << llvm::printReg(VRM
->getPhys(Reg
), TRI
) << ')';
243 OS
<< ':' << TRI
->getSubRegIndexName(SubReg
);
247 static Printable
printBank(unsigned Bank
) {
248 return Printable([Bank
](raw_ostream
&OS
) {
249 OS
<< ((Bank
>= SGPR_BANK_OFFSET
) ? Bank
- SGPR_BANK_OFFSET
: Bank
);
253 static void dumpFreeBanks(unsigned FreeBanks
) {
254 for (unsigned L
= 0; L
< NUM_BANKS
; ++L
)
255 if (FreeBanks
& (1 << L
))
256 dbgs() << printBank(L
) << ' ';
261 } // End anonymous namespace.
263 INITIALIZE_PASS_BEGIN(GCNRegBankReassign
, DEBUG_TYPE
, "GCN RegBank Reassign",
265 INITIALIZE_PASS_DEPENDENCY(LiveIntervals
)
266 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo
)
267 INITIALIZE_PASS_DEPENDENCY(VirtRegMap
)
268 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix
)
269 INITIALIZE_PASS_END(GCNRegBankReassign
, DEBUG_TYPE
, "GCN RegBank Reassign",
273 char GCNRegBankReassign::ID
= 0;
275 char &llvm::GCNRegBankReassignID
= GCNRegBankReassign::ID
;
277 unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg
) const {
278 assert(Register::isPhysicalRegister(Reg
));
280 const TargetRegisterClass
*RC
= TRI
->getMinimalPhysRegClass(Reg
);
281 unsigned Size
= TRI
->getRegSizeInBits(*RC
);
283 Reg
= TRI
->getSubReg(Reg
, AMDGPU::sub0
);
285 if (TRI
->hasVGPRs(RC
)) {
286 Reg
-= AMDGPU::VGPR0
;
287 return Reg
% NUM_VGPR_BANKS
;
290 Reg
= TRI
->getEncodingValue(Reg
) / 2;
291 return Reg
% NUM_SGPR_BANKS
+ SGPR_BANK_OFFSET
;
294 unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg
, unsigned SubReg
,
296 if (Register::isVirtualRegister(Reg
)) {
297 if (!VRM
->isAssignedReg(Reg
))
300 Reg
= VRM
->getPhys(Reg
);
304 Reg
= TRI
->getSubReg(Reg
, SubReg
);
307 const TargetRegisterClass
*RC
= TRI
->getMinimalPhysRegClass(Reg
);
308 unsigned Size
= TRI
->getRegSizeInBits(*RC
) / 32;
310 Reg
= TRI
->getSubReg(Reg
, AMDGPU::sub0
);
312 if (TRI
->hasVGPRs(RC
)) {
313 // VGPRs have 4 banks assigned in a round-robin fashion.
314 Reg
-= AMDGPU::VGPR0
;
315 unsigned Mask
= (1 << Size
) - 1;
317 // Bitmask lacks an extract method
318 for (unsigned I
= 0; I
< Size
; ++I
)
319 if (RegsUsed
.test(Reg
+ I
))
321 RegsUsed
.set(Reg
, Reg
+ Size
);
323 Mask
<<= (Bank
== -1) ? Reg
% NUM_VGPR_BANKS
: unsigned(Bank
);
324 return (Mask
| (Mask
>> NUM_VGPR_BANKS
)) & VGPR_BANK_MASK
;
327 // SGPRs have 8 banks holding 2 consequitive registers each.
328 Reg
= TRI
->getEncodingValue(Reg
) / 2;
329 unsigned StartBit
= AMDGPU::VGPR_32RegClass
.getNumRegs();
330 if (Reg
+ StartBit
>= RegsUsed
.size())
335 unsigned Mask
= (1 << Size
) - 1;
337 for (unsigned I
= 0; I
< Size
; ++I
)
338 if (RegsUsed
.test(StartBit
+ Reg
+ I
))
340 RegsUsed
.set(StartBit
+ Reg
, StartBit
+ Reg
+ Size
);
342 Mask
<<= (Bank
== -1) ? Reg
% NUM_SGPR_BANKS
343 : unsigned(Bank
- SGPR_BANK_OFFSET
);
344 Mask
= (Mask
| (Mask
>> NUM_SGPR_BANKS
)) & SGPR_BANK_SHIFTED_MASK
;
345 // Reserve 4 bank ids for VGPRs.
346 return Mask
<< SGPR_BANK_OFFSET
;
349 unsigned GCNRegBankReassign::analyzeInst(const MachineInstr
& MI
,
353 unsigned StallCycles
= 0;
356 if (MI
.isDebugValue())
360 OperandMasks
.clear();
361 for (const auto& Op
: MI
.explicit_uses()) {
362 // Undef can be assigned to any register, so two vregs can be assigned
363 // the same phys reg within the same instruction.
364 if (!Op
.isReg() || Op
.isUndef())
367 Register R
= Op
.getReg();
368 if (TRI
->hasAGPRs(TRI
->getRegClassForReg(*MRI
, R
)))
371 unsigned ShiftedBank
= Bank
;
373 if (Bank
!= -1 && R
== Reg
&& Op
.getSubReg()) {
374 unsigned LM
= TRI
->getSubRegIndexLaneMask(Op
.getSubReg()).getAsInteger();
375 if (!(LM
& 1) && (Bank
< NUM_VGPR_BANKS
)) {
376 // If a register spans all banks we cannot shift it to avoid conflict.
377 if (countPopulation(LM
) >= NUM_VGPR_BANKS
)
379 ShiftedBank
= (Bank
+ countTrailingZeros(LM
)) % NUM_VGPR_BANKS
;
380 } else if (!(LM
& 3) && (Bank
>= SGPR_BANK_OFFSET
)) {
381 // If a register spans all banks we cannot shift it to avoid conflict.
382 if (countPopulation(LM
) / 2 >= NUM_SGPR_BANKS
)
384 ShiftedBank
= SGPR_BANK_OFFSET
+ (Bank
- SGPR_BANK_OFFSET
+
385 (countTrailingZeros(LM
) >> 1)) %
390 unsigned Mask
= getRegBankMask(R
, Op
.getSubReg(),
391 (Reg
== R
) ? ShiftedBank
: -1);
392 StallCycles
+= countPopulation(UsedBanks
& Mask
);
394 OperandMasks
.push_back(OperandMask(Op
.getReg(), Op
.getSubReg(), Mask
));
400 unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr
& MI
,
403 unsigned StallCycles
) const
406 MachineBasicBlock::const_instr_iterator
Def(MI
.getIterator());
407 MachineBasicBlock::const_instr_iterator
B(MI
.getParent()->instr_begin());
408 for (unsigned S
= StallCycles
; S
&& Def
!= B
&& Defs
!= 3; --S
) {
409 if (MI
.isDebugInstr())
412 if (Def
->getOpcode() == TargetOpcode::IMPLICIT_DEF
)
414 if (Def
->modifiesRegister(Reg1
, TRI
))
416 if (Def
->modifiesRegister(Reg2
, TRI
))
419 return countPopulation(Defs
);
422 bool GCNRegBankReassign::isReassignable(unsigned Reg
) const {
423 if (Register::isPhysicalRegister(Reg
) || !VRM
->isAssignedReg(Reg
))
426 const MachineInstr
*Def
= MRI
->getUniqueVRegDef(Reg
);
428 Register PhysReg
= VRM
->getPhys(Reg
);
430 if (Def
&& Def
->isCopy() && Def
->getOperand(1).getReg() == PhysReg
)
433 for (auto U
: MRI
->use_nodbg_operands(Reg
)) {
436 const MachineInstr
*UseInst
= U
.getParent();
437 if (UseInst
->isCopy() && UseInst
->getOperand(0).getReg() == PhysReg
)
441 const TargetRegisterClass
*RC
= TRI
->getMinimalPhysRegClass(PhysReg
);
442 if (TRI
->hasVGPRs(RC
))
445 unsigned Size
= TRI
->getRegSizeInBits(*RC
);
447 PhysReg
= TRI
->getSubReg(PhysReg
, AMDGPU::sub0
);
449 return AMDGPU::SGPR_32RegClass
.contains(PhysReg
);
452 unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask
,
453 unsigned UsedBanks
) const {
454 unsigned Size
= countPopulation(Mask
);
455 unsigned FreeBanks
= 0;
456 unsigned Bank
= findFirstSet(Mask
);
460 // Find free VGPR banks
461 if ((Mask
& VGPR_BANK_MASK
) && (Size
< NUM_VGPR_BANKS
)) {
462 for (unsigned I
= 0; I
< NUM_VGPR_BANKS
; ++I
) {
465 unsigned NewMask
= ((1 << Size
) - 1) << I
;
466 NewMask
= (NewMask
| (NewMask
>> NUM_VGPR_BANKS
)) & VGPR_BANK_MASK
;
467 if (!(UsedBanks
& NewMask
))
473 // Find free SGPR banks
474 // SGPR tuples must be aligned, so step is size in banks it
476 Bank
-= SGPR_BANK_OFFSET
;
477 for (unsigned I
= 0; I
< NUM_SGPR_BANKS
; I
+= Size
) {
480 unsigned NewMask
= ((1 << Size
) - 1) << I
;
481 NewMask
= (NewMask
| (NewMask
>> NUM_SGPR_BANKS
)) & SGPR_BANK_SHIFTED_MASK
;
482 if (!(UsedBanks
& (NewMask
<< SGPR_BANK_OFFSET
)))
483 FreeBanks
|= (1 << SGPR_BANK_OFFSET
) << I
;
489 unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg
,
492 unsigned UsedBanks
) const {
493 if (!isReassignable(Reg
))
496 unsigned FreeBanks
= getFreeBanks(Mask
, UsedBanks
);
498 unsigned LM
= TRI
->getSubRegIndexLaneMask(SubReg
).getAsInteger();
499 if (!(LM
& 1) && (Mask
& VGPR_BANK_MASK
)) {
500 unsigned Shift
= countTrailingZeros(LM
);
501 if (Shift
>= NUM_VGPR_BANKS
)
503 unsigned VB
= FreeBanks
& VGPR_BANK_MASK
;
504 FreeBanks
= ((VB
>> Shift
) | (VB
<< (NUM_VGPR_BANKS
- Shift
))) &
506 } else if (!(LM
& 3) && (Mask
& SGPR_BANK_MASK
)) {
507 unsigned Shift
= countTrailingZeros(LM
) >> 1;
508 if (Shift
>= NUM_SGPR_BANKS
)
510 unsigned SB
= FreeBanks
>> SGPR_BANK_OFFSET
;
511 FreeBanks
= ((SB
>> Shift
) | (SB
<< (NUM_SGPR_BANKS
- Shift
))) &
512 SGPR_BANK_SHIFTED_MASK
;
513 FreeBanks
<<= SGPR_BANK_OFFSET
;
516 LLVM_DEBUG(if (FreeBanks
) {
517 dbgs() << "Potential reassignments of " << printReg(Reg
, SubReg
)
518 << " to banks: "; dumpFreeBanks(FreeBanks
);
524 void GCNRegBankReassign::collectCandidates(MachineInstr
& MI
,
526 unsigned StallCycles
) {
527 LLVM_DEBUG(MI
.dump());
532 LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles
<< '\n');
534 for (unsigned I
= 0, E
= OperandMasks
.size(); I
+ 1 < E
; ++I
) {
535 for (unsigned J
= I
+ 1; J
!= E
; ++J
) {
536 if (!(OperandMasks
[I
].Mask
& OperandMasks
[J
].Mask
))
539 unsigned Reg1
= OperandMasks
[I
].Reg
;
540 unsigned Reg2
= OperandMasks
[J
].Reg
;
541 unsigned SubReg1
= OperandMasks
[I
].SubReg
;
542 unsigned SubReg2
= OperandMasks
[J
].SubReg
;
543 unsigned Mask1
= OperandMasks
[I
].Mask
;
544 unsigned Mask2
= OperandMasks
[J
].Mask
;
545 unsigned Size1
= countPopulation(Mask1
);
546 unsigned Size2
= countPopulation(Mask2
);
548 LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1
, SubReg1
) <<
549 " and " << printReg(Reg2
, SubReg2
) << '\n');
551 unsigned Weight
= getOperandGatherWeight(MI
, Reg1
, Reg2
, StallCycles
);
552 Weight
+= MLI
->getLoopDepth(MI
.getParent()) * 10;
554 LLVM_DEBUG(dbgs() << "Stall weight = " << Weight
<< '\n');
556 unsigned FreeBanks1
= getFreeBanks(Reg1
, SubReg1
, Mask1
, UsedBanks
);
557 unsigned FreeBanks2
= getFreeBanks(Reg2
, SubReg2
, Mask2
, UsedBanks
);
559 Candidates
.push(Candidate(&MI
, Reg1
, FreeBanks1
, Weight
560 + ((Size2
> Size1
) ? 1 : 0)));
562 Candidates
.push(Candidate(&MI
, Reg2
, FreeBanks2
, Weight
563 + ((Size1
> Size2
) ? 1 : 0)));
568 unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg
,
569 unsigned Reg
, int Bank
,
571 unsigned TotalStallCycles
= 0;
572 unsigned UsedBanks
= 0;
573 SmallSet
<const MachineInstr
*, 16> Visited
;
575 for (auto &MI
: MRI
->use_nodbg_instructions(SrcReg
)) {
578 if (!Visited
.insert(&MI
).second
)
580 unsigned StallCycles
= analyzeInst(MI
, UsedBanks
, Reg
, Bank
);
581 TotalStallCycles
+= StallCycles
;
583 collectCandidates(MI
, UsedBanks
, StallCycles
);
586 return TotalStallCycles
;
589 unsigned GCNRegBankReassign::scavengeReg(LiveInterval
& LI
,
590 unsigned Bank
) const {
591 const TargetRegisterClass
*RC
= MRI
->getRegClass(LI
.reg
);
592 unsigned MaxNumRegs
= (Bank
< NUM_VGPR_BANKS
) ? MaxNumVGPRs
594 unsigned MaxReg
= MaxNumRegs
+ (Bank
< NUM_VGPR_BANKS
? AMDGPU::VGPR0
597 for (unsigned Reg
: RC
->getRegisters()) {
598 // Check occupancy limit.
599 if (TRI
->isSubRegisterEq(Reg
, MaxReg
))
602 if (!MRI
->isAllocatable(Reg
) || getPhysRegBank(Reg
) != Bank
)
605 for (unsigned I
= 0; CSRegs
[I
]; ++I
)
606 if (TRI
->isSubRegisterEq(Reg
, CSRegs
[I
]) &&
607 !LRM
->isPhysRegUsed(CSRegs
[I
]))
608 return AMDGPU::NoRegister
;
610 LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg
) << '\n');
612 if (!LRM
->checkInterference(LI
, Reg
))
616 return AMDGPU::NoRegister
;
619 unsigned GCNRegBankReassign::tryReassign(Candidate
&C
) {
620 if (!LIS
->hasInterval(C
.Reg
))
623 LiveInterval
&LI
= LIS
->getInterval(C
.Reg
);
624 LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C
.Reg
) << " in "; C
.MI
->dump();
627 // For each candidate bank walk all instructions in the range of live
628 // interval and check if replacing the register with one belonging to
629 // the candidate bank reduces conflicts.
631 unsigned OrigStalls
= computeStallCycles(C
.Reg
);
632 LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls
<< '\n');
637 BankStall(unsigned b
, unsigned s
) : Bank(b
), Stalls(s
) {};
638 bool operator< (const BankStall
&RHS
) const { return Stalls
> RHS
.Stalls
; }
642 SmallVector
<BankStall
, 8> BankStalls
;
644 for (int Bank
= 0; Bank
< NUM_BANKS
; ++Bank
) {
645 if (C
.FreeBanks
& (1 << Bank
)) {
646 LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank
) << '\n');
647 unsigned Stalls
= computeStallCycles(C
.Reg
, C
.Reg
, Bank
);
648 if (Stalls
< OrigStalls
) {
649 LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank
) << " -> "
651 BankStalls
.push_back(BankStall((unsigned)Bank
, Stalls
));
655 std::sort(BankStalls
.begin(), BankStalls
.end());
657 Register OrigReg
= VRM
->getPhys(C
.Reg
);
659 while (!BankStalls
.empty()) {
660 BankStall BS
= BankStalls
.pop_back_val();
661 unsigned Reg
= scavengeReg(LI
, BS
.Bank
);
662 if (Reg
== AMDGPU::NoRegister
) {
663 LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS
.Bank
)
667 LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg
)
668 << (LRM
->isPhysRegUsed(Reg
) ? "" : " (new)")
669 << " in bank " << printBank(BS
.Bank
) << '\n');
671 LRM
->assign(LI
, Reg
);
673 LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls
- BS
.Stalls
<< '\n');
675 return OrigStalls
- BS
.Stalls
;
677 LRM
->assign(LI
, OrigReg
);
682 unsigned GCNRegBankReassign::collectCandidates(MachineFunction
&MF
,
684 unsigned TotalStallCycles
= 0;
686 for (MachineBasicBlock
&MBB
: MF
) {
688 LLVM_DEBUG(if (Collect
) {
689 if (MBB
.getName().empty()) dbgs() << "bb." << MBB
.getNumber();
690 else dbgs() << MBB
.getName(); dbgs() << ":\n";
693 for (MachineInstr
&MI
: MBB
.instrs()) {
695 continue; // we analyze the instructions inside the bundle individually
697 unsigned UsedBanks
= 0;
698 unsigned StallCycles
= analyzeInst(MI
, UsedBanks
);
701 collectCandidates(MI
, UsedBanks
, StallCycles
);
703 TotalStallCycles
+= StallCycles
;
706 LLVM_DEBUG(if (Collect
) { dbgs() << '\n'; });
709 return TotalStallCycles
;
712 void GCNRegBankReassign::removeCandidates(unsigned Reg
) {
713 Candidates
.remove_if([Reg
, this](const Candidate
& C
) {
714 return C
.MI
->readsRegister(Reg
, TRI
);
718 bool GCNRegBankReassign::verifyCycles(MachineFunction
&MF
,
719 unsigned OriginalCycles
,
720 unsigned CyclesSaved
) {
721 unsigned StallCycles
= collectCandidates(MF
, false);
722 LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
723 << " stall cycles left\n");
724 return StallCycles
+ CyclesSaved
== OriginalCycles
;
727 bool GCNRegBankReassign::runOnMachineFunction(MachineFunction
&MF
) {
728 ST
= &MF
.getSubtarget
<GCNSubtarget
>();
729 if (!ST
->hasRegisterBanking() || skipFunction(MF
.getFunction()))
732 MRI
= &MF
.getRegInfo();
733 TRI
= ST
->getRegisterInfo();
734 MLI
= &getAnalysis
<MachineLoopInfo
>();
735 VRM
= &getAnalysis
<VirtRegMap
>();
736 LRM
= &getAnalysis
<LiveRegMatrix
>();
737 LIS
= &getAnalysis
<LiveIntervals
>();
739 const SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
740 unsigned Occupancy
= MFI
->getOccupancy();
741 MaxNumVGPRs
= ST
->getMaxNumVGPRs(MF
);
742 MaxNumSGPRs
= ST
->getMaxNumSGPRs(MF
);
743 MaxNumVGPRs
= std::min(ST
->getMaxNumVGPRs(Occupancy
), MaxNumVGPRs
);
744 MaxNumSGPRs
= std::min(ST
->getMaxNumSGPRs(Occupancy
, true), MaxNumSGPRs
);
746 CSRegs
= MRI
->getCalleeSavedRegs();
748 RegsUsed
.resize(AMDGPU::VGPR_32RegClass
.getNumRegs() +
749 TRI
->getEncodingValue(AMDGPU::SGPR_NULL
) / 2 + 1);
751 LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF
.getName()
754 unsigned StallCycles
= collectCandidates(MF
);
755 NumStallsDetected
+= StallCycles
;
757 LLVM_DEBUG(dbgs() << "=== " << StallCycles
<< " stall cycles detected in "
758 "function " << MF
.getName() << '\n');
762 LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
763 for (auto C
: Candidates
) C
.dump(this);
766 unsigned CyclesSaved
= 0;
767 while (!Candidates
.empty()) {
768 Candidate C
= Candidates
.back();
769 unsigned LocalCyclesSaved
= tryReassign(C
);
770 CyclesSaved
+= LocalCyclesSaved
;
772 if (VerifyStallCycles
> 1 && !verifyCycles(MF
, StallCycles
, CyclesSaved
))
773 report_fatal_error("RegBank reassign stall cycles verification failed.");
775 Candidates
.pop_back();
776 if (LocalCyclesSaved
) {
777 removeCandidates(C
.Reg
);
778 computeStallCycles(C
.Reg
, AMDGPU::NoRegister
, -1, true);
781 LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
782 for (auto C
: Candidates
)
787 NumStallsRecovered
+= CyclesSaved
;
789 LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
790 << " cycles saved in function " << MF
.getName() << '\n');
794 if (VerifyStallCycles
== 1 && !verifyCycles(MF
, StallCycles
, CyclesSaved
))
795 report_fatal_error("RegBank reassign stall cycles verification failed.");
799 return CyclesSaved
> 0;