[x86] fix assert with horizontal math + broadcast of vector (PR43402)
[llvm-core.git] / lib / Target / AMDGPU / GCNNSAReassign.cpp
blob36a8f74150f57c68615d7eee009ccf0cb68be937
1 //===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
11 /// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
12 /// with sequential versions where possible.
13 ///
14 //===----------------------------------------------------------------------===//
16 #include "AMDGPU.h"
17 #include "AMDGPUSubtarget.h"
18 #include "SIInstrInfo.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/LiveInterval.h"
22 #include "llvm/CodeGen/LiveIntervals.h"
23 #include "llvm/CodeGen/LiveRegMatrix.h"
24 #include "llvm/CodeGen/MachineFunctionPass.h"
25 #include "llvm/CodeGen/VirtRegMap.h"
26 #include "llvm/Support/MathExtras.h"
27 #include <algorithm>
29 using namespace llvm;
31 #define DEBUG_TYPE "amdgpu-nsa-reassign"
33 STATISTIC(NumNSAInstructions,
34 "Number of NSA instructions with non-sequential address found");
35 STATISTIC(NumNSAConverted,
36 "Number of NSA instructions changed to sequential");
38 namespace {
40 class GCNNSAReassign : public MachineFunctionPass {
41 public:
42 static char ID;
44 GCNNSAReassign() : MachineFunctionPass(ID) {
45 initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
48 bool runOnMachineFunction(MachineFunction &MF) override;
50 StringRef getPassName() const override { return "GCN NSA Reassign"; }
52 void getAnalysisUsage(AnalysisUsage &AU) const override {
53 AU.addRequired<LiveIntervals>();
54 AU.addRequired<VirtRegMap>();
55 AU.addRequired<LiveRegMatrix>();
56 AU.setPreservesAll();
57 MachineFunctionPass::getAnalysisUsage(AU);
60 private:
61 typedef enum {
62 NOT_NSA, // Not an NSA instruction
63 FIXED, // NSA which we cannot modify
64 NON_CONTIGUOUS, // NSA with non-sequential address which we can try
65 // to optimize.
66 CONTIGUOUS // NSA with all sequential address registers
67 } NSA_Status;
69 const GCNSubtarget *ST;
71 const MachineRegisterInfo *MRI;
73 const SIRegisterInfo *TRI;
75 VirtRegMap *VRM;
77 LiveRegMatrix *LRM;
79 LiveIntervals *LIS;
81 unsigned MaxNumVGPRs;
83 const MCPhysReg *CSRegs;
85 NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
87 bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
88 unsigned StartReg) const;
90 bool canAssign(unsigned StartReg, unsigned NumRegs) const;
92 bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
95 } // End anonymous namespace.
97 INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
98 false, false)
99 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
100 INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
101 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
102 INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
103 false, false)
106 char GCNNSAReassign::ID = 0;
108 char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
110 bool
111 GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
112 unsigned StartReg) const {
113 unsigned NumRegs = Intervals.size();
115 for (unsigned N = 0; N < NumRegs; ++N)
116 if (VRM->hasPhys(Intervals[N]->reg))
117 LRM->unassign(*Intervals[N]);
119 for (unsigned N = 0; N < NumRegs; ++N)
120 if (LRM->checkInterference(*Intervals[N], StartReg + N))
121 return false;
123 for (unsigned N = 0; N < NumRegs; ++N)
124 LRM->assign(*Intervals[N], StartReg + N);
126 return true;
129 bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
130 for (unsigned N = 0; N < NumRegs; ++N) {
131 unsigned Reg = StartReg + N;
132 if (!MRI->isAllocatable(Reg))
133 return false;
135 for (unsigned I = 0; CSRegs[I]; ++I)
136 if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
137 !LRM->isPhysRegUsed(CSRegs[I]))
138 return false;
141 return true;
144 bool
145 GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
146 unsigned NumRegs = Intervals.size();
148 if (NumRegs > MaxNumVGPRs)
149 return false;
150 unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
152 for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
153 if (!canAssign(Reg, NumRegs))
154 continue;
156 if (tryAssignRegisters(Intervals, Reg))
157 return true;
160 return false;
163 GCNNSAReassign::NSA_Status
164 GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
165 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
166 if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
167 return NSA_Status::NOT_NSA;
169 int VAddr0Idx =
170 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
172 unsigned VgprBase = 0;
173 bool NSA = false;
174 for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
175 const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
176 Register Reg = Op.getReg();
177 if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
178 return NSA_Status::FIXED;
180 Register PhysReg = VRM->getPhys(Reg);
182 if (!Fast) {
183 if (!PhysReg)
184 return NSA_Status::FIXED;
186 // Bail if address is not a VGPR32. That should be possible to extend the
187 // optimization to work with subregs of a wider register tuples, but the
188 // logic to find free registers will be much more complicated with much
189 // less chances for success. That seems reasonable to assume that in most
190 // cases a tuple is used because a vector variable contains different
191 // parts of an address and it is either already consequitive or cannot
192 // be reassigned if not. If needed it is better to rely on register
193 // coalescer to process such address tuples.
194 if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
195 return NSA_Status::FIXED;
197 const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
199 if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
200 return NSA_Status::FIXED;
202 for (auto U : MRI->use_nodbg_operands(Reg)) {
203 if (U.isImplicit())
204 return NSA_Status::FIXED;
205 const MachineInstr *UseInst = U.getParent();
206 if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
207 return NSA_Status::FIXED;
210 if (!LIS->hasInterval(Reg))
211 return NSA_Status::FIXED;
214 if (I == 0)
215 VgprBase = PhysReg;
216 else if (VgprBase + I != PhysReg)
217 NSA = true;
220 return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
223 bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
224 ST = &MF.getSubtarget<GCNSubtarget>();
225 if (ST->getGeneration() < GCNSubtarget::GFX10)
226 return false;
228 MRI = &MF.getRegInfo();
229 TRI = ST->getRegisterInfo();
230 VRM = &getAnalysis<VirtRegMap>();
231 LRM = &getAnalysis<LiveRegMatrix>();
232 LIS = &getAnalysis<LiveIntervals>();
234 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
235 MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
236 MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
237 CSRegs = MRI->getCalleeSavedRegs();
239 using Candidate = std::pair<const MachineInstr*, bool>;
240 SmallVector<Candidate, 32> Candidates;
241 for (const MachineBasicBlock &MBB : MF) {
242 for (const MachineInstr &MI : MBB) {
243 switch (CheckNSA(MI)) {
244 default:
245 continue;
246 case NSA_Status::CONTIGUOUS:
247 Candidates.push_back(std::make_pair(&MI, true));
248 break;
249 case NSA_Status::NON_CONTIGUOUS:
250 Candidates.push_back(std::make_pair(&MI, false));
251 ++NumNSAInstructions;
252 break;
257 bool Changed = false;
258 for (auto &C : Candidates) {
259 if (C.second)
260 continue;
262 const MachineInstr *MI = C.first;
263 if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
264 // Already happen to be fixed.
265 C.second = true;
266 ++NumNSAConverted;
267 continue;
270 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
271 int VAddr0Idx =
272 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);
274 SmallVector<LiveInterval *, 16> Intervals;
275 SmallVector<unsigned, 16> OrigRegs;
276 SlotIndex MinInd, MaxInd;
277 for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
278 const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
279 Register Reg = Op.getReg();
280 LiveInterval *LI = &LIS->getInterval(Reg);
281 if (llvm::find(Intervals, LI) != Intervals.end()) {
282 // Same register used, unable to make sequential
283 Intervals.clear();
284 break;
286 Intervals.push_back(LI);
287 OrigRegs.push_back(VRM->getPhys(Reg));
288 MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
289 MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
292 if (Intervals.empty())
293 continue;
295 LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
296 << "\tOriginal allocation:\t";
297 for(auto *LI : Intervals)
298 dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
299 dbgs() << '\n');
301 bool Success = scavengeRegs(Intervals);
302 if (!Success) {
303 LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
304 if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
305 continue;
306 } else {
307 // Check we did not make it worse for other instructions.
308 auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
309 [this](const Candidate &C, SlotIndex I) {
310 return LIS->getInstructionIndex(*C.first) < I;
312 for (auto E = Candidates.end(); Success && I != E &&
313 LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
314 if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
315 Success = false;
316 LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
321 if (!Success) {
322 for (unsigned I = 0; I < Info->VAddrDwords; ++I)
323 if (VRM->hasPhys(Intervals[I]->reg))
324 LRM->unassign(*Intervals[I]);
326 for (unsigned I = 0; I < Info->VAddrDwords; ++I)
327 LRM->assign(*Intervals[I], OrigRegs[I]);
329 continue;
332 C.second = true;
333 ++NumNSAConverted;
334 LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t ["
335 << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI)
336 << " : "
337 << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI)
338 << "]\n");
339 Changed = true;
342 return Changed;