llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp

   1 //===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 /// \file
/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
/// in NSA image instructions. The later SIShrinkInstructions pass will then
/// replace NSA instructions with their sequential versions where possible.
  13 ///
  14 //===----------------------------------------------------------------------===//
  15
  16 #include "AMDGPU.h"
  17 #include "GCNSubtarget.h"
  18 #include "SIMachineFunctionInfo.h"
  19 #include "SIRegisterInfo.h"
  20 #include "llvm/ADT/Statistic.h"
  21 #include "llvm/CodeGen/LiveIntervals.h"
  22 #include "llvm/CodeGen/LiveRegMatrix.h"
  23 #include "llvm/CodeGen/MachineFunctionPass.h"
  24 #include "llvm/CodeGen/VirtRegMap.h"
  25 #include "llvm/InitializePasses.h"
  26
  27 using namespace llvm;
  28
  29 #define DEBUG_TYPE "amdgpu-nsa-reassign"
  30
  31 STATISTIC(NumNSAInstructions,
  32           "Number of NSA instructions with non-sequential address found");
  33 STATISTIC(NumNSAConverted,
  34           "Number of NSA instructions changed to sequential");
  35
  36 namespace {
  37
  38 class GCNNSAReassign : public MachineFunctionPass {
  39 public:
  40   static char ID;
  41
  42   GCNNSAReassign() : MachineFunctionPass(ID) {
  43     initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
  44   }
  45
  46   bool runOnMachineFunction(MachineFunction &MF) override;
  47
  48   StringRef getPassName() const override { return "GCN NSA Reassign"; }
  49
  50   void getAnalysisUsage(AnalysisUsage &AU) const override {
  51     AU.addRequired<LiveIntervalsWrapperPass>();
  52     AU.addRequired<VirtRegMapWrapperLegacy>();
  53     AU.addRequired<LiveRegMatrixWrapperLegacy>();
  54     AU.setPreservesAll();
  55     MachineFunctionPass::getAnalysisUsage(AU);
  56   }
  57
  58 private:
  59   using NSA_Status = enum {
  60     NOT_NSA,        // Not an NSA instruction
  61     FIXED,          // NSA which we cannot modify
  62     NON_CONTIGUOUS, // NSA with non-sequential address which we can try
  63                     // to optimize.
  64     CONTIGUOUS      // NSA with all sequential address registers
  65   };
  66
  67   const GCNSubtarget *ST;
  68
  69   const MachineRegisterInfo *MRI;
  70
  71   const SIRegisterInfo *TRI;
  72
  73   VirtRegMap *VRM;
  74
  75   LiveRegMatrix *LRM;
  76
  77   LiveIntervals *LIS;
  78
  79   unsigned MaxNumVGPRs;
  80
  81   const MCPhysReg *CSRegs;
  82
  83   NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
  84
  85   bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
  86                           unsigned StartReg) const;
  87
  88   bool canAssign(unsigned StartReg, unsigned NumRegs) const;
  89
  90   bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
  91 };
  92
  93 } // End anonymous namespace.
  94
  95 INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
  96                       false, false)
  97 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
  98 INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
  99 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
 100 INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
 101                     false, false)
 102
 103
 104 char GCNNSAReassign::ID = 0;
 105
 106 char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
 107
 108 bool
 109 GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
 110                                    unsigned StartReg) const {
 111   unsigned NumRegs = Intervals.size();
 112
 113   for (unsigned N = 0; N < NumRegs; ++N)
 114     if (VRM->hasPhys(Intervals[N]->reg()))
 115       LRM->unassign(*Intervals[N]);
 116
 117   for (unsigned N = 0; N < NumRegs; ++N)
 118     if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
 119       return false;
 120
 121   for (unsigned N = 0; N < NumRegs; ++N)
 122     LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));
 123
 124   return true;
 125 }
 126
 127 bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
 128   for (unsigned N = 0; N < NumRegs; ++N) {
 129     unsigned Reg = StartReg + N;
 130     if (!MRI->isAllocatable(Reg))
 131       return false;
 132
 133     for (unsigned I = 0; CSRegs[I]; ++I)
 134       if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
 135           !LRM->isPhysRegUsed(CSRegs[I]))
 136       return false;
 137   }
 138
 139   return true;
 140 }
 141
 142 bool
 143 GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
 144   unsigned NumRegs = Intervals.size();
 145
 146   if (NumRegs > MaxNumVGPRs)
 147     return false;
 148   unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
 149
 150   for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
 151     if (!canAssign(Reg, NumRegs))
 152       continue;
 153
 154     if (tryAssignRegisters(Intervals, Reg))
 155       return true;
 156   }
 157
 158   return false;
 159 }
 160
 161 GCNNSAReassign::NSA_Status
 162 GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
 163   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
 164   if (!Info)
 165     return NSA_Status::NOT_NSA;
 166
 167   switch (Info->MIMGEncoding) {
 168   case AMDGPU::MIMGEncGfx10NSA:
 169   case AMDGPU::MIMGEncGfx11NSA:
 170     break;
 171   default:
 172     return NSA_Status::NOT_NSA;
 173   }
 174
 175   int VAddr0Idx =
 176     AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
 177
 178   unsigned VgprBase = 0;
 179   bool NSA = false;
 180   for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
 181     const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
 182     Register Reg = Op.getReg();
 183     if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
 184       return NSA_Status::FIXED;
 185
 186     Register PhysReg = VRM->getPhys(Reg);
 187
 188     if (!Fast) {
 189       if (!PhysReg)
 190         return NSA_Status::FIXED;
 191
 192       // TODO: address the below limitation to handle GFX11 BVH instructions
 193       // Bail if address is not a VGPR32. That should be possible to extend the
 194       // optimization to work with subregs of a wider register tuples, but the
 195       // logic to find free registers will be much more complicated with much
 196       // less chances for success. That seems reasonable to assume that in most
 197       // cases a tuple is used because a vector variable contains different
 198       // parts of an address and it is either already consecutive or cannot
 199       // be reassigned if not. If needed it is better to rely on register
 200       // coalescer to process such address tuples.
 201       if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 || Op.getSubReg())
 202         return NSA_Status::FIXED;
 203
 204       // InlineSpiller does not call LRM::assign() after an LI split leaving
 205       // it in an inconsistent state, so we cannot call LRM::unassign().
 206       // See llvm bug #48911.
 207       // Skip reassign if a register has originated from such split.
 208       // FIXME: Remove the workaround when bug #48911 is fixed.
 209       if (VRM->getPreSplitReg(Reg))
 210         return NSA_Status::FIXED;
 211
 212       const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
 213
 214       if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
 215         return NSA_Status::FIXED;
 216
 217       for (auto U : MRI->use_nodbg_operands(Reg)) {
 218         if (U.isImplicit())
 219           return NSA_Status::FIXED;
 220         const MachineInstr *UseInst = U.getParent();
 221         if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
 222           return NSA_Status::FIXED;
 223       }
 224
 225       if (!LIS->hasInterval(Reg))
 226         return NSA_Status::FIXED;
 227     }
 228
 229     if (I == 0)
 230       VgprBase = PhysReg;
 231     else if (VgprBase + I != PhysReg)
 232       NSA = true;
 233   }
 234
 235   return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
 236 }
 237
 238 bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
 239   ST = &MF.getSubtarget<GCNSubtarget>();
 240   if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding())
 241     return false;
 242
 243   MRI = &MF.getRegInfo();
 244   TRI = ST->getRegisterInfo();
 245   VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
 246   LRM = &getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
 247   LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
 248
 249   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 250   MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
 251   MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
 252   CSRegs = MRI->getCalleeSavedRegs();
 253
 254   using Candidate = std::pair<const MachineInstr*, bool>;
 255   SmallVector<Candidate, 32> Candidates;
 256   for (const MachineBasicBlock &MBB : MF) {
 257     for (const MachineInstr &MI : MBB) {
 258       switch (CheckNSA(MI)) {
 259       default:
 260         continue;
 261       case NSA_Status::CONTIGUOUS:
 262         Candidates.push_back(std::pair(&MI, true));
 263         break;
 264       case NSA_Status::NON_CONTIGUOUS:
 265         Candidates.push_back(std::pair(&MI, false));
 266         ++NumNSAInstructions;
 267         break;
 268       }
 269     }
 270   }
 271
 272   bool Changed = false;
 273   for (auto &C : Candidates) {
 274     if (C.second)
 275       continue;
 276
 277     const MachineInstr *MI = C.first;
 278     if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
 279       // Already happen to be fixed.
 280       C.second = true;
 281       ++NumNSAConverted;
 282       continue;
 283     }
 284
 285     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
 286     int VAddr0Idx =
 287       AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);
 288
 289     SmallVector<LiveInterval *, 16> Intervals;
 290     SmallVector<MCRegister, 16> OrigRegs;
 291     SlotIndex MinInd, MaxInd;
 292     for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
 293       const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
 294       Register Reg = Op.getReg();
 295       LiveInterval *LI = &LIS->getInterval(Reg);
 296       if (llvm::is_contained(Intervals, LI)) {
 297         // Same register used, unable to make sequential
 298         Intervals.clear();
 299         break;
 300       }
 301       Intervals.push_back(LI);
 302       OrigRegs.push_back(VRM->getPhys(Reg));
 303       if (LI->empty()) {
 304         // The address input is undef, so it doesn't contribute to the relevant
 305         // range. Seed a reasonable index range if required.
 306         if (I == 0)
 307           MinInd = MaxInd = LIS->getInstructionIndex(*MI);
 308         continue;
 309       }
 310       MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
 311       MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
 312     }
 313
 314     if (Intervals.empty())
 315       continue;
 316
 317     LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
 318                       << "\tOriginal allocation:\t";
 319                for (auto *LI
 320                     : Intervals) dbgs()
 321                << " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
 322                dbgs() << '\n');
 323
 324     bool Success = scavengeRegs(Intervals);
 325     if (!Success) {
 326       LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
 327       if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
 328         continue;
 329     } else {
 330       // Check we did not make it worse for other instructions.
 331       auto *I =
 332           std::lower_bound(Candidates.begin(), &C, MinInd,
 333                            [this](const Candidate &C, SlotIndex I) {
 334                              return LIS->getInstructionIndex(*C.first) < I;
 335                            });
 336       for (auto *E = Candidates.end();
 337            Success && I != E && LIS->getInstructionIndex(*I->first) < MaxInd;
 338            ++I) {
 339         if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
 340           Success = false;
 341           LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
 342         }
 343       }
 344     }
 345
 346     if (!Success) {
 347       for (unsigned I = 0; I < Info->VAddrOperands; ++I)
 348         if (VRM->hasPhys(Intervals[I]->reg()))
 349           LRM->unassign(*Intervals[I]);
 350
 351       for (unsigned I = 0; I < Info->VAddrOperands; ++I)
 352         LRM->assign(*Intervals[I], OrigRegs[I]);
 353
 354       continue;
 355     }
 356
 357     C.second = true;
 358     ++NumNSAConverted;
 359     LLVM_DEBUG(
 360         dbgs() << "\tNew allocation:\t\t ["
 361                << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
 362                << " : "
 363                << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
 364                << "]\n");
 365     Changed = true;
 366   }
 367
 368   return Changed;
 369 }