//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "SIRegisterInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
  for (unsigned i = 0; PSets[i] != -1; ++i) {
    if (PSets[i] == (int)PSetID)
      return true;
  }
  return false;
}
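// Set the bit for PSetID in PressureSets if any register unit of Reg belongs
// to that pressure set.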
void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
                                         BitVector &PressureSets) const {
  for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
    const int *PSets = getRegUnitPressureSets(*U);
    if (hasPressureSet(PSets, PSetID)) {
      PressureSets.set(PSetID);
      break;
    }
  }
}
static cl::opt<bool> EnableSpillSGPRToSMEM(
  "amdgpu-spill-sgpr-to-smem",
  cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
  cl::init(false));

static cl::opt<bool> EnableSpillSGPRToVGPR(
  "amdgpu-spill-sgpr-to-vgpr",
  cl::desc("Enable spilling SGPRs to VGPRs"),
  cl::ReadOnly,
  cl::init(true));
SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
  AMDGPURegisterInfo(),
  SGPRPressureSets(getNumRegPressureSets()),
  VGPRPressureSets(getNumRegPressureSets()),
  AGPRPressureSets(getNumRegPressureSets()),
  SpillSGPRToVGPR(false),
  SpillSGPRToSMEM(false),
  isWave32(ST.isWave32()) {
  if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
    SpillSGPRToSMEM = true;
  else if (EnableSpillSGPRToVGPR)
    SpillSGPRToVGPR = true;

  unsigned NumRegPressureSets = getNumRegPressureSets();

  SGPRSetID = NumRegPressureSets;
  VGPRSetID = NumRegPressureSets;
  AGPRSetID = NumRegPressureSets;

  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
    classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
    classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
    classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets);
  }

  // Determine the number of reg units for each pressure set.
  std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0);
  for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
    const int *PSets = getRegUnitPressureSets(i);
    for (unsigned j = 0; PSets[j] != -1; ++j) {
      ++PressureSetRegUnits[PSets[j]];
    }
  }

  unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0;
  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
    if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
      VGPRSetID = i;
      VGPRMax = PressureSetRegUnits[i];
      continue;
    }
    if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
      SGPRSetID = i;
      SGPRMax = PressureSetRegUnits[i];
    }
    if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) {
      AGPRSetID = i;
      AGPRMax = PressureSetRegUnits[i];
    }
  }

  assert(SGPRSetID < NumRegPressureSets &&
         VGPRSetID < NumRegPressureSets &&
         AGPRSetID < NumRegPressureSets);
}
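// The scratch resource descriptor occupies four consecutive SGPRs, so carve
// it out of the highest SGPRs available to the function, aligned down to a
// multiple of four.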
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
  const MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
  unsigned Reg;

  // Try to place it in a hole after PrivateSegmentBufferReg.
  if (RegCount & 3) {
    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
    // alignment constraints, so we have a hole where we can put the wave
    // offset.
    Reg = RegCount - 1;
  } else {
    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
    // wave offset before it.
    Reg = RegCount - 5;
  }

  return Reg;
}
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
  const MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
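// Registers reserved here are invisible to the allocator: architectural
// registers with no codegen support, registers above the subtarget limits,
// and registers set aside for scratch access and spilling.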
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());

  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated.
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);

  // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
  // will result in bugs.
  if (isWave32) {
    Reserved.set(AMDGPU::VCC);
    Reserved.set(AMDGPU::VCC_HI);
  }

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
    Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
    // Reserve 1 SGPR for scratch wave offset in case we need to spill.
    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
  }

  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
  }

  // We have to assume the SP is needed in case there are calls in the
  // function, which is detected after the function is lowered. If we aren't
  // really going to need SP, don't bother reserving it.
  unsigned StackPtrReg = MFI->getStackPtrOffsetReg();

  if (StackPtrReg != AMDGPU::NoRegister) {
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
  }

  unsigned FrameReg = MFI->getFrameOffsetReg();
  if (FrameReg != AMDGPU::NoRegister) {
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

  for (unsigned Reg : MFI->WWMReservedRegs) {
    reserveRegisterTuples(Reserved, Reg);
  }

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  return Reserved;
}
bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry, the base address is 0, so it can't possibly need any more
  // alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isEntryFunction())
    return false;

  return TargetRegisterInfo::canRealignStack(MF);
}
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}
bool SIRegisterInfo::requiresFrameIndexScavenging(
  const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  if (MFI.hasStackObjects())
    return true;

  // May need to deal with callee saved registers.
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return !Info->isEntryFunction();
}
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
  const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.hasStackObjects())
    return false;

  // The scavenger is used for large frames which may require finding a free
  // register for large offsets.
  if (!isUInt<12>(MFI.getStackSize()))
    return true;

  // If using scalar stores, for spills, m0 is needed for the scalar store
  // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual
  // register for it during frame index elimination, so the scavenger is
  // directly needed.
  return MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
         MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
}
bool SIRegisterInfo::requiresVirtualBaseRegisters(
  const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}
bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
  // This helps catch bugs as verifier errors.
  return true;
}
int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}
int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI))
    return 0;

  assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::vaddr) &&
         "Should never see frame index on non-address operand");

  return getMUBUFInstrOffset(MI);
}
bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  if (!MI->mayLoadOrStore())
    return false;

  int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);

  return !isUInt<12>(FullOffset);
}
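// Materialize FrameIdx + Offset into BaseReg at the top of MBB: a plain
// v_mov when Offset is zero, otherwise an s_mov of the offset folded into a
// VGPR add.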
void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                  unsigned BaseReg,
                                                  int FrameIdx,
                                                  int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = Subtarget.getInstrInfo();

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
      .addFrameIndex(FrameIdx);
    return;
  }

  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
    .addFrameIndex(FrameIdx);

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(FIReg)
    .addImm(0); // clamp bit
}
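// Fold a base register produced by materializeFrameBaseRegister back into a
// MUBUF access: vaddr becomes BaseReg and Offset is added to the immediate
// offset operand.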
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
                                       int64_t Offset) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = Subtarget.getInstrInfo();

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI));
  assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
         MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
         "should only be seeing frame offset relative FrameIndex");

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;
  assert(isUInt<12>(NewOffset) && "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        unsigned BaseReg,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI))
    return false;

  int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);

  return isUInt<12>(NewOffset);
}
const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
  const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}
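// Map a spill pseudo opcode to the number of 32-bit subregisters it covers,
// e.g. SI_SPILL_S128_SAVE covers four.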
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}
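// Rewrite an OFFEN MUBUF store opcode to its OFFSET form, which needs no
// VGPR address; returns -1 if no such form exists.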
static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}
static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}
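// Try to satisfy a VGPR spill (or reload) with an AGPR copy instead of a
// memory access. Returns a null MachineInstrBuilder if no AGPR slot was
// assigned for this frame index and lane.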
static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
                                           int Index,
                                           int Lane,
                                           unsigned ValueReg,
                                           bool IsKill) {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
                                                   : AMDGPU::V_ACCVGPR_READ_B32;

  return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
           .addReg(Src, getKillRegState(IsKill));
}
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // glc
          .addImm(0) // slc
          .addImm(0) // tfe
          .addImm(0) // dlc
          .cloneMemRefs(*MI);

  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.addReg(VDataIn->getReg(), RegState::Implicit);
  return true;
}
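// Expand a wide VGPR spill or reload into one 4-byte MUBUF access per 32-bit
// subregister. If the final immediate would not fit in the 12-bit MUBUF
// offset field, the base offset is carried in a scavenged SGPR, or folded
// into (and later subtracted from) the scratch offset register as a last
// resort.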
void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
                                         unsigned LoadStoreOp,
                                         int Index,
                                         unsigned ValueReg,
                                         bool IsKill,
                                         unsigned ScratchRsrcReg,
                                         unsigned ScratchOffsetReg,
                                         int64_t InstOffset,
                                         MachineMemOperand *MMO,
                                         RegScavenger *RS) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();

  const MCInstrDesc &Desc = TII->get(LoadStoreOp);
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = Desc.mayStore();

  bool Scavenged = false;
  unsigned SOffset = ScratchOffsetReg;

  const unsigned EltSize = 4;
  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
  unsigned Size = NumSubRegs * EltSize;
  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
  int64_t ScratchOffsetRegDelta = 0;

  unsigned Align = MFI.getObjectAlignment(Index);
  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();

  Register TmpReg =
    hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg()
                 : Register();

  assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");

  if (!isUInt<12>(Offset + Size - EltSize)) {
    SOffset = AMDGPU::NoRegister;

    // We currently only support spilling VGPRs to EltSize boundaries, meaning
    // we can simplify the adjustment of Offset here to just scale with
    // WavefrontSize.
    Offset *= ST.getWavefrontSize();

    // We don't have access to the register scavenger if this function is
    // called during PEI::scavengeFrameVirtualRegs().
    if (RS)
      SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);

    if (SOffset == AMDGPU::NoRegister) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset register, and then
      // subtract the offset after the spill to return ScratchOffset to its
      // original value.
      SOffset = ScratchOffsetReg;
      ScratchOffsetRegDelta = Offset;
    } else {
      Scavenged = true;
    }

    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
      .addReg(ScratchOffsetReg)
      .addImm(Offset);

    Offset = 0;
  }

  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
    unsigned SubReg = NumSubRegs == 1 ?
      ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    if (i + 1 == e) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);

    if (!MIB.getInstr()) {
      unsigned FinalReg = SubReg;
      if (TmpReg != AMDGPU::NoRegister) {
        if (IsStore)
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
            .addReg(SubReg, getKillRegState(IsKill));
        SubReg = TmpReg;
      }

      MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
      MachineMemOperand *NewMMO
        = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
                                   EltSize, MinAlign(Align, EltSize * i));

      MIB = BuildMI(*MBB, MI, DL, Desc)
        .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
        .addReg(ScratchRsrcReg)
        .addReg(SOffset, SOffsetRegState)
        .addImm(Offset)
        .addImm(0) // glc
        .addImm(0) // slc
        .addImm(0) // tfe
        .addImm(0) // dlc
        .addMemOperand(NewMMO);

      if (!IsStore && TmpReg != AMDGPU::NoRegister)
        MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
                      FinalReg)
          .addReg(TmpReg, RegState::Kill);
    }

    if (NumSubRegs > 1)
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
  }

  if (ScratchOffsetRegDelta != 0) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
      .addReg(ScratchOffsetReg)
      .addImm(ScratchOffsetRegDelta);
  }
}
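// Pick the widest scalar buffer load/store that evenly divides the
// super-register, returning {element size in bytes, opcode}.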
static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
                                                     bool Store) {
  if (SuperRegSize % 16 == 0) {
    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
                         AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
  }

  if (SuperRegSize % 8 == 0) {
    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
                        AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
  }

  return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
                      AMDGPU::S_BUFFER_LOAD_DWORD_SGPR };
}
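// Lower an SI_SPILL_S*_SAVE pseudo. Three strategies, in order of
// preference: a scalar store to scratch (SMEM), v_writelane into a
// pre-allocated VGPR lane, or a real stack store through a temporary VGPR.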
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS,
                               bool OnlyToVGPR) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  DenseSet<unsigned> SGPRSpillVGPRDefinedSet;

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
    = MFI->getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  MachineRegisterInfo &MRI = MF->getRegInfo();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned SuperReg = MI->getOperand(0).getReg();
  bool IsKill = MI->getOperand(0).isKill();
  const DebugLoc &DL = MI->getDebugLoc();

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();

  bool SpillToSMEM = spillSGPRToSMEM();
  if (SpillToSMEM && OnlyToVGPR)
    return false;

  Register FrameReg = getFrameRegister(*MF);

  assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
                         SuperReg != MFI->getFrameOffsetReg() &&
                         SuperReg != MFI->getScratchWaveOffsetReg()));

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  unsigned OffsetReg = AMDGPU::M0;
  unsigned M0CopyReg = AMDGPU::NoRegister;

  if (SpillToSMEM) {
    if (RS->isRegUsed(AMDGPU::M0)) {
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
        .addReg(AMDGPU::M0);
    }
  }

  unsigned ScalarStoreOp;
  unsigned EltSize = 4;
  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
  if (SpillToSMEM && isSGPRClass(RC)) {
    // XXX - if private_element_size is larger than 4 it might be useful to be
    // able to spill wider vmem spills.
    std::tie(EltSize, ScalarStoreOp) =
          getSpillEltSize(getRegSizeInBits(*RC) / 8, true);
  }

  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned SubReg = NumSubRegs == 1 ?
      SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToSMEM) {
      int64_t FrOffset = FrameInfo.getObjectOffset(Index);

      // The allocated memory size is really the wavefront size * the frame
      // index size. The widest register class is 64 bytes, so a 4-byte scratch
      // allocation is enough to spill this in a single stack object.
      //
      // FIXME: Frame size/offsets are computed earlier than this, so the extra
      // space is still unnecessarily allocated.

      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));

      // SMEM instructions only support a single offset, so increment the wave
      // offset.

      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(FrameReg)
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(FrameReg);
      }

      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
        .addReg(SubReg, getKillRegState(IsKill)) // sdata
        .addReg(MFI->getScratchRSrcReg())        // sbase
        .addReg(OffsetReg, RegState::Kill)       // soff
        .addImm(0)                               // glc
        .addImm(0)                               // dlc
        .addMemOperand(MMO);

      continue;
    }

    if (SpillToVGPR) {
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];

      // During SGPR spilling to VGPR, determine if the VGPR is defined. The
      // only circumstance in which we say it is undefined is when it is the
      // first spill to this VGPR in the first basic block.
      bool VGPRDefined = true;
      if (MBB == &MF->front())
        VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;

      // Mark the "old value of vgpr" input undef only if this is the first sgpr
      // spill to this specific vgpr in the first basic block.
      BuildMI(*MBB, MI, DL,
              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
              Spill.VGPR)
        .addReg(SubReg, getKillRegState(IsKill))
        .addImm(Spill.Lane)
        .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);

      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
    } else {
      // XXX - Can the spill to a VGPR fail for some subregisters but not
      // others?
      if (OnlyToVGPR)
        return false;

      // Spill SGPR to a frame index.
      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      MachineInstrBuilder Mov
        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
        .addReg(SubReg, SubKillState);

      // There could be undef components of a spilled super register.
      // TODO: Can we detect this and skip the spill?
      if (NumSubRegs > 1) {
        // The last implicit use of the SuperReg carries the "Kill" flag.
        unsigned SuperKillState = 0;
        if (i + 1 == e)
          SuperKillState |= getKillRegState(IsKill);
        Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
      }

      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
        .addReg(TmpReg, RegState::Kill)      // src
        .addFrameIndex(Index)                // vaddr
        .addReg(MFI->getScratchRSrcReg())    // srsrc
        .addReg(MFI->getStackPtrOffsetReg()) // soffset
        .addImm(i * 4)                       // offset
        .addMemOperand(MMO);
    }
  }

  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
  MFI->addToSpilledSGPRs(NumSubRegs);
  return true;
}
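// Lower an SI_SPILL_S*_RESTORE pseudo, mirroring spillSGPR: a scalar load,
// v_readlane from the spill VGPR, or a stack load followed by
// v_readfirstlane.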
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS,
                                 bool OnlyToVGPR) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
    = MFI->getSGPRToVGPRSpills(Index);
  bool SpillToVGPR = !VGPRSpills.empty();
  if (OnlyToVGPR && !SpillToVGPR)
    return false;

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = MI->getDebugLoc();

  unsigned SuperReg = MI->getOperand(0).getReg();
  bool SpillToSMEM = spillSGPRToSMEM();
  if (SpillToSMEM && OnlyToVGPR)
    return false;

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  unsigned OffsetReg = AMDGPU::M0;
  unsigned M0CopyReg = AMDGPU::NoRegister;

  if (SpillToSMEM) {
    if (RS->isRegUsed(AMDGPU::M0)) {
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
        .addReg(AMDGPU::M0);
    }
  }

  unsigned EltSize = 4;
  unsigned ScalarLoadOp;

  Register FrameReg = getFrameRegister(*MF);

  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
  if (SpillToSMEM && isSGPRClass(RC)) {
    // XXX - if private_element_size is larger than 4 it might be useful to be
    // able to spill wider vmem spills.
    std::tie(EltSize, ScalarLoadOp) =
          getSpillEltSize(getRegSizeInBits(*RC) / 8, false);
  }

  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  int64_t FrOffset = FrameInfo.getObjectOffset(Index);

  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned SubReg = NumSubRegs == 1 ?
      SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToSMEM) {
      // FIXME: Size may be > 4 but extra bytes wasted.
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                   EltSize, MinAlign(Align, EltSize * i));

      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(FrameReg)
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(FrameReg);
      }

      auto MIB =
        BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
        .addReg(MFI->getScratchRSrcReg())  // sbase
        .addReg(OffsetReg, RegState::Kill) // soff
        .addImm(0)                         // glc
        .addImm(0)                         // dlc
        .addMemOperand(MMO);

      if (NumSubRegs > 1 && i == 0)
        MIB.addReg(SuperReg, RegState::ImplicitDefine);

      continue;
    }

    if (SpillToVGPR) {
      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
      auto MIB =
        BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
                SubReg)
        .addReg(Spill.VGPR)
        .addImm(Spill.Lane);

      if (NumSubRegs > 1 && i == 0)
        MIB.addReg(SuperReg, RegState::ImplicitDefine);
    } else {
      if (OnlyToVGPR)
        return false;

      // Restore SGPR from a stack slot.
      // FIXME: We should use S_LOAD_DWORD here for VI.
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned Align = FrameInfo.getObjectAlignment(Index);

      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);

      MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
        MachineMemOperand::MOLoad, EltSize,
        MinAlign(Align, EltSize * i));

      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
        .addFrameIndex(Index)                // vaddr
        .addReg(MFI->getScratchRSrcReg())    // srsrc
        .addReg(MFI->getStackPtrOffsetReg()) // soffset
        .addImm(i * 4)                       // offset
        .addMemOperand(MMO);

      auto MIB =
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
        .addReg(TmpReg, RegState::Kill);

      if (NumSubRegs > 1)
        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
    }
  }

  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
  return true;
}
/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
/// a VGPR and the stack slot can be safely eliminated when all other users are
/// handled.
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
  MachineBasicBlock::iterator MI,
  int FI,
  RegScavenger *RS) const {
  switch (MI->getOpcode()) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S32_SAVE:
    return spillSGPR(MI, FI, RS, true);
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
    return restoreSGPR(MI, FI, RS, true);
  default:
    llvm_unreachable("not an SGPR spill instruction");
  }
}
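// Rewrite a frame index operand into real registers and offsets: spill
// pseudos are expanded, MUBUF accesses get their immediate offset folded,
// and anything else receives a materialized absolute address.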
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                         int SPAdj, unsigned FIOperandNum,
                                         RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  Register FrameReg = getFrameRegister(*MF);

  switch (MI->getOpcode()) {
    // SGPR register spill
    case AMDGPU::SI_SPILL_S1024_SAVE:
    case AMDGPU::SI_SPILL_S512_SAVE:
    case AMDGPU::SI_SPILL_S256_SAVE:
    case AMDGPU::SI_SPILL_S160_SAVE:
    case AMDGPU::SI_SPILL_S128_SAVE:
    case AMDGPU::SI_SPILL_S96_SAVE:
    case AMDGPU::SI_SPILL_S64_SAVE:
    case AMDGPU::SI_SPILL_S32_SAVE: {
      spillSGPR(MI, Index, RS);
      break;
    }

    // SGPR register restore
    case AMDGPU::SI_SPILL_S1024_RESTORE:
    case AMDGPU::SI_SPILL_S512_RESTORE:
    case AMDGPU::SI_SPILL_S256_RESTORE:
    case AMDGPU::SI_SPILL_S160_RESTORE:
    case AMDGPU::SI_SPILL_S128_RESTORE:
    case AMDGPU::SI_SPILL_S96_RESTORE:
    case AMDGPU::SI_SPILL_S64_RESTORE:
    case AMDGPU::SI_SPILL_S32_RESTORE: {
      restoreSGPR(MI, Index, RS);
      break;
    }

    // VGPR register spill
    case AMDGPU::SI_SPILL_V1024_SAVE:
    case AMDGPU::SI_SPILL_V512_SAVE:
    case AMDGPU::SI_SPILL_V256_SAVE:
    case AMDGPU::SI_SPILL_V160_SAVE:
    case AMDGPU::SI_SPILL_V128_SAVE:
    case AMDGPU::SI_SPILL_V96_SAVE:
    case AMDGPU::SI_SPILL_V64_SAVE:
    case AMDGPU::SI_SPILL_V32_SAVE:
    case AMDGPU::SI_SPILL_A1024_SAVE:
    case AMDGPU::SI_SPILL_A512_SAVE:
    case AMDGPU::SI_SPILL_A128_SAVE:
    case AMDGPU::SI_SPILL_A64_SAVE:
    case AMDGPU::SI_SPILL_A32_SAVE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);
      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
             MFI->getStackPtrOffsetReg());

      buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
            Index,
            VData->getReg(), VData->isKill(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
            FrameReg,
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
            *MI->memoperands_begin(),
            RS);
      MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
      MI->eraseFromParent();
      break;
    }
    case AMDGPU::SI_SPILL_V32_RESTORE:
    case AMDGPU::SI_SPILL_V64_RESTORE:
    case AMDGPU::SI_SPILL_V96_RESTORE:
    case AMDGPU::SI_SPILL_V128_RESTORE:
    case AMDGPU::SI_SPILL_V160_RESTORE:
    case AMDGPU::SI_SPILL_V256_RESTORE:
    case AMDGPU::SI_SPILL_V512_RESTORE:
    case AMDGPU::SI_SPILL_V1024_RESTORE:
    case AMDGPU::SI_SPILL_A32_RESTORE:
    case AMDGPU::SI_SPILL_A64_RESTORE:
    case AMDGPU::SI_SPILL_A128_RESTORE:
    case AMDGPU::SI_SPILL_A512_RESTORE:
    case AMDGPU::SI_SPILL_A1024_RESTORE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);
      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
             MFI->getStackPtrOffsetReg());

      buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
            Index,
            VData->getReg(), VData->isKill(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
            FrameReg,
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
            *MI->memoperands_begin(),
            RS);
      MI->eraseFromParent();
      break;
    }

    default: {
      const DebugLoc &DL = MI->getDebugLoc();
      bool IsMUBUF = TII->isMUBUF(*MI);

      if (!IsMUBUF && !MFI->isEntryFunction()) {
        // Convert to an absolute stack address by finding the offset from the
        // scratch wave base and scaling by the wave size.
        //
        // In an entry function/kernel the offset is already the absolute
        // address relative to the frame register.

        Register DiffReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

        bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
        Register ResultReg = IsCopy ?
          MI->getOperand(0).getReg() :
          MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
          .addReg(FrameReg)
          .addReg(MFI->getScratchWaveOffsetReg());

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        if (Offset == 0) {
          // XXX - This never happens because of emergency scavenging slot at 0?
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
            .addImm(Log2_32(ST.getWavefrontSize()))
            .addReg(DiffReg);
        } else {
          Register ScaledReg
            = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
            .addImm(Log2_32(ST.getWavefrontSize()))
            .addReg(DiffReg, RegState::Kill);

          // TODO: Fold if use instruction is another add of a constant.
          if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
            TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
              .addImm(Offset)
              .addReg(ScaledReg, RegState::Kill)
              .addImm(0); // clamp bit
          } else {
            unsigned ConstOffsetReg
              = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
              .addImm(Offset);
            TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
              .addReg(ConstOffsetReg, RegState::Kill)
              .addReg(ScaledReg, RegState::Kill)
              .addImm(0); // clamp bit
          }
        }

        // Don't introduce an extra copy if we're just materializing in a mov.
        if (IsCopy)
          MI->eraseFromParent();
        else
          FIOp.ChangeToRegister(ResultReg, false, false, true);
        return;
      }

      if (IsMUBUF) {
        // Disable offen so we don't need a 0 vgpr base.
        assert(static_cast<int>(FIOperandNum) ==
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::vaddr));

        assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
               MFI->getStackPtrOffsetReg());

        TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg);

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        int64_t OldImm
          = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
        int64_t NewOffset = OldImm + Offset;

        if (isUInt<12>(NewOffset) &&
            buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
          MI->eraseFromParent();
          return;
        }
      }

      // If the offset is simply too big, don't convert to a scratch wave
      // offset relative index.

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
        unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }
}
StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
  return AMDGPUInstPrinter::getRegisterName(Reg);
}
// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
  assert(!TargetRegisterInfo::isVirtualRegister(Reg));

  static const TargetRegisterClass *const BaseClasses[] = {
    &AMDGPU::VGPR_32RegClass,
    &AMDGPU::SReg_32RegClass,
    &AMDGPU::AGPR_32RegClass,
    &AMDGPU::VReg_64RegClass,
    &AMDGPU::SReg_64RegClass,
    &AMDGPU::AReg_64RegClass,
    &AMDGPU::VReg_96RegClass,
    &AMDGPU::SReg_96RegClass,
    &AMDGPU::VReg_128RegClass,
    &AMDGPU::SReg_128RegClass,
    &AMDGPU::AReg_128RegClass,
    &AMDGPU::VReg_160RegClass,
    &AMDGPU::SReg_160RegClass,
    &AMDGPU::VReg_256RegClass,
    &AMDGPU::SReg_256RegClass,
    &AMDGPU::VReg_512RegClass,
    &AMDGPU::SReg_512RegClass,
    &AMDGPU::AReg_512RegClass,
    &AMDGPU::SReg_1024RegClass,
    &AMDGPU::VReg_1024RegClass,
    &AMDGPU::AReg_1024RegClass,
    &AMDGPU::SCC_CLASSRegClass,
    &AMDGPU::Pseudo_SReg_32RegClass,
    &AMDGPU::Pseudo_SReg_128RegClass,
  };

  for (const TargetRegisterClass *BaseClass : BaseClasses) {
    if (BaseClass->contains(Reg)) {
      return BaseClass;
    }
  }
  return nullptr;
}
// TODO: It might be helpful to have some target specific flags in
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
  if (Size < 32)
    return false;
  switch (Size) {
  case 32:
    return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
  case 64:
    return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
  case 96:
    return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
  case 128:
    return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
  case 160:
    return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr;
  case 256:
    return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
  case 512:
    return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
  case 1024:
    return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
  default:
    llvm_unreachable("Invalid register class size");
  }
}
bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
  unsigned Size = getRegSizeInBits(*RC);
  if (Size < 32)
    return false;
  switch (Size) {
  case 32:
    return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr;
  case 64:
    return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr;
  case 96:
    return false;
  case 128:
    return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr;
  case 160:
  case 256:
    return false;
  case 512:
    return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr;
  case 1024:
    return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr;
  default:
    llvm_unreachable("Invalid register class size");
  }
}
const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
                                          const TargetRegisterClass *SRC) const {
  switch (getRegSizeInBits(*SRC)) {
  case 32:
    return &AMDGPU::VGPR_32RegClass;
  case 64:
    return &AMDGPU::VReg_64RegClass;
  case 96:
    return &AMDGPU::VReg_96RegClass;
  case 128:
    return &AMDGPU::VReg_128RegClass;
  case 160:
    return &AMDGPU::VReg_160RegClass;
  case 256:
    return &AMDGPU::VReg_256RegClass;
  case 512:
    return &AMDGPU::VReg_512RegClass;
  case 1024:
    return &AMDGPU::VReg_1024RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}
const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass(
                                          const TargetRegisterClass *SRC) const {
  switch (getRegSizeInBits(*SRC)) {
  case 32:
    return &AMDGPU::AGPR_32RegClass;
  case 64:
    return &AMDGPU::AReg_64RegClass;
  case 128:
    return &AMDGPU::AReg_128RegClass;
  case 512:
    return &AMDGPU::AReg_512RegClass;
  case 1024:
    return &AMDGPU::AReg_1024RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}
const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
                                           const TargetRegisterClass *VRC) const {
  switch (getRegSizeInBits(*VRC)) {
  case 32:
    return &AMDGPU::SGPR_32RegClass;
  case 64:
    return &AMDGPU::SReg_64RegClass;
  case 96:
    return &AMDGPU::SReg_96RegClass;
  case 128:
    return &AMDGPU::SReg_128RegClass;
  case 160:
    return &AMDGPU::SReg_160RegClass;
  case 256:
    return &AMDGPU::SReg_256RegClass;
  case 512:
    return &AMDGPU::SReg_512RegClass;
  case 1024:
    return &AMDGPU::SReg_1024RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
                         const TargetRegisterClass *RC, unsigned SubIdx) const {
  if (SubIdx == AMDGPU::NoSubRegister)
    return RC;

  // We can assume that each lane corresponds to one 32-bit register.
  unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes();
  if (isSGPRClass(RC)) {
    switch (Count) {
    case 1:
      return &AMDGPU::SGPR_32RegClass;
    case 2:
      return &AMDGPU::SReg_64RegClass;
    case 3:
      return &AMDGPU::SReg_96RegClass;
    case 4:
      return &AMDGPU::SReg_128RegClass;
    case 5:
      return &AMDGPU::SReg_160RegClass;
    case 8:
      return &AMDGPU::SReg_256RegClass;
    case 16:
      return &AMDGPU::SReg_512RegClass;
    case 32: /* fall-through */
    default:
      llvm_unreachable("Invalid sub-register class size");
    }
  } else if (hasAGPRs(RC)) {
    switch (Count) {
    case 1:
      return &AMDGPU::AGPR_32RegClass;
    case 2:
      return &AMDGPU::AReg_64RegClass;
    case 4:
      return &AMDGPU::AReg_128RegClass;
    case 16:
      return &AMDGPU::AReg_512RegClass;
    case 32: /* fall-through */
    default:
      llvm_unreachable("Invalid sub-register class size");
    }
  } else {
    switch (Count) {
    case 1:
      return &AMDGPU::VGPR_32RegClass;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    case 5:
      return &AMDGPU::VReg_160RegClass;
    case 8:
      return &AMDGPU::VReg_256RegClass;
    case 16:
      return &AMDGPU::VReg_512RegClass;
    case 32: /* fall-through */
    default:
      llvm_unreachable("Invalid sub-register class size");
    }
  }
}
bool SIRegisterInfo::shouldRewriteCopySrc(
  const TargetRegisterClass *DefRC,
  unsigned DefSubReg,
  const TargetRegisterClass *SrcRC,
  unsigned SrcSubReg) const {
  // We want to prefer the smallest register class possible, so we don't want to
  // stop and rewrite on anything that looks like a subregister
  // extract. Operations mostly don't care about the super register class, so we
  // only want to stop on the most basic of copies between the same register
  // class.
  //
  // e.g. if we have something like
  // %0 = ...
  // %1 = ...
  // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
  // %3 = COPY %2, sub0
  //
  // We want to look through the COPY to find:
  //  %3 = COPY %0

  // Plain copy.
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
}
/// Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
/// AMDGPU::NoRegister.
unsigned
SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
                                   const TargetRegisterClass *RC,
                                   const MachineFunction &MF) const {
  for (unsigned Reg : *RC)
    if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
      return Reg;
  return AMDGPU::NoRegister;
}
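// Return the sub-register indices that split RC into EltSize-byte pieces
// (EltSize is 4, 8, 16 or 32). An empty result means the class is a single
// piece; callers treat that as one subregister.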
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                   unsigned EltSize) const {
  if (EltSize == 4) {
    static const int16_t Sub0_31[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
      AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
      AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
      AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
      AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31,
    };

    static const int16_t Sub0_15[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
    };

    static const int16_t Sub0_7[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    };

    static const int16_t Sub0_4[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
    };

    static const int16_t Sub0_3[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    };

    static const int16_t Sub0_2[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
    };

    static const int16_t Sub0_1[] = {
      AMDGPU::sub0, AMDGPU::sub1,
    };

    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
    case 32:
      return {};
    case 64:
      return makeArrayRef(Sub0_1);
    case 96:
      return makeArrayRef(Sub0_2);
    case 128:
      return makeArrayRef(Sub0_3);
    case 160:
      return makeArrayRef(Sub0_4);
    case 256:
      return makeArrayRef(Sub0_7);
    case 512:
      return makeArrayRef(Sub0_15);
    case 1024:
      return makeArrayRef(Sub0_31);
    default:
      llvm_unreachable("unhandled register size");
    }
  }

  if (EltSize == 8) {
    static const int16_t Sub0_31_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
      AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
      AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
      AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
      AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
    };

    static const int16_t Sub0_15_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15
    };

    static const int16_t Sub0_7_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
    };

    static const int16_t Sub0_3_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
    };

    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
    case 64:
      return {};
    case 128:
      return makeArrayRef(Sub0_3_64);
    case 256:
      return makeArrayRef(Sub0_7_64);
    case 512:
      return makeArrayRef(Sub0_15_64);
    case 1024:
      return makeArrayRef(Sub0_31_64);
    default:
      llvm_unreachable("unhandled register size");
    }
  }

  if (EltSize == 16) {
    static const int16_t Sub0_31_128[] = {
      AMDGPU::sub0_sub1_sub2_sub3,
      AMDGPU::sub4_sub5_sub6_sub7,
      AMDGPU::sub8_sub9_sub10_sub11,
      AMDGPU::sub12_sub13_sub14_sub15,
      AMDGPU::sub16_sub17_sub18_sub19,
      AMDGPU::sub20_sub21_sub22_sub23,
      AMDGPU::sub24_sub25_sub26_sub27,
      AMDGPU::sub28_sub29_sub30_sub31
    };

    static const int16_t Sub0_15_128[] = {
      AMDGPU::sub0_sub1_sub2_sub3,
      AMDGPU::sub4_sub5_sub6_sub7,
      AMDGPU::sub8_sub9_sub10_sub11,
      AMDGPU::sub12_sub13_sub14_sub15
    };

    static const int16_t Sub0_7_128[] = {
      AMDGPU::sub0_sub1_sub2_sub3,
      AMDGPU::sub4_sub5_sub6_sub7
    };

    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
    case 128:
      return {};
    case 256:
      return makeArrayRef(Sub0_7_128);
    case 512:
      return makeArrayRef(Sub0_15_128);
    case 1024:
      return makeArrayRef(Sub0_31_128);
    default:
      llvm_unreachable("unhandled register size");
    }
  }

  assert(EltSize == 32 && "unhandled elt size");

  static const int16_t Sub0_31_256[] = {
    AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
    AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
    AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23,
    AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
  };

  static const int16_t Sub0_15_256[] = {
    AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
    AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15
  };

  switch (AMDGPU::getRegBitWidth(*RC->MC)) {
  case 256:
    return {};
  case 512:
    return makeArrayRef(Sub0_15_256);
  case 1024:
    return makeArrayRef(Sub0_31_256);
  default:
    llvm_unreachable("unhandled register size");
  }
}
const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
                                  unsigned Reg) const {
  if (TargetRegisterInfo::isVirtualRegister(Reg))
    return MRI.getRegClass(Reg);

  return getPhysRegClass(Reg);
}
bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                            unsigned Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
  assert(RC && "Register class for the reg not found");
  return hasVGPRs(RC);
}

bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
                            unsigned Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
  assert(RC && "Register class for the reg not found");
  return hasAGPRs(RC);
}
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
                                    const TargetRegisterClass *SrcRC,
                                    unsigned SubReg,
                                    const TargetRegisterClass *DstRC,
                                    unsigned DstSubReg,
                                    const TargetRegisterClass *NewRC,
                                    LiveIntervals &LIS) const {
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
  unsigned DstSize = getRegSizeInBits(*DstRC);
  unsigned NewSize = getRegSizeInBits(*NewRC);

  // Do not increase size of registers beyond dword, we would need to allocate
  // adjacent registers and constrain regalloc more than needed.

  // Always allow dword coalescing.
  if (SrcSize <= 32 || DstSize <= 32)
    return true;

  return NewSize <= DstSize || NewSize <= SrcSize;
}
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                             MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
                                                       MF.getFunction());
  switch (RC->getID()) {
  default:
    return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
  case AMDGPU::VGPR_32RegClassID:
    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
  case AMDGPU::SGPR_32RegClassID:
    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
  }
}
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                unsigned Idx) const {
  if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet())
    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  if (Idx == getSGPRPressureSet())
    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
}
const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
  static const int Empty[] = { -1 };

  if (hasRegUnit(AMDGPU::M0, RegUnit))
    return Empty;
  return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
}
unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
  // Not a callee saved register.
  return AMDGPU::SGPR30_SGPR31;
}
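// Size is in bits. Size-1 values are conditions, whose class depends on the
// owning bank (VCC, SCC or SGPR); larger sizes simply select between the
// VGPR and SGPR class of matching width.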
const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
                                         const RegisterBank &RB,
                                         const MachineRegisterInfo &MRI) const {
  switch (Size) {
  case 1: {
    switch (RB.getID()) {
    case AMDGPU::VGPRRegBankID:
      return &AMDGPU::VGPR_32RegClass;
    case AMDGPU::VCCRegBankID:
      return isWave32 ?
        &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
    case AMDGPU::SGPRRegBankID:
      return &AMDGPU::SReg_32_XM0RegClass;
    case AMDGPU::SCCRegBankID:
      // This needs to return an allocatable class, so don't bother returning
      // the dummy SCC class.
      return &AMDGPU::SReg_32_XM0RegClass;
    default:
      llvm_unreachable("unknown register bank");
    }
  }
  case 32:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
                                                 &AMDGPU::SReg_32_XM0RegClass;
  case 64:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
                                                 &AMDGPU::SReg_64_XEXECRegClass;
  case 96:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
                                                 &AMDGPU::SReg_96RegClass;
  case 128:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
                                                 &AMDGPU::SReg_128RegClass;
  case 160:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
                                                 &AMDGPU::SReg_160RegClass;
  case 256:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass :
                                                 &AMDGPU::SReg_256RegClass;
  case 512:
    return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
                                                 &AMDGPU::SReg_512RegClass;
  default:
    if (Size < 32)
      return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
                                                   &AMDGPU::SReg_32_XM0RegClass;
    return nullptr;
  }
}
const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                         const MachineRegisterInfo &MRI) const {
  if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()))
    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
  return nullptr;
}
unsigned SIRegisterInfo::getVCC() const {
  return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}
const TargetRegisterClass *
SIRegisterInfo::getRegClass(unsigned RCID) const {
  switch ((int)RCID) {
  case AMDGPU::SReg_1RegClassID:
    return getBoolRC();
  case AMDGPU::SReg_1_XEXECRegClassID:
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
      : &AMDGPU::SReg_64_XEXECRegClass;
  case -1:
    return nullptr;
  default:
    return AMDGPURegisterInfo::getRegClass(RCID);
  }
}
// Find reaching register definition.
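// For a virtual register, the reaching definition is read off its live
// interval (respecting subregister lane masks); for a physical register, the
// latest dominating def across all register units is chosen.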
MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
                                              MachineInstr &Use,
                                              MachineRegisterInfo &MRI,
                                              LiveIntervals *LIS) const {
  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
  SlotIndex DefIdx;

  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
    if (!LIS->hasInterval(Reg))
      return nullptr;
    LiveInterval &LI = LIS->getInterval(Reg);
    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
                                  : MRI.getMaxLaneMaskForVReg(Reg);
    VNInfo *V = nullptr;
    if (LI.hasSubRanges()) {
      for (auto &S : LI.subranges()) {
        if ((S.LaneMask & SubLanes) == SubLanes) {
          V = S.getVNInfoAt(UseIdx);
          break;
        }
      }
    } else {
      V = LI.getVNInfoAt(UseIdx);
    }
    if (!V)
      return nullptr;
    DefIdx = V->def;
  } else {
    // Find last def.
    for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) {
      LiveRange &LR = LIS->getRegUnit(*Units);
      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
        if (!DefIdx.isValid() ||
            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
                          LIS->getInstructionFromIndex(V->def)))
          DefIdx = V->def;
      } else {
        return nullptr;
      }
    }
  }

  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);

  if (!Def || !MDT.dominates(Def, &Use))
    return nullptr;

  assert(Def->modifiesRegister(Reg, this));

  return Def;
}