//===----------------- AMDGPUCustomBehaviour.cpp ----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm::mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
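// For example, "s_waitcnt vmcnt(0) lgkmcnt(0)" carries its counter thresholds
// in a single immediate operand; copying the MCOperands onto the
// mca::Instruction here is what lets computeWaitCnt() read them back later.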
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}
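
// checkCustomHazard() reports how many cycles the instruction in IR must
// stall before it can be dispatched (0 means no hazard). The only custom
// hazard modelled here is an s_waitcnt whose counters have not yet drained.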
unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly, so we wouldn't see
  // any pseudo instructions here. However, there are plans for the future to
  // make it possible to use mca within backend passes. As such, I have left
  // the pseudo versions of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit
    // s_waitcnt 0, but I'm not sure if it would be appropriate
    // to model this in llvm-mca based on how the iterations work
    // while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works, so I did not attempt to model it.

  // Set the counter thresholds to their maximum values to begin with.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;
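
  // Starting each threshold at its maximum encodable value means that a
  // counter this particular s_waitcnt does not constrain will, in practice,
  // never be the reason we stall: the corresponding CurrX count cannot exceed
  // that threshold, so the comparisons below fail for it.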

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }
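
  // For example (illustrative): if three vmcnt-tagged loads are still in
  // flight (CurrVmcnt == 3) and this wait requires vmcnt(0) (Vmcnt == 0), we
  // must stall, and the stall we report is bounded by the soonest-retiring of
  // those loads. That is why an underestimate is acceptable (see below).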

  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // Should probably be checking for nullptr here, but I'm not sure how
    // to handle the case where we see a nullptr.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // The instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored. So the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // Redundant switch so I don't have to repeat the code above
    // for each case. There are more clever ways to avoid this
    // extra switch and anyone can feel free to implement one of them.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
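  // The remaining forms pack every counter into one immediate whose field
  // layout varies by ISA version (e.g., on gfx9 vmcnt is split across bits
  // [3:0] and [15:14]); AMDGPU::decodeWaitcnt() does the version-specific
  // unpacking for us.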
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic of this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions being examined are in MachineInstr form, whereas here we only
  // have access to the MCInst form. The side effect of this is that we can't
  // use the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
  // functions. Therefore, we conservatively assume that these functions will
  // return true. This may cause a few instructions to be incorrectly tagged
  // with an extra CNT. However, these are instructions that do interact with
  // at least one CNT, so giving them an extra CNT shouldn't cause issues in
  // most cases.
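  //
  // Each InstrWaitCntInfo entry records, per source instruction, which of the
  // four counters (vmcnt, expcnt, lgkmcnt, vscnt) that instruction increments;
  // handleWaitCnt() later reads these flags to decide whether an s_waitcnt
  // still has outstanding events to wait on.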
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
    const std::unique_ptr<Instruction> &Inst = EN.value();
    unsigned Index = EN.index();
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt(),
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// taken from SIInstrInfo::hasModifiersSet()
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isGWS()
bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
  const MCInstrDesc &MCID = MCII.get(Opcode);
  return MCID.TSFlags & SIInstrFlags::GWS;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
}

} // namespace llvm::mca

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}
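
// Note: these factories are looked up through the TargetRegistry, so tools
// such as llvm-mca pick up the AMDGPU-specific behaviour automatically once
// LLVMInitializeAMDGPUTargetMCA() has been called (typically via
// InitializeAllTargetMCAs()).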

/// Extern function to initialize the targets for the AMDGPU backend.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}