llvm/tools/llvm-exegesis/lib/SchedClassResolution.cpp

   1 //===-- SchedClassResolution.cpp --------------------------------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "SchedClassResolution.h"
  10 #include "BenchmarkResult.h"
  11 #include "llvm/ADT/STLExtras.h"
  12 #include "llvm/MC/MCAsmInfo.h"
  13 #include "llvm/MCA/Support.h"
  14 #include "llvm/Support/FormatVariadic.h"
  15 #include <limits>
  16 #include <unordered_set>
  17 #include <vector>
  18
  19 namespace llvm {
  20 namespace exegesis {
  21
  22 // Return the non-redundant list of WriteProcRes used by the given sched class.
  23 // The scheduling model for LLVM is such that each instruction has a certain
  24 // number of uops which consume resources which are described by WriteProcRes
  25 // entries. Each entry describe how many cycles are spent on a specific ProcRes
  26 // kind.
  27 // For example, an instruction might have 3 uOps, one dispatching on P0
  28 // (ProcResIdx=1) and two on P06 (ProcResIdx = 7).
  29 // Note that LLVM additionally denormalizes resource consumption to include
  30 // usage of super resources by subresources. So in practice if there exists a
  31 // P016 (ProcResIdx=10), then the cycles consumed by P0 are also consumed by
  32 // P06 (ProcResIdx = 7) and P016 (ProcResIdx = 10), and the resources consumed
  33 // by P06 are also consumed by P016. In the figure below, parenthesized cycles
  34 // denote implied usage of superresources by subresources:
  35 //            P0      P06    P016
  36 //     uOp1    1      (1)     (1)
  37 //     uOp2            1      (1)
  38 //     uOp3            1      (1)
  39 //     =============================
  40 //             1       3       3
  41 // Eventually we end up with three entries for the WriteProcRes of the
  42 // instruction:
  43 //    {ProcResIdx=1,  Cycles=1}  // P0
  44 //    {ProcResIdx=7,  Cycles=3}  // P06
  45 //    {ProcResIdx=10, Cycles=3}  // P016
  46 //
  47 // Note that in this case, P016 does not contribute any cycles, so it would
  48 // be removed by this function.
  49 // FIXME: Merge this with the equivalent in llvm-mca.
  50 static SmallVector<MCWriteProcResEntry, 8>
  51 getNonRedundantWriteProcRes(const MCSchedClassDesc &SCDesc,
  52                             const MCSubtargetInfo &STI) {
  53   SmallVector<MCWriteProcResEntry, 8> Result;
  54   const auto &SM = STI.getSchedModel();
  55   const unsigned NumProcRes = SM.getNumProcResourceKinds();
  56
  57   // Collect resource masks.
  58   SmallVector<uint64_t> ProcResourceMasks(NumProcRes);
  59   mca::computeProcResourceMasks(SM, ProcResourceMasks);
  60
  61   // Sort entries by smaller resources for (basic) topological ordering.
  62   using ResourceMaskAndEntry = std::pair<uint64_t, const MCWriteProcResEntry *>;
  63   SmallVector<ResourceMaskAndEntry, 8> ResourceMaskAndEntries;
  64   for (const auto *WPR = STI.getWriteProcResBegin(&SCDesc),
  65                   *const WPREnd = STI.getWriteProcResEnd(&SCDesc);
  66        WPR != WPREnd; ++WPR) {
  67     uint64_t Mask = ProcResourceMasks[WPR->ProcResourceIdx];
  68     ResourceMaskAndEntries.push_back({Mask, WPR});
  69   }
  70   sort(ResourceMaskAndEntries,
  71        [](const ResourceMaskAndEntry &A, const ResourceMaskAndEntry &B) {
  72          unsigned popcntA = llvm::popcount(A.first);
  73          unsigned popcntB = llvm::popcount(B.first);
  74          if (popcntA < popcntB)
  75            return true;
  76          if (popcntA > popcntB)
  77            return false;
  78          return A.first < B.first;
  79        });
  80
  81   SmallVector<float, 32> ProcResUnitUsage(NumProcRes);
  82   for (const ResourceMaskAndEntry &Entry : ResourceMaskAndEntries) {
  83     const MCWriteProcResEntry *WPR = Entry.second;
  84     const MCProcResourceDesc *const ProcResDesc =
  85         SM.getProcResource(WPR->ProcResourceIdx);
  86     // TODO: Handle AcquireAtAtCycle in llvm-exegesis and llvm-mca. See
  87     // https://github.com/llvm/llvm-project/issues/62680 and
  88     // https://github.com/llvm/llvm-project/issues/62681
  89     assert(WPR->AcquireAtCycle == 0 &&
  90            "`llvm-exegesis` does not handle AcquireAtCycle > 0");
  91     if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
  92       // This is a ProcResUnit.
  93       Result.push_back(
  94           {WPR->ProcResourceIdx, WPR->ReleaseAtCycle, WPR->AcquireAtCycle});
  95       ProcResUnitUsage[WPR->ProcResourceIdx] += WPR->ReleaseAtCycle;
  96     } else {
  97       // This is a ProcResGroup. First see if it contributes any cycles or if
  98       // it has cycles just from subunits.
  99       float RemainingCycles = WPR->ReleaseAtCycle;
 100       for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
 101            SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
 102            ++SubResIdx) {
 103         RemainingCycles -= ProcResUnitUsage[*SubResIdx];
 104       }
 105       if (RemainingCycles < 0.01f) {
 106         // The ProcResGroup contributes no cycles of its own.
 107         continue;
 108       }
 109       // The ProcResGroup contributes `RemainingCycles` cycles of its own.
 110       Result.push_back({WPR->ProcResourceIdx,
 111                         static_cast<uint16_t>(std::round(RemainingCycles)),
 112                         WPR->AcquireAtCycle});
 113       // Spread the remaining cycles over all subunits.
 114       for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
 115            SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
 116            ++SubResIdx) {
 117         ProcResUnitUsage[*SubResIdx] += RemainingCycles / ProcResDesc->NumUnits;
 118       }
 119     }
 120   }
 121   return Result;
 122 }
 123
 124 // Distributes a pressure budget as evenly as possible on the provided subunits
 125 // given the already existing port pressure distribution.
 126 //
 127 // The algorithm is as follows: while there is remaining pressure to
 128 // distribute, find the subunits with minimal pressure, and distribute
 129 // remaining pressure equally up to the pressure of the unit with
 130 // second-to-minimal pressure.
 131 // For example, let's assume we want to distribute 2*P1256
 132 // (Subunits = [P1,P2,P5,P6]), and the starting DensePressure is:
 133 //     DensePressure =        P0   P1   P2   P3   P4   P5   P6   P7
 134 //                           0.1  0.3  0.2  0.0  0.0  0.5  0.5  0.5
 135 //     RemainingPressure = 2.0
 136 // We sort the subunits by pressure:
 137 //     Subunits = [(P2,p=0.2), (P1,p=0.3), (P5,p=0.5), (P6, p=0.5)]
 138 // We'll first start by the subunits with minimal pressure, which are at
 139 // the beginning of the sorted array. In this example there is one (P2).
 140 // The subunit with second-to-minimal pressure is the next one in the
 141 // array (P1). So we distribute 0.1 pressure to P2, and remove 0.1 cycles
 142 // from the budget.
 143 //     Subunits = [(P2,p=0.3), (P1,p=0.3), (P5,p=0.5), (P5,p=0.5)]
 144 //     RemainingPressure = 1.9
 145 // We repeat this process: distribute 0.2 pressure on each of the minimal
 146 // P2 and P1, decrease budget by 2*0.2:
 147 //     Subunits = [(P2,p=0.5), (P1,p=0.5), (P5,p=0.5), (P5,p=0.5)]
 148 //     RemainingPressure = 1.5
 149 // There are no second-to-minimal subunits so we just share the remaining
 150 // budget (1.5 cycles) equally:
 151 //     Subunits = [(P2,p=0.875), (P1,p=0.875), (P5,p=0.875), (P5,p=0.875)]
 152 //     RemainingPressure = 0.0
 153 // We stop as there is no remaining budget to distribute.
 154 static void distributePressure(float RemainingPressure,
 155                                SmallVector<uint16_t, 32> Subunits,
 156                                SmallVector<float, 32> &DensePressure) {
 157   // Find the number of subunits with minimal pressure (they are at the
 158   // front).
 159   sort(Subunits, [&DensePressure](const uint16_t A, const uint16_t B) {
 160     return DensePressure[A] < DensePressure[B];
 161   });
 162   const auto getPressureForSubunit = [&DensePressure,
 163                                       &Subunits](size_t I) -> float & {
 164     return DensePressure[Subunits[I]];
 165   };
 166   size_t NumMinimalSU = 1;
 167   while (NumMinimalSU < Subunits.size() &&
 168          getPressureForSubunit(NumMinimalSU) == getPressureForSubunit(0)) {
 169     ++NumMinimalSU;
 170   }
 171   while (RemainingPressure > 0.0f) {
 172     if (NumMinimalSU == Subunits.size()) {
 173       // All units are minimal, just distribute evenly and be done.
 174       for (size_t I = 0; I < NumMinimalSU; ++I) {
 175         getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
 176       }
 177       return;
 178     }
 179     // Distribute the remaining pressure equally.
 180     const float MinimalPressure = getPressureForSubunit(NumMinimalSU - 1);
 181     const float SecondToMinimalPressure = getPressureForSubunit(NumMinimalSU);
 182     assert(MinimalPressure < SecondToMinimalPressure);
 183     const float Increment = SecondToMinimalPressure - MinimalPressure;
 184     if (RemainingPressure <= NumMinimalSU * Increment) {
 185       // There is not enough remaining pressure.
 186       for (size_t I = 0; I < NumMinimalSU; ++I) {
 187         getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
 188       }
 189       return;
 190     }
 191     // Bump all minimal pressure subunits to `SecondToMinimalPressure`.
 192     for (size_t I = 0; I < NumMinimalSU; ++I) {
 193       getPressureForSubunit(I) = SecondToMinimalPressure;
 194       RemainingPressure -= SecondToMinimalPressure;
 195     }
 196     while (NumMinimalSU < Subunits.size() &&
 197            getPressureForSubunit(NumMinimalSU) == SecondToMinimalPressure) {
 198       ++NumMinimalSU;
 199     }
 200   }
 201 }
 202
 203 std::vector<std::pair<uint16_t, float>>
 204 computeIdealizedProcResPressure(const MCSchedModel &SM,
 205                                 SmallVector<MCWriteProcResEntry, 8> WPRS) {
 206   // DensePressure[I] is the port pressure for Proc Resource I.
 207   SmallVector<float, 32> DensePressure(SM.getNumProcResourceKinds());
 208   sort(WPRS, [](const MCWriteProcResEntry &A, const MCWriteProcResEntry &B) {
 209     return A.ProcResourceIdx < B.ProcResourceIdx;
 210   });
 211   for (const MCWriteProcResEntry &WPR : WPRS) {
 212     // Get units for the entry.
 213     const MCProcResourceDesc *const ProcResDesc =
 214         SM.getProcResource(WPR.ProcResourceIdx);
 215     if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
 216       // This is a ProcResUnit.
 217       DensePressure[WPR.ProcResourceIdx] += WPR.ReleaseAtCycle;
 218     } else {
 219       // This is a ProcResGroup.
 220       SmallVector<uint16_t, 32> Subunits(ProcResDesc->SubUnitsIdxBegin,
 221                                          ProcResDesc->SubUnitsIdxBegin +
 222                                              ProcResDesc->NumUnits);
 223       distributePressure(WPR.ReleaseAtCycle, Subunits, DensePressure);
 224     }
 225   }
 226   // Turn dense pressure into sparse pressure by removing zero entries.
 227   std::vector<std::pair<uint16_t, float>> Pressure;
 228   for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
 229     if (DensePressure[I] > 0.0f)
 230       Pressure.emplace_back(I, DensePressure[I]);
 231   }
 232   return Pressure;
 233 }
 234
 235 ResolvedSchedClass::ResolvedSchedClass(const MCSubtargetInfo &STI,
 236                                        unsigned ResolvedSchedClassId,
 237                                        bool WasVariant)
 238     : SchedClassId(ResolvedSchedClassId),
 239       SCDesc(STI.getSchedModel().getSchedClassDesc(ResolvedSchedClassId)),
 240       WasVariant(WasVariant),
 241       NonRedundantWriteProcRes(getNonRedundantWriteProcRes(*SCDesc, STI)),
 242       IdealizedProcResPressure(computeIdealizedProcResPressure(
 243           STI.getSchedModel(), NonRedundantWriteProcRes)) {
 244   assert((SCDesc == nullptr || !SCDesc->isVariant()) &&
 245          "ResolvedSchedClass should never be variant");
 246 }
 247
 248 static unsigned ResolveVariantSchedClassId(const MCSubtargetInfo &STI,
 249                                            const MCInstrInfo &InstrInfo,
 250                                            unsigned SchedClassId,
 251                                            const MCInst &MCI) {
 252   const auto &SM = STI.getSchedModel();
 253   while (SchedClassId && SM.getSchedClassDesc(SchedClassId)->isVariant()) {
 254     SchedClassId = STI.resolveVariantSchedClass(SchedClassId, &MCI, &InstrInfo,
 255                                                 SM.getProcessorID());
 256   }
 257   return SchedClassId;
 258 }
 259
 260 std::pair<unsigned /*SchedClassId*/, bool /*WasVariant*/>
 261 ResolvedSchedClass::resolveSchedClassId(const MCSubtargetInfo &SubtargetInfo,
 262                                         const MCInstrInfo &InstrInfo,
 263                                         const MCInst &MCI) {
 264   unsigned SchedClassId = InstrInfo.get(MCI.getOpcode()).getSchedClass();
 265   const bool WasVariant = SchedClassId && SubtargetInfo.getSchedModel()
 266                                               .getSchedClassDesc(SchedClassId)
 267                                               ->isVariant();
 268   SchedClassId =
 269       ResolveVariantSchedClassId(SubtargetInfo, InstrInfo, SchedClassId, MCI);
 270   return std::make_pair(SchedClassId, WasVariant);
 271 }
 272
 273 // Returns a ProxResIdx by id or name.
 274 static unsigned findProcResIdx(const MCSubtargetInfo &STI,
 275                                const StringRef NameOrId) {
 276   // Interpret the key as an ProcResIdx.
 277   unsigned ProcResIdx = 0;
 278   if (to_integer(NameOrId, ProcResIdx, 10))
 279     return ProcResIdx;
 280   // Interpret the key as a ProcRes name.
 281   const auto &SchedModel = STI.getSchedModel();
 282   for (int I = 0, E = SchedModel.getNumProcResourceKinds(); I < E; ++I) {
 283     if (NameOrId == SchedModel.getProcResource(I)->Name)
 284       return I;
 285   }
 286   return 0;
 287 }
 288
 289 std::vector<BenchmarkMeasure> ResolvedSchedClass::getAsPoint(
 290     Benchmark::ModeE Mode, const MCSubtargetInfo &STI,
 291     ArrayRef<PerInstructionStats> Representative) const {
 292   const size_t NumMeasurements = Representative.size();
 293
 294   std::vector<BenchmarkMeasure> SchedClassPoint(NumMeasurements);
 295
 296   if (Mode == Benchmark::Latency) {
 297     assert(NumMeasurements == 1 && "Latency is a single measure.");
 298     BenchmarkMeasure &LatencyMeasure = SchedClassPoint[0];
 299
 300     // Find the latency.
 301     LatencyMeasure.PerInstructionValue = 0.0;
 302
 303     for (unsigned I = 0; I < SCDesc->NumWriteLatencyEntries; ++I) {
 304       const MCWriteLatencyEntry *const WLE =
 305           STI.getWriteLatencyEntry(SCDesc, I);
 306       LatencyMeasure.PerInstructionValue =
 307           std::max<double>(LatencyMeasure.PerInstructionValue, WLE->Cycles);
 308     }
 309   } else if (Mode == Benchmark::Uops) {
 310     for (auto I : zip(SchedClassPoint, Representative)) {
 311       BenchmarkMeasure &Measure = std::get<0>(I);
 312       const PerInstructionStats &Stats = std::get<1>(I);
 313
 314       StringRef Key = Stats.key();
 315       uint16_t ProcResIdx = findProcResIdx(STI, Key);
 316       if (ProcResIdx > 0) {
 317         // Find the pressure on ProcResIdx `Key`.
 318         const auto ProcResPressureIt =
 319             llvm::find_if(IdealizedProcResPressure,
 320                           [ProcResIdx](const std::pair<uint16_t, float> &WPR) {
 321                             return WPR.first == ProcResIdx;
 322                           });
 323         Measure.PerInstructionValue =
 324             ProcResPressureIt == IdealizedProcResPressure.end()
 325                 ? 0.0
 326                 : ProcResPressureIt->second;
 327       } else if (Key == "NumMicroOps") {
 328         Measure.PerInstructionValue = SCDesc->NumMicroOps;
 329       } else {
 330         errs() << "expected `key` to be either a ProcResIdx or a ProcRes "
 331                   "name, got "
 332                << Key << "\n";
 333         return {};
 334       }
 335     }
 336   } else if (Mode == Benchmark::InverseThroughput) {
 337     assert(NumMeasurements == 1 && "Inverse Throughput is a single measure.");
 338     BenchmarkMeasure &RThroughputMeasure = SchedClassPoint[0];
 339
 340     RThroughputMeasure.PerInstructionValue =
 341         MCSchedModel::getReciprocalThroughput(STI, *SCDesc);
 342   } else {
 343     llvm_unreachable("unimplemented measurement matching mode");
 344   }
 345
 346   return SchedClassPoint;
 347 }
 348
 349 } // namespace exegesis
 350 } // namespace llvm