tools/llvm-mca/Views/BottleneckAnalysis.h

   1 //===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 /// \file
   9 ///
  10 /// This file implements the bottleneck analysis view.
  11 ///
  12 /// This view internally observes backend pressure increase events in order to
  13 /// identify problematic data dependencies and processor resource interferences.
  14 ///
  15 /// Example of bottleneck analysis report for a dot-product on X86 btver2:
  16 ///
  17 /// Cycles with backend pressure increase [ 40.76% ]
  18 /// Throughput Bottlenecks:
  19 ///   Resource Pressure       [ 39.34% ]
  20 ///   - JFPA  [ 39.34% ]
  21 ///   - JFPU0  [ 39.34% ]
  22 ///   Data Dependencies:      [ 1.42% ]
  23 ///   - Register Dependencies [ 1.42% ]
  24 ///   - Memory Dependencies   [ 0.00% ]
  25 ///
  26 /// According to the example, backend pressure increased during the 40.76% of
  27 /// the simulated cycles.  In particular, the major cause of backend pressure
  28 /// increases was the contention on floating point adder JFPA accessible from
  29 /// pipeline resource JFPU0.
  30 ///
  31 /// At the end of each cycle, if pressure on the simulated out-of-order buffers
  32 /// has increased, a backend pressure event is reported.
  33 /// In particular, this occurs when there is a delta between the number of uOps
  34 /// dispatched and the number of uOps issued to the underlying pipelines.
  35 ///
  36 /// The bottleneck analysis view is also responsible for identifying and printing
  37 /// the most "critical" sequence of dependent instructions according to the
  38 /// simulated run.
  39 ///
  40 /// Below is the critical sequence computed for the dot-product example on
  41 /// btver2:
  42 ///
  43 ///              Instruction                     Dependency Information
  44 /// +----< 2.    vhaddps %xmm3, %xmm3, %xmm4
  45 /// |
  46 /// |    < loop carried >
  47 /// |
  48 /// |      0.    vmulps  %xmm0, %xmm0, %xmm2
  49 /// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
  50 /// +----> 2.    vhaddps %xmm3, %xmm3, %xmm4     ## REGISTER dependency:  %xmm3
  51 /// |
  52 /// |    < loop carried >
  53 /// |
  54 /// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
  55 ///
  56 ///
  57 /// The algorithm that computes the critical sequence is very similar to a
  58 /// critical path analysis.
  59 ///
  60 /// A dependency graph is used internally to track dependencies between nodes.
  61 /// Nodes of the graph represent instructions from the input assembly sequence,
  62 /// and edges of the graph represent data dependencies or processor resource
  63 /// interferences.
  64 ///
  65 /// Edges are dynamically 'discovered' by observing instruction state transitions
  66 /// and backend pressure increase events. Edges are internally ranked based on
  67 /// their "criticality". A dependency is considered to be critical if it takes a
  68 /// long time to execute, and if it contributes to backend pressure increases.
  69 /// Criticality is internally measured in terms of cycles; it is computed for
  70 /// every edge in the graph as a function of the edge latency and the number of
  71 /// backend pressure increase cycles contributed by that edge.
  72 ///
  73 /// At the end of simulation, costs are propagated to nodes through the edges of
  74 /// the graph, and the most expensive path connecting the root-set (a
  75 /// set of nodes with no predecessors) to a leaf node is reported as critical
  76 /// sequence.
  77 //
  78 //===----------------------------------------------------------------------===//
  79
  80 #ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
  81 #define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
  82
  83 #include "Views/View.h"
  84 #include "llvm/ADT/DenseMap.h"
  85 #include "llvm/ADT/SmallVector.h"
  86 #include "llvm/MC/MCInstPrinter.h"
  87 #include "llvm/MC/MCSchedule.h"
  88 #include "llvm/MC/MCSubtargetInfo.h"
  89 #include "llvm/Support/raw_ostream.h"
  90
  91 namespace llvm {
  92 namespace mca {
  93
  94 class PressureTracker {
  95   const MCSchedModel &SM;
  96
  97   // Resource pressure distribution. There is an element for every processor
  98   // resource declared by the scheduling model. Quantities are number of cycles.
  99   SmallVector<unsigned, 4> ResourcePressureDistribution;
 100
 101   // Each processor resource is associated with a so-called processor resource
 102   // mask. This vector allows to correlate processor resource IDs with processor
 103   // resource masks. There is exactly one element per each processor resource
 104   // declared by the scheduling model.
 105   SmallVector<uint64_t, 4> ProcResID2Mask;
 106
 107   // Maps processor resource state indices (returned by calls to
 108   // `getResourceStateIndex(Mask)` to processor resource identifiers.
 109   SmallVector<unsigned, 4> ResIdx2ProcResID;
 110
 111   // Maps Processor Resource identifiers to ResourceUsers indices.
 112   SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;
 113
 114   // Identifies the last user of a processor resource unit.
 115   // This vector is updated on every instruction issued event.
 116   // There is one entry for every processor resource unit declared by the
 117   // processor model. An all_ones value is treated like an invalid instruction
 118   // identifier.
 119   using User = std::pair<unsigned, unsigned>;
 120   SmallVector<User, 4> ResourceUsers;
 121
 122   struct InstructionPressureInfo {
 123     unsigned RegisterPressureCycles;
 124     unsigned MemoryPressureCycles;
 125     unsigned ResourcePressureCycles;
 126   };
 127   DenseMap<unsigned, InstructionPressureInfo> IPI;
 128
 129   void updateResourcePressureDistribution(uint64_t CumulativeMask);
 130
 131   User getResourceUser(unsigned ProcResID, unsigned UnitID) const {
 132     unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];
 133     return ResourceUsers[Index + UnitID];
 134   }
 135
 136 public:
 137   PressureTracker(const MCSchedModel &Model);
 138
 139   ArrayRef<unsigned> getResourcePressureDistribution() const {
 140     return ResourcePressureDistribution;
 141   }
 142
 143   void getResourceUsers(uint64_t ResourceMask,
 144                         SmallVectorImpl<User> &Users) const;
 145
 146   unsigned getRegisterPressureCycles(unsigned IID) const {
 147     assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
 148     const InstructionPressureInfo &Info = IPI.find(IID)->second;
 149     return Info.RegisterPressureCycles;
 150   }
 151
 152   unsigned getMemoryPressureCycles(unsigned IID) const {
 153     assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
 154     const InstructionPressureInfo &Info = IPI.find(IID)->second;
 155     return Info.MemoryPressureCycles;
 156   }
 157
 158   unsigned getResourcePressureCycles(unsigned IID) const {
 159     assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
 160     const InstructionPressureInfo &Info = IPI.find(IID)->second;
 161     return Info.ResourcePressureCycles;
 162   }
 163
 164   const char *resolveResourceName(uint64_t ResourceMask) const {
 165     unsigned Index = getResourceStateIndex(ResourceMask);
 166     unsigned ProcResID = ResIdx2ProcResID[Index];
 167     const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
 168     return PRDesc.Name;
 169   }
 170
 171   void onInstructionDispatched(unsigned IID);
 172   void onInstructionExecuted(unsigned IID);
 173
 174   void handlePressureEvent(const HWPressureEvent &Event);
 175   void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
 176 };
 177
 178 // A dependency edge.
 179 struct DependencyEdge {
 180   enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
 181
 182   // Dependency edge descriptor.
 183   //
 184   // It specifies the dependency type, as well as the edge cost in cycles.
 185   struct Dependency {
 186     DependencyType Type;
 187     uint64_t ResourceOrRegID;
 188     uint64_t Cost;
 189   };
 190   Dependency Dep;
 191
 192   unsigned FromIID;
 193   unsigned ToIID;
 194
 195   // Used by the bottleneck analysis to compute the interference
 196   // probability for processor resources.
 197   unsigned Frequency;
 198 };
 199
 200 // A dependency graph used by the bottleneck analysis to describe data
 201 // dependencies and processor resource interferences between instructions.
 202 //
 203 // There is a node (an instance of struct DGNode) for every instruction in the
 204 // input assembly sequence. Edges of the graph represent dependencies between
 205 // instructions.
 206 //
 207 // Each edge of the graph is associated with a cost value which is used
 208 // internally to rank dependency based on their impact on the runtime
 209 // performance (see field DependencyEdge::Dependency::Cost). In general, the
 210 // higher the cost of an edge, the higher the impact on performance.
 211 //
 212 // The cost of a dependency is a function of both the latency and the number of
 213 // cycles where the dependency has been seen as critical (i.e. contributing to
 214 // back-pressure increases).
 215 //
 216 // Loop carried dependencies are carefully expanded by the bottleneck analysis
 217 // to guarantee that the graph stays acyclic. To this end, extra nodes are
 218 // pre-allocated at construction time to describe instructions from "past and
 219 // future" iterations. The graph is kept acyclic mainly because it simplifies the
 220 // complexity of the algorithm that computes the critical sequence.
 221 class DependencyGraph {
 222   struct DGNode {
 223     unsigned NumPredecessors;
 224     unsigned NumVisitedPredecessors;
 225     uint64_t Cost;
 226     unsigned Depth;
 227
 228     DependencyEdge CriticalPredecessor;
 229     SmallVector<DependencyEdge, 8> OutgoingEdges;
 230   };
 231   SmallVector<DGNode, 16> Nodes;
 232
 233   DependencyGraph(const DependencyGraph &) = delete;
 234   DependencyGraph &operator=(const DependencyGraph &) = delete;
 235
 236   void addDependency(unsigned From, unsigned To,
 237                      DependencyEdge::Dependency &&DE);
 238
 239   void pruneEdges(unsigned Iterations);
 240   void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
 241   void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations);
 242
 243 #ifndef NDEBUG
 244   void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
 245                           MCInstPrinter &MCIP) const;
 246 #endif
 247
 248 public:
 249   DependencyGraph(unsigned Size) : Nodes(Size) {}
 250
 251   void addRegisterDep(unsigned From, unsigned To, unsigned RegID,
 252                       unsigned Cost) {
 253     addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost});
 254   }
 255
 256   void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {
 257     addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost});
 258   }
 259
 260   void addResourceDep(unsigned From, unsigned To, uint64_t Mask,
 261                       unsigned Cost) {
 262     addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});
 263   }
 264
 265   // Called by the bottleneck analysis at the end of simulation to propagate
 266   // costs through the edges of the graph, and compute a critical path.
 267   void finalizeGraph(unsigned Iterations) {
 268     SmallVector<unsigned, 16> RootSet;
 269     pruneEdges(Iterations);
 270     initializeRootSet(RootSet);
 271     propagateThroughEdges(RootSet, Iterations);
 272   }
 273
 274   // Returns a sequence of edges representing the critical sequence based on the
 275   // simulated run. It assumes that the graph has already been finalized (i.e.
 276   // method `finalizeGraph()` has already been called on this graph).
 277   void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
 278
 279 #ifndef NDEBUG
 280   void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
 281 #endif
 282 };
 283
 284 /// A view that collects and prints a few performance numbers.
 285 class BottleneckAnalysis : public View {
 286   const MCSubtargetInfo &STI;
 287   MCInstPrinter &MCIP;
 288   PressureTracker Tracker;
 289   DependencyGraph DG;
 290
 291   ArrayRef<MCInst> Source;
 292   unsigned Iterations;
 293   unsigned TotalCycles;
 294
 295   bool PressureIncreasedBecauseOfResources;
 296   bool PressureIncreasedBecauseOfRegisterDependencies;
 297   bool PressureIncreasedBecauseOfMemoryDependencies;
 298   // True if throughput was affected by dispatch stalls.
 299   bool SeenStallCycles;
 300
 301   struct BackPressureInfo {
 302     // Cycles where backpressure increased.
 303     unsigned PressureIncreaseCycles;
 304     // Cycles where backpressure increased because of pipeline pressure.
 305     unsigned ResourcePressureCycles;
 306     // Cycles where backpressure increased because of data dependencies.
 307     unsigned DataDependencyCycles;
 308     // Cycles where backpressure increased because of register dependencies.
 309     unsigned RegisterDependencyCycles;
 310     // Cycles where backpressure increased because of memory dependencies.
 311     unsigned MemoryDependencyCycles;
 312   };
 313   BackPressureInfo BPI;
 314
 315   // Used to populate the dependency graph DG.
 316   void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);
 317   void addMemoryDep(unsigned From, unsigned To, unsigned Cy);
 318   void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);
 319
 320   // Prints a bottleneck message to OS.
 321   void printBottleneckHints(raw_ostream &OS) const;
 322   void printCriticalSequence(raw_ostream &OS) const;
 323
 324 public:
 325   BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
 326                      ArrayRef<MCInst> Sequence, unsigned Iterations);
 327
 328   void onCycleEnd() override;
 329   void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }
 330   void onEvent(const HWPressureEvent &Event) override;
 331   void onEvent(const HWInstructionEvent &Event) override;
 332
 333   void printView(raw_ostream &OS) const override;
 334
 335 #ifndef NDEBUG
 336   void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }
 337 #endif
 338 };
 339
 340 } // namespace mca
 341 } // namespace llvm
 342
 343 #endif