llvm/tools/llvm-mca/Views/BottleneckAnalysis.h

   1 //===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 /// \file
   9 ///
  10 /// This file implements the bottleneck analysis view.
  11 ///
  12 /// This view internally observes backend pressure increase events in order to
  13 /// identify problematic data dependencies and processor resource interferences.
  14 ///
  15 /// Example of bottleneck analysis report for a dot-product on X86 btver2:
  16 ///
  17 /// Cycles with backend pressure increase [ 40.76% ]
  18 /// Throughput Bottlenecks:
  19 ///   Resource Pressure       [ 39.34% ]
  20 ///   - JFPA  [ 39.34% ]
  21 ///   - JFPU0  [ 39.34% ]
  22 ///   Data Dependencies:      [ 1.42% ]
  23 ///   - Register Dependencies [ 1.42% ]
  24 ///   - Memory Dependencies   [ 0.00% ]
  25 ///
  26 /// According to the example, backend pressure increased during 40.76% of
  27 /// the simulated cycles.  In particular, the major cause of backend pressure
  28 /// increases was the contention on floating point adder JFPA accessible from
  29 /// pipeline resource JFPU0.
  30 ///
  31 /// At the end of each cycle, if pressure on the simulated out-of-order buffers
  32 /// has increased, a backend pressure event is reported.
  33 /// In particular, this occurs when there is a delta between the number of uOps
  34 /// dispatched and the number of uOps issued to the underlying pipelines.
  35 ///
  36 /// The bottleneck analysis view is also responsible for identifying and
  37 /// printing the most "critical" sequence of dependent instructions according to
  38 /// the simulated run.
  39 ///
  40 /// Below is the critical sequence computed for the dot-product example on
  41 /// btver2:
  42 ///
  43 ///              Instruction                     Dependency Information
  44 /// +----< 2.    vhaddps %xmm3, %xmm3, %xmm4
  45 /// |
  46 /// |    < loop carried >
  47 /// |
  48 /// |      0.    vmulps  %xmm0, %xmm0, %xmm2
  49 /// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
  50 /// +----> 2.    vhaddps %xmm3, %xmm3, %xmm4     ## REGISTER dependency:  %xmm3
  51 /// |
  52 /// |    < loop carried >
  53 /// |
  54 /// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
  55 ///
  56 ///
  57 /// The algorithm that computes the critical sequence is very similar to a
  58 /// critical path analysis.
  59 ///
  60 /// A dependency graph is used internally to track dependencies between nodes.
  61 /// Nodes of the graph represent instructions from the input assembly sequence,
  62 /// and edges of the graph represent data dependencies or processor resource
  63 /// interferences.
  64 ///
  65 /// Edges are dynamically 'discovered' by observing instruction state
  66 /// transitions and backend pressure increase events. Edges are internally
  67 /// ranked based on their "criticality". A dependency is considered to be
  68 /// critical if it takes a long time to execute, and if it contributes to
  69 /// backend pressure increases. Criticality is internally measured in terms of
  70 /// cycles; it is computed for every edge in the graph as a function of the edge
  71 /// latency and the number of backend pressure increase cycles contributed by
  72 /// that edge.
  73 ///
  74 /// At the end of simulation, costs are propagated to nodes through the edges of
  75 /// the graph, and the most expensive path connecting the root-set (a
  76 /// set of nodes with no predecessors) to a leaf node is reported as the
  77 /// critical sequence.
  78 //
  79 //===----------------------------------------------------------------------===//
  80
  81 #ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
  82 #define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
  83
  84 #include "Views/InstructionView.h"
  85 #include "llvm/ADT/DenseMap.h"
  86 #include "llvm/ADT/SmallVector.h"
  87 #include "llvm/MC/MCInstPrinter.h"
  88 #include "llvm/MC/MCSchedule.h"
  89 #include "llvm/MC/MCSubtargetInfo.h"
  90 #include "llvm/Support/FormattedStream.h"
  91 #include "llvm/Support/raw_ostream.h"
  92
  93 namespace llvm {
  94 namespace mca {
  95
  96 class PressureTracker {
  97   const MCSchedModel &SM;
  98
  99   // Resource pressure distribution. There is an element for every processor
 100   // resource declared by the scheduling model. Quantities are number of cycles.
 101   SmallVector<unsigned, 4> ResourcePressureDistribution;
 102
 103   // Each processor resource is associated with a so-called processor resource
 104   // mask. This vector allows to correlate processor resource IDs with processor
 105   // resource masks. There is exactly one element per each processor resource
 106   // declared by the scheduling model.
 107   SmallVector<uint64_t, 4> ProcResID2Mask;
 108
 109   // Maps processor resource state indices (returned by calls to
 110   // `getResourceStateIndex(Mask)` to processor resource identifiers.
 111   SmallVector<unsigned, 4> ResIdx2ProcResID;
 112
 113   // Maps Processor Resource identifiers to ResourceUsers indices.
 114   SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;
 115
 116   // Identifies the last user of a processor resource unit.
 117   // This vector is updated on every instruction issued event.
 118   // There is one entry for every processor resource unit declared by the
 119   // processor model. An all_ones value is treated like an invalid instruction
 120   // identifier.
 121   using User = std::pair<unsigned, unsigned>;
 122   SmallVector<User, 4> ResourceUsers;
 123
 124   struct InstructionPressureInfo {
 125     unsigned RegisterPressureCycles;
 126     unsigned MemoryPressureCycles;
 127     unsigned ResourcePressureCycles;
 128   };
 129   DenseMap<unsigned, InstructionPressureInfo> IPI;
 130
 131   void updateResourcePressureDistribution(uint64_t CumulativeMask);
 132
 133   User getResourceUser(unsigned ProcResID, unsigned UnitID) const {
 134     unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];
 135     return ResourceUsers[Index + UnitID];
 136   }
 137
 138 public:
 139   PressureTracker(const MCSchedModel &Model);
 140
 141   ArrayRef<unsigned> getResourcePressureDistribution() const {
 142     return ResourcePressureDistribution;
 143   }
 144
 145   void getResourceUsers(uint64_t ResourceMask,
 146                         SmallVectorImpl<User> &Users) const;
 147
 148   unsigned getRegisterPressureCycles(unsigned IID) const {
 149     assert(IPI.contains(IID) && "Instruction is not tracked!");
 150     const InstructionPressureInfo &Info = IPI.find(IID)->second;
 151     return Info.RegisterPressureCycles;
 152   }
 153
 154   unsigned getMemoryPressureCycles(unsigned IID) const {
 155     assert(IPI.contains(IID) && "Instruction is not tracked!");
 156     const InstructionPressureInfo &Info = IPI.find(IID)->second;
 157     return Info.MemoryPressureCycles;
 158   }
 159
 160   unsigned getResourcePressureCycles(unsigned IID) const {
 161     assert(IPI.contains(IID) && "Instruction is not tracked!");
 162     const InstructionPressureInfo &Info = IPI.find(IID)->second;
 163     return Info.ResourcePressureCycles;
 164   }
 165
 166   const char *resolveResourceName(uint64_t ResourceMask) const {
 167     unsigned Index = getResourceStateIndex(ResourceMask);
 168     unsigned ProcResID = ResIdx2ProcResID[Index];
 169     const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
 170     return PRDesc.Name;
 171   }
 172
 173   void onInstructionDispatched(unsigned IID);
 174   void onInstructionExecuted(unsigned IID);
 175
 176   void handlePressureEvent(const HWPressureEvent &Event);
 177   void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
 178 };
 179
 180 // A dependency edge.
 181 struct DependencyEdge {
 182   enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
 183
 184   // Dependency edge descriptor.
 185   //
 186   // It specifies the dependency type, as well as the edge cost in cycles.
 187   struct Dependency {
 188     DependencyType Type;
 189     uint64_t ResourceOrRegID;
 190     uint64_t Cost;
 191   };
 192   Dependency Dep;
 193
 194   unsigned FromIID;
 195   unsigned ToIID;
 196
 197   // Used by the bottleneck analysis to compute the interference
 198   // probability for processor resources.
 199   unsigned Frequency;
 200 };
 201
 202 // A dependency graph used by the bottleneck analysis to describe data
 203 // dependencies and processor resource interferences between instructions.
 204 //
 205 // There is a node (an instance of struct DGNode) for every instruction in the
 206 // input assembly sequence. Edges of the graph represent dependencies between
 207 // instructions.
 208 //
 209 // Each edge of the graph is associated with a cost value which is used
 210 // internally to rank dependency based on their impact on the runtime
 211 // performance (see field DependencyEdge::Dependency::Cost). In general, the
 212 // higher the cost of an edge, the higher the impact on performance.
 213 //
 214 // The cost of a dependency is a function of both the latency and the number of
 215 // cycles where the dependency has been seen as critical (i.e. contributing to
 216 // back-pressure increases).
 217 //
 218 // Loop carried dependencies are carefully expanded by the bottleneck analysis
 219 // to guarantee that the graph stays acyclic. To this end, extra nodes are
 220 // pre-allocated at construction time to describe instructions from "past and
 221 // future" iterations. The graph is kept acyclic mainly because it simplifies
 222 // the complexity of the algorithm that computes the critical sequence.
 223 class DependencyGraph {
 224   struct DGNode {
 225     unsigned NumPredecessors;
 226     unsigned NumVisitedPredecessors;
 227     uint64_t Cost;
 228     unsigned Depth;
 229
 230     DependencyEdge CriticalPredecessor;
 231     SmallVector<DependencyEdge, 8> OutgoingEdges;
 232   };
 233   SmallVector<DGNode, 16> Nodes;
 234
 235   DependencyGraph(const DependencyGraph &) = delete;
 236   DependencyGraph &operator=(const DependencyGraph &) = delete;
 237
 238   void addDependency(unsigned From, unsigned To,
 239                      DependencyEdge::Dependency &&DE);
 240
 241   void pruneEdges(unsigned Iterations);
 242   void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
 243   void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet,
 244                              unsigned Iterations);
 245
 246 #ifndef NDEBUG
 247   void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
 248                           MCInstPrinter &MCIP) const;
 249 #endif
 250
 251 public:
 252   DependencyGraph(unsigned Size) : Nodes(Size) {}
 253
 254   void addRegisterDep(unsigned From, unsigned To, unsigned RegID,
 255                       unsigned Cost) {
 256     addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost});
 257   }
 258
 259   void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {
 260     addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost});
 261   }
 262
 263   void addResourceDep(unsigned From, unsigned To, uint64_t Mask,
 264                       unsigned Cost) {
 265     addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});
 266   }
 267
 268   // Called by the bottleneck analysis at the end of simulation to propagate
 269   // costs through the edges of the graph, and compute a critical path.
 270   void finalizeGraph(unsigned Iterations) {
 271     SmallVector<unsigned, 16> RootSet;
 272     pruneEdges(Iterations);
 273     initializeRootSet(RootSet);
 274     propagateThroughEdges(RootSet, Iterations);
 275   }
 276
 277   // Returns a sequence of edges representing the critical sequence based on the
 278   // simulated run. It assumes that the graph has already been finalized (i.e.
 279   // method `finalizeGraph()` has already been called on this graph).
 280   void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
 281
 282 #ifndef NDEBUG
 283   void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
 284 #endif
 285 };
 286
 287 /// A view that collects and prints a few performance numbers.
 288 class BottleneckAnalysis : public InstructionView {
 289   PressureTracker Tracker;
 290   DependencyGraph DG;
 291
 292   unsigned Iterations;
 293   unsigned TotalCycles;
 294
 295   bool PressureIncreasedBecauseOfResources;
 296   bool PressureIncreasedBecauseOfRegisterDependencies;
 297   bool PressureIncreasedBecauseOfMemoryDependencies;
 298   // True if throughput was affected by dispatch stalls.
 299   bool SeenStallCycles;
 300
 301   struct BackPressureInfo {
 302     // Cycles where backpressure increased.
 303     unsigned PressureIncreaseCycles;
 304     // Cycles where backpressure increased because of pipeline pressure.
 305     unsigned ResourcePressureCycles;
 306     // Cycles where backpressure increased because of data dependencies.
 307     unsigned DataDependencyCycles;
 308     // Cycles where backpressure increased because of register dependencies.
 309     unsigned RegisterDependencyCycles;
 310     // Cycles where backpressure increased because of memory dependencies.
 311     unsigned MemoryDependencyCycles;
 312   };
 313   BackPressureInfo BPI;
 314
 315   // Used to populate the dependency graph DG.
 316   void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);
 317   void addMemoryDep(unsigned From, unsigned To, unsigned Cy);
 318   void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);
 319
 320   void printInstruction(formatted_raw_ostream &FOS, const MCInst &MCI,
 321                         bool UseDifferentColor = false) const;
 322
 323   // Prints a bottleneck message to OS.
 324   void printBottleneckHints(raw_ostream &OS) const;
 325   void printCriticalSequence(raw_ostream &OS) const;
 326
 327 public:
 328   BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
 329                      ArrayRef<MCInst> Sequence, unsigned Iterations);
 330
 331   void onCycleEnd() override;
 332   void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }
 333   void onEvent(const HWPressureEvent &Event) override;
 334   void onEvent(const HWInstructionEvent &Event) override;
 335
 336   void printView(raw_ostream &OS) const override;
 337   StringRef getNameAsString() const override { return "BottleneckAnalysis"; }
 338   bool isSerializable() const override { return false; }
 339
 340 #ifndef NDEBUG
 341   void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }
 342 #endif
 343 };
 344
 345 } // namespace mca
 346 } // namespace llvm
 347
 348 #endif