1 //===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This file implements the bottleneck analysis view.
12 /// This view internally observes backend pressure increase events in order to
13 /// identify problematic data dependencies and processor resource interferences.
15 /// Example of bottleneck analysis report for a dot-product on X86 btver2:
17 /// Cycles with backend pressure increase [ 40.76% ]
18 /// Throughput Bottlenecks:
19 /// Resource Pressure [ 39.34% ]
21 /// - JFPU0 [ 39.34% ]
22 /// Data Dependencies: [ 1.42% ]
23 /// - Register Dependencies [ 1.42% ]
24 /// - Memory Dependencies [ 0.00% ]
26 /// According to the example, backend pressure increased during the 40.76% of
27 /// the simulated cycles. In particular, the major cause of backend pressure
28 /// increases was the contention on floating point adder JFPA accessible from
29 /// pipeline resource JFPU0.
31 /// At the end of each cycle, if pressure on the simulated out-of-order buffers
32 /// has increased, a backend pressure event is reported.
33 /// In particular, this occurs when there is a delta between the number of uOps
34 /// dispatched and the number of uOps issued to the underlying pipelines.
36 /// The bottleneck analysis view is also responsible for identifying and
37 /// printing the most "critical" sequence of dependent instructions according to
38 /// the simulated run.
40 /// Below is the critical sequence computed for the dot-product example on
43 /// Instruction Dependency Information
44 /// +----< 2. vhaddps %xmm3, %xmm3, %xmm4
46 /// | < loop carried >
48 /// | 0. vmulps %xmm0, %xmm0, %xmm2
49 /// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
50 /// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3
52 /// | < loop carried >
54 /// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
57 /// The algorithm that computes the critical sequence is very similar to a
58 /// critical path analysis.
60 /// A dependency graph is used internally to track dependencies between nodes.
61 /// Nodes of the graph represent instructions from the input assembly sequence,
62 /// and edges of the graph represent data dependencies or processor resource
65 /// Edges are dynamically 'discovered' by observing instruction state
66 /// transitions and backend pressure increase events. Edges are internally
67 /// ranked based on their "criticality". A dependency is considered to be
68 /// critical if it takes a long time to execute, and if it contributes to
69 /// backend pressure increases. Criticality is internally measured in terms of
70 /// cycles; it is computed for every edge in the graph as a function of the edge
71 /// latency and the number of backend pressure increase cycles contributed by
74 /// At the end of simulation, costs are propagated to nodes through the edges of
75 /// the graph, and the most expensive path connecting the root-set (a
76 /// set of nodes with no predecessors) to a leaf node is reported as critical
79 //===----------------------------------------------------------------------===//
81 #ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
82 #define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
84 #include "Views/InstructionView.h"
85 #include "llvm/ADT/DenseMap.h"
86 #include "llvm/ADT/SmallVector.h"
87 #include "llvm/MC/MCInstPrinter.h"
88 #include "llvm/MC/MCSchedule.h"
89 #include "llvm/MC/MCSubtargetInfo.h"
90 #include "llvm/Support/FormattedStream.h"
91 #include "llvm/Support/raw_ostream.h"
96 class PressureTracker
{
97 const MCSchedModel
&SM
;
99 // Resource pressure distribution. There is an element for every processor
100 // resource declared by the scheduling model. Quantities are number of cycles.
101 SmallVector
<unsigned, 4> ResourcePressureDistribution
;
103 // Each processor resource is associated with a so-called processor resource
104 // mask. This vector allows to correlate processor resource IDs with processor
105 // resource masks. There is exactly one element per each processor resource
106 // declared by the scheduling model.
107 SmallVector
<uint64_t, 4> ProcResID2Mask
;
109 // Maps processor resource state indices (returned by calls to
110 // `getResourceStateIndex(Mask)` to processor resource identifiers.
111 SmallVector
<unsigned, 4> ResIdx2ProcResID
;
113 // Maps Processor Resource identifiers to ResourceUsers indices.
114 SmallVector
<unsigned, 4> ProcResID2ResourceUsersIndex
;
116 // Identifies the last user of a processor resource unit.
117 // This vector is updated on every instruction issued event.
118 // There is one entry for every processor resource unit declared by the
119 // processor model. An all_ones value is treated like an invalid instruction
121 using User
= std::pair
<unsigned, unsigned>;
122 SmallVector
<User
, 4> ResourceUsers
;
124 struct InstructionPressureInfo
{
125 unsigned RegisterPressureCycles
;
126 unsigned MemoryPressureCycles
;
127 unsigned ResourcePressureCycles
;
129 DenseMap
<unsigned, InstructionPressureInfo
> IPI
;
131 void updateResourcePressureDistribution(uint64_t CumulativeMask
);
133 User
getResourceUser(unsigned ProcResID
, unsigned UnitID
) const {
134 unsigned Index
= ProcResID2ResourceUsersIndex
[ProcResID
];
135 return ResourceUsers
[Index
+ UnitID
];
139 PressureTracker(const MCSchedModel
&Model
);
141 ArrayRef
<unsigned> getResourcePressureDistribution() const {
142 return ResourcePressureDistribution
;
145 void getResourceUsers(uint64_t ResourceMask
,
146 SmallVectorImpl
<User
> &Users
) const;
148 unsigned getRegisterPressureCycles(unsigned IID
) const {
149 assert(IPI
.contains(IID
) && "Instruction is not tracked!");
150 const InstructionPressureInfo
&Info
= IPI
.find(IID
)->second
;
151 return Info
.RegisterPressureCycles
;
154 unsigned getMemoryPressureCycles(unsigned IID
) const {
155 assert(IPI
.contains(IID
) && "Instruction is not tracked!");
156 const InstructionPressureInfo
&Info
= IPI
.find(IID
)->second
;
157 return Info
.MemoryPressureCycles
;
160 unsigned getResourcePressureCycles(unsigned IID
) const {
161 assert(IPI
.contains(IID
) && "Instruction is not tracked!");
162 const InstructionPressureInfo
&Info
= IPI
.find(IID
)->second
;
163 return Info
.ResourcePressureCycles
;
166 const char *resolveResourceName(uint64_t ResourceMask
) const {
167 unsigned Index
= getResourceStateIndex(ResourceMask
);
168 unsigned ProcResID
= ResIdx2ProcResID
[Index
];
169 const MCProcResourceDesc
&PRDesc
= *SM
.getProcResource(ProcResID
);
173 void onInstructionDispatched(unsigned IID
);
174 void onInstructionExecuted(unsigned IID
);
176 void handlePressureEvent(const HWPressureEvent
&Event
);
177 void handleInstructionIssuedEvent(const HWInstructionIssuedEvent
&Event
);
180 // A dependency edge.
181 struct DependencyEdge
{
182 enum DependencyType
{ DT_INVALID
, DT_REGISTER
, DT_MEMORY
, DT_RESOURCE
};
184 // Dependency edge descriptor.
186 // It specifies the dependency type, as well as the edge cost in cycles.
189 uint64_t ResourceOrRegID
;
197 // Used by the bottleneck analysis to compute the interference
198 // probability for processor resources.
202 // A dependency graph used by the bottleneck analysis to describe data
203 // dependencies and processor resource interferences between instructions.
205 // There is a node (an instance of struct DGNode) for every instruction in the
206 // input assembly sequence. Edges of the graph represent dependencies between
209 // Each edge of the graph is associated with a cost value which is used
210 // internally to rank dependency based on their impact on the runtime
211 // performance (see field DependencyEdge::Dependency::Cost). In general, the
212 // higher the cost of an edge, the higher the impact on performance.
214 // The cost of a dependency is a function of both the latency and the number of
215 // cycles where the dependency has been seen as critical (i.e. contributing to
216 // back-pressure increases).
218 // Loop carried dependencies are carefully expanded by the bottleneck analysis
219 // to guarantee that the graph stays acyclic. To this end, extra nodes are
220 // pre-allocated at construction time to describe instructions from "past and
221 // future" iterations. The graph is kept acyclic mainly because it simplifies
222 // the complexity of the algorithm that computes the critical sequence.
223 class DependencyGraph
{
225 unsigned NumPredecessors
;
226 unsigned NumVisitedPredecessors
;
230 DependencyEdge CriticalPredecessor
;
231 SmallVector
<DependencyEdge
, 8> OutgoingEdges
;
233 SmallVector
<DGNode
, 16> Nodes
;
235 DependencyGraph(const DependencyGraph
&) = delete;
236 DependencyGraph
&operator=(const DependencyGraph
&) = delete;
238 void addDependency(unsigned From
, unsigned To
,
239 DependencyEdge::Dependency
&&DE
);
241 void pruneEdges(unsigned Iterations
);
242 void initializeRootSet(SmallVectorImpl
<unsigned> &RootSet
) const;
243 void propagateThroughEdges(SmallVectorImpl
<unsigned> &RootSet
,
244 unsigned Iterations
);
247 void dumpDependencyEdge(raw_ostream
&OS
, const DependencyEdge
&DE
,
248 MCInstPrinter
&MCIP
) const;
252 DependencyGraph(unsigned Size
) : Nodes(Size
) {}
254 void addRegisterDep(unsigned From
, unsigned To
, unsigned RegID
,
256 addDependency(From
, To
, {DependencyEdge::DT_REGISTER
, RegID
, Cost
});
259 void addMemoryDep(unsigned From
, unsigned To
, unsigned Cost
) {
260 addDependency(From
, To
, {DependencyEdge::DT_MEMORY
, /* unused */ 0, Cost
});
263 void addResourceDep(unsigned From
, unsigned To
, uint64_t Mask
,
265 addDependency(From
, To
, {DependencyEdge::DT_RESOURCE
, Mask
, Cost
});
268 // Called by the bottleneck analysis at the end of simulation to propagate
269 // costs through the edges of the graph, and compute a critical path.
270 void finalizeGraph(unsigned Iterations
) {
271 SmallVector
<unsigned, 16> RootSet
;
272 pruneEdges(Iterations
);
273 initializeRootSet(RootSet
);
274 propagateThroughEdges(RootSet
, Iterations
);
277 // Returns a sequence of edges representing the critical sequence based on the
278 // simulated run. It assumes that the graph has already been finalized (i.e.
279 // method `finalizeGraph()` has already been called on this graph).
280 void getCriticalSequence(SmallVectorImpl
<const DependencyEdge
*> &Seq
) const;
283 void dump(raw_ostream
&OS
, MCInstPrinter
&MCIP
) const;
287 /// A view that collects and prints a few performance numbers.
288 class BottleneckAnalysis
: public InstructionView
{
289 PressureTracker Tracker
;
293 unsigned TotalCycles
;
295 bool PressureIncreasedBecauseOfResources
;
296 bool PressureIncreasedBecauseOfRegisterDependencies
;
297 bool PressureIncreasedBecauseOfMemoryDependencies
;
298 // True if throughput was affected by dispatch stalls.
299 bool SeenStallCycles
;
301 struct BackPressureInfo
{
302 // Cycles where backpressure increased.
303 unsigned PressureIncreaseCycles
;
304 // Cycles where backpressure increased because of pipeline pressure.
305 unsigned ResourcePressureCycles
;
306 // Cycles where backpressure increased because of data dependencies.
307 unsigned DataDependencyCycles
;
308 // Cycles where backpressure increased because of register dependencies.
309 unsigned RegisterDependencyCycles
;
310 // Cycles where backpressure increased because of memory dependencies.
311 unsigned MemoryDependencyCycles
;
313 BackPressureInfo BPI
;
315 // Used to populate the dependency graph DG.
316 void addRegisterDep(unsigned From
, unsigned To
, unsigned RegID
, unsigned Cy
);
317 void addMemoryDep(unsigned From
, unsigned To
, unsigned Cy
);
318 void addResourceDep(unsigned From
, unsigned To
, uint64_t Mask
, unsigned Cy
);
320 void printInstruction(formatted_raw_ostream
&FOS
, const MCInst
&MCI
,
321 bool UseDifferentColor
= false) const;
323 // Prints a bottleneck message to OS.
324 void printBottleneckHints(raw_ostream
&OS
) const;
325 void printCriticalSequence(raw_ostream
&OS
) const;
328 BottleneckAnalysis(const MCSubtargetInfo
&STI
, MCInstPrinter
&MCIP
,
329 ArrayRef
<MCInst
> Sequence
, unsigned Iterations
);
331 void onCycleEnd() override
;
332 void onEvent(const HWStallEvent
&Event
) override
{ SeenStallCycles
= true; }
333 void onEvent(const HWPressureEvent
&Event
) override
;
334 void onEvent(const HWInstructionEvent
&Event
) override
;
336 void printView(raw_ostream
&OS
) const override
;
337 StringRef
getNameAsString() const override
{ return "BottleneckAnalysis"; }
338 bool isSerializable() const override
{ return false; }
341 void dump(raw_ostream
&OS
, MCInstPrinter
&MCIP
) const { DG
.dump(OS
, MCIP
); }