//===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the CacheMetrics class and functions for showing metrics
// of cache lines.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/CacheMetrics.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include <cmath>
#include <limits>
#include <unordered_map>

using namespace llvm;
using namespace bolt;

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<unsigned> ITLBPageSize;
extern cl::opt<unsigned> ITLBEntries;

} // namespace opts

namespace {

/// Initialize the BBAddr and BBSize maps with the address and size of each
/// binary basic block.
void extractBasicBlockInfo(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  for (BinaryFunction *BF : BinaryFunctions) {
    const BinaryContext &BC = BF->getBinaryContext();
    for (BinaryBasicBlock &BB : *BF) {
      if (BF->isSimple() || BC.HasRelocations) {
        // Use addresses/sizes as in the output binary
        BBAddr[&BB] = BB.getOutputAddressRange().first;
        BBSize[&BB] = BB.getOutputSize();
      } else {
        // Output ranges should match the input if the body hasn't changed
        BBAddr[&BB] = BB.getInputAddressRange().first + BF->getAddress();
        BBSize[&BB] = BB.getOriginalSize();
      }
    }
  }
}

/// Calculate the TSP metric, which quantifies the number of fallthrough jumps
/// in the ordering of basic blocks. The method returns a pair
/// (the number of fallthrough branches, the total number of branches).
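/// (The name refers to the Traveling Salesman Problem formulation of basic
/// block reordering.)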
std::pair<uint64_t, uint64_t>
calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
             const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
             const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
  uint64_t Score = 0;
  uint64_t JumpCount = 0;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (!BF->hasProfile())
      continue;
    for (BinaryBasicBlock *SrcBB : BF->getLayout().blocks()) {
      auto BI = SrcBB->branch_info_begin();
      for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
        if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
          JumpCount += BI->Count;
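          // The jump is a fallthrough iff SrcBB is placed immediately before
          // DstBB in the output layout.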
          if (BBAddr.at(SrcBB) + BBSize.at(SrcBB) == BBAddr.at(DstBB))
            Score += BI->Count;
        }
        ++BI;
      }
    }
  }
  return std::make_pair(Score, JumpCount);
}

using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;

/// Build a simplified version of the call graph: for every function, keep
/// its callers and the frequencies of the calls.
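/// The result maps each callee to its list of (caller, call count) pairs.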
std::unordered_map<const BinaryFunction *, Predecessors>
extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
  std::unordered_map<const BinaryFunction *, Predecessors> Calls;

  for (BinaryFunction *SrcFunction : BinaryFunctions) {
    const BinaryContext &BC = SrcFunction->getBinaryContext();
    for (const BinaryBasicBlock *BB : SrcFunction->getLayout().blocks()) {
      // Find call instructions and extract target symbols from each one
      for (const MCInst &Inst : *BB) {
        if (!BC.MIB->isCall(Inst))
          continue;

        // Call info
        const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
        uint64_t Count = BB->getKnownExecutionCount();
        // Ignore calls w/o information
        if (DstSym == nullptr || Count == 0)
          continue;

        const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
        // Ignore calls to unknown or empty functions, and recursive calls
        if (DstFunction == nullptr || DstFunction->getLayout().block_empty() ||
            DstFunction == SrcFunction)
          continue;

        // Record the call
        Calls[DstFunction].emplace_back(SrcFunction, Count);
      }
    }
  }
  return Calls;
}

/// Compute the expected hit ratio of the i-TLB cache (optimized by the
/// HFSortPlus algorithm). Given an assignment of functions to i-TLB pages, we
/// divide all function calls into two categories:
/// - 'short' ones that have a caller-callee distance less than a page;
/// - 'long' ones where the distance exceeds a page.
/// The short calls are likely to result in an i-TLB cache hit. For the long
/// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
/// often the page is accessed). Assuming that functions are sent to the i-TLB
/// cache in a random order, the probability that a page is present in the
/// cache is proportional to the number of samples corresponding to the
/// functions on the page. The following procedure detects short and long
/// calls, and estimates the expected number of cache misses for the long ones.
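/// Concretely, the code below models the miss probability of a long call to a
/// function on page P as (1 - PageSamples(P) / TotalSamples) ^ CacheEntries,
/// which is the MissProb value computed in the loop.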
double expectedCacheHitRatio(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  const double PageSize = opts::ITLBPageSize;
  const uint64_t CacheEntries = opts::ITLBEntries;
  std::unordered_map<const BinaryFunction *, Predecessors> Calls =
      extractFunctionCalls(BinaryFunctions);
  // Compute 'hotness' of the functions
  double TotalSamples = 0;
  std::unordered_map<BinaryFunction *, double> FunctionSamples;
  for (BinaryFunction *BF : BinaryFunctions) {
    double Samples = 0;
    for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF])
      Samples += Pair.second;
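    // A function's hotness is the larger of its own execution count and the
    // total frequency of the calls it receives.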
    Samples = std::max(Samples, (double)BF->getKnownExecutionCount());
    FunctionSamples[BF] = Samples;
    TotalSamples += Samples;
  }

  // Compute 'hotness' of the pages
  std::unordered_map<uint64_t, double> PageSamples;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (BF->getLayout().block_empty())
      continue;
    double Page = BBAddr.at(BF->getLayout().block_front()) / PageSize;
    PageSamples[Page] += FunctionSamples.at(BF);
  }

  // Computing the expected number of misses for every function
  double Misses = 0;
  for (BinaryFunction *BF : BinaryFunctions) {
    // Skip the function if it has no samples
    if (BF->getLayout().block_empty() || FunctionSamples.at(BF) == 0.0)
      continue;
    double Samples = FunctionSamples.at(BF);
    double Page = BBAddr.at(BF->getLayout().block_front()) / PageSize;
    // The probability that the page is not present in the cache
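    // (i.e., that none of the CacheEntries cache entries, each holding a page
    // drawn with probability proportional to its hotness, contains this page)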
    double MissProb = pow(1.0 - PageSamples[Page] / TotalSamples, CacheEntries);

    // Processing all callers of the function
    for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
      BinaryFunction *SrcFunction = Pair.first;
      double SrcPage =
          BBAddr.at(SrcFunction->getLayout().block_front()) / PageSize;
      // Is this a 'long' or a 'short' call?
      if (Page != SrcPage) {
        // This is a miss
        Misses += MissProb * Pair.second;
      }
      Samples -= Pair.second;
    }
    assert(Samples >= 0.0 && "Function samples computed incorrectly");
    // The remaining samples likely come from the jitted code
    Misses += Samples * MissProb;
  }

  return 100.0 * (1.0 - Misses / TotalSamples);
}

} // namespace

void CacheMetrics::printAll(const std::vector<BinaryFunction *> &BFs) {
  // Stats related to hot-cold code splitting
  size_t NumFunctions = 0;
  size_t NumProfiledFunctions = 0;
  size_t NumHotFunctions = 0;
  size_t NumBlocks = 0;
  size_t NumHotBlocks = 0;

  size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t TotalCodeMaxAddr = 0;
  size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t HotCodeMaxAddr = 0;

  for (BinaryFunction *BF : BFs) {
    NumFunctions++;
    if (BF->hasProfile())
      NumProfiledFunctions++;
    if (BF->hasValidIndex())
      NumHotFunctions++;
    for (const BinaryBasicBlock &BB : *BF) {
      NumBlocks++;
      size_t BBAddrMin = BB.getOutputAddressRange().first;
      size_t BBAddrMax = BB.getOutputAddressRange().second;
      TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin);
      TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax);
      if (BF->hasValidIndex() && !BB.isCold()) {
        NumHotBlocks++;
        HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin);
        HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax);
      }
    }
  }

  outs() << format(" There are %zu functions;", NumFunctions)
         << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
                   100.0 * NumHotFunctions / NumFunctions)
         << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
                   100.0 * NumProfiledFunctions / NumFunctions);
  outs() << format(" There are %zu basic blocks;", NumBlocks)
         << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
                   100.0 * NumHotBlocks / NumBlocks);

  assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
  size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
  size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;

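  // Report the hot code footprint in terms of 2 MiB huge pages (2 << 20 bytes)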
  size_t HugePage2MB = 2 << 20;
  outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
                   "%.2lf huge pages)\n",
                   100.0 * HotCodeSize / TotalCodeSize, HotCodeSize,
                   TotalCodeSize, double(HotCodeSize) / HugePage2MB);

  // Stats related to expected cache performance
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
  extractBasicBlockInfo(BFs, BBAddr, BBSize);

  outs() << " Expected i-TLB cache hit ratio: "
         << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));

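  // The TSP score is the fraction of profiled jumps that become fallthroughs
  // in the final block layout (see calcTSPScore above)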
  auto Stats = calcTSPScore(BFs, BBAddr, BBSize);
  outs() << " TSP score: "
         << format("%.2lf%% (%zu out of %zu)\n",
                   100.0 * Stats.first / std::max<uint64_t>(Stats.second, 1),
                   Stats.first, Stats.second);
}