//===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the CacheMetrics class and functions for showing
// metrics of cache lines.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/CacheMetrics.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm>
#include <cmath>
#include <limits>
#include <unordered_map>

using namespace llvm;
using namespace bolt;

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<unsigned> ITLBPageSize;
extern cl::opt<unsigned> ITLBEntries;

} // namespace opts

namespace {

/// Initialize the position maps for binary basic blocks: record, for every
/// block, its address and size in the binary.
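///
/// For example (hypothetical values): a block whose code starts at output
/// address 0x400100 and occupies 24 bytes yields BBAddr[&BB] == 0x400100 and
/// BBSize[&BB] == 24.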
void extractBasicBlockInfo(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  for (BinaryFunction *BF : BinaryFunctions) {
    const BinaryContext &BC = BF->getBinaryContext();
    for (BinaryBasicBlock &BB : *BF) {
      if (BF->isSimple() || BC.HasRelocations) {
        // Use addresses/sizes as in the output binary
        BBAddr[&BB] = BB.getOutputAddressRange().first;
        BBSize[&BB] = BB.getOutputSize();
      } else {
        // Output ranges should match the input if the body hasn't changed
        BBAddr[&BB] = BB.getInputAddressRange().first + BF->getAddress();
        BBSize[&BB] = BB.getOriginalSize();
      }
    }
  }
}

/// Calculate the TSP metric, which quantifies the number of fallthrough jumps
/// in the ordering of basic blocks. The method returns a pair
/// (the number of fallthrough branches, the total number of branches).
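///
/// A small worked example with hypothetical layout and counts: if blocks A, B,
/// C are laid out contiguously and the profile records a branch A->B taken 100
/// times (a fallthrough, since addr(A) + size(A) == addr(B)) and a branch A->C
/// taken 50 times (not a fallthrough), the result is the pair (100, 150), i.e.
/// a TSP score of about 66.67%.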
std::pair<uint64_t, uint64_t>
calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
             const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
             const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
  uint64_t Score = 0;
  uint64_t JumpCount = 0;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (!BF->hasProfile())
      continue;
    for (BinaryBasicBlock *SrcBB : BF->getLayout().blocks()) {
      auto BI = SrcBB->branch_info_begin();
      for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
        if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
          JumpCount += BI->Count;
          // Count the branch as a fallthrough if the successor is laid out
          // immediately after the source block
          if (BBAddr.at(SrcBB) + BBSize.at(SrcBB) == BBAddr.at(DstBB))
            Score += BI->Count;
        }
        ++BI;
      }
    }
  }
  return std::make_pair(Score, JumpCount);
}

using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;
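// Each entry pairs a caller function with the sampled number of calls it
// makes to a given callee.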

/// Build a simplified version of the call graph: for every function, keep
/// its callers and the frequencies of the calls.
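///
/// For illustration (hypothetical functions): if foo calls bar 10 times and
/// baz calls bar 5 times, the result maps bar to {{foo, 10}, {baz, 5}}.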
std::unordered_map<const BinaryFunction *, Predecessors>
extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
  std::unordered_map<const BinaryFunction *, Predecessors> Calls;

  for (BinaryFunction *SrcFunction : BinaryFunctions) {
    const BinaryContext &BC = SrcFunction->getBinaryContext();
    for (const BinaryBasicBlock *BB : SrcFunction->getLayout().blocks()) {
      // Find call instructions and extract target symbols from each one
      for (const MCInst &Inst : *BB) {
        if (!BC.MIB->isCall(Inst))
          continue;

        const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
        uint64_t Count = BB->getKnownExecutionCount();
        // Ignore calls w/o information
        if (DstSym == nullptr || Count == 0)
          continue;

        const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
        // Ignore recursive calls and calls to unknown or empty functions
        if (DstFunction == nullptr || DstFunction->getLayout().block_empty() ||
            DstFunction == SrcFunction)
          continue;

        // Record the call
        Calls[DstFunction].emplace_back(SrcFunction, Count);
      }
    }
  }
  return Calls;
}

/// Compute the expected hit ratio of the i-TLB cache, as optimized by the
/// HFSortPlus algorithm. Given an assignment of functions to i-TLB pages, we
/// divide all function calls into two categories:
/// - 'short' ones, where the caller-callee distance is less than a page;
/// - 'long' ones, where the distance exceeds a page.
/// The short calls are likely to result in an i-TLB cache hit. For the long
/// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
/// often the page is accessed). Assuming that functions are sent to the i-TLB
/// cache in a random order, the probability that a page is present in the
/// cache is proportional to the number of samples corresponding to the
/// functions on the page. The following procedure detects short and long
/// calls, and estimates the expected number of cache misses for the long ones.
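///
/// A worked example with hypothetical numbers: with CacheEntries = 2 and a
/// callee page holding 50% of all samples, the page is absent from the cache
/// with probability (1 - 0.5)^2 = 0.25, so roughly a quarter of the long
/// calls targeting that page are counted as expected misses.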
double expectedCacheHitRatio(
    const std::vector<BinaryFunction *> &BinaryFunctions,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
    const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {

  const double PageSize = opts::ITLBPageSize;
  const uint64_t CacheEntries = opts::ITLBEntries;
  std::unordered_map<const BinaryFunction *, Predecessors> Calls =
      extractFunctionCalls(BinaryFunctions);

  // Compute 'hotness' of the functions
  double TotalSamples = 0;
  std::unordered_map<BinaryFunction *, double> FunctionSamples;
  for (BinaryFunction *BF : BinaryFunctions) {
    double Samples = 0;
    for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF])
      Samples += Pair.second;
    Samples = std::max(Samples, (double)BF->getKnownExecutionCount());
    FunctionSamples[BF] = Samples;
    TotalSamples += Samples;
  }

  // Compute 'hotness' of the pages
  std::unordered_map<uint64_t, double> PageSamples;
  for (BinaryFunction *BF : BinaryFunctions) {
    if (BF->getLayout().block_empty())
      continue;
    // The page number is truncated to an integer page index when used as key
    double Page = BBAddr.at(BF->getLayout().block_front()) / PageSize;
    PageSamples[Page] += FunctionSamples.at(BF);
  }

  // Compute the expected number of misses for every function
  double Misses = 0;
  for (BinaryFunction *BF : BinaryFunctions) {
    // Skip the function if it has no samples
    if (BF->getLayout().block_empty() || FunctionSamples.at(BF) == 0.0)
      continue;
    double Samples = FunctionSamples.at(BF);
    double Page = BBAddr.at(BF->getLayout().block_front()) / PageSize;
    // The probability that the page is not present in the cache
    double MissProb = pow(1.0 - PageSamples[Page] / TotalSamples, CacheEntries);

    // Process all callers of the function
    for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
      BinaryFunction *SrcFunction = Pair.first;
      double SrcPage =
          BBAddr.at(SrcFunction->getLayout().block_front()) / PageSize;
      // Is this a 'long' or a 'short' call?
      if (Page != SrcPage) {
        // This is a miss
        Misses += MissProb * Pair.second;
      }
      Samples -= Pair.second;
    }
    assert(Samples >= 0.0 && "Function samples computed incorrectly");
    // The remaining samples likely come from the jitted code
    Misses += Samples * MissProb;
  }

  return 100.0 * (1.0 - Misses / TotalSamples);
}

} // namespace

void CacheMetrics::printAll(const std::vector<BinaryFunction *> &BFs) {
  // Stats related to hot-cold code splitting
  size_t NumFunctions = 0;
  size_t NumProfiledFunctions = 0;
  size_t NumHotFunctions = 0;
  size_t NumBlocks = 0;
  size_t NumHotBlocks = 0;

  size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t TotalCodeMaxAddr = 0;
  size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
  size_t HotCodeMaxAddr = 0;

  for (BinaryFunction *BF : BFs) {
    NumFunctions++;
    if (BF->hasProfile())
      NumProfiledFunctions++;
    if (BF->hasValidIndex())
      NumHotFunctions++;
    for (const BinaryBasicBlock &BB : *BF) {
      NumBlocks++;
      size_t BBAddrMin = BB.getOutputAddressRange().first;
      size_t BBAddrMax = BB.getOutputAddressRange().second;
      TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin);
      TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax);
      if (BF->hasValidIndex() && !BB.isCold()) {
        NumHotBlocks++;
        HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin);
        HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax);
      }
    }
  }

  outs() << format(" There are %zu functions;", NumFunctions)
         << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
                   100.0 * NumHotFunctions / NumFunctions)
         << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
                   100.0 * NumProfiledFunctions / NumFunctions);
  outs() << format(" There are %zu basic blocks;", NumBlocks)
         << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
                   100.0 * NumHotBlocks / NumBlocks);

  assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
  size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
  size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;
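
  // Report the hot-code footprint in terms of 2MB huge pages
  // (2 << 20 == 2,097,152 bytes), the typical large-page size on x86-64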
  size_t HugePage2MB = 2 << 20;
  outs() << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
                   "%.2lf huge pages)\n",
                   100.0 * HotCodeSize / TotalCodeSize, HotCodeSize,
                   TotalCodeSize, double(HotCodeSize) / HugePage2MB);

  // Stats related to expected cache performance
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
  std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
  extractBasicBlockInfo(BFs, BBAddr, BBSize);

  outs() << " Expected i-TLB cache hit ratio: "
         << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));

  auto Stats = calcTSPScore(BFs, BBAddr, BBSize);
  outs() << " TSP score: "
         << format("%.2lf%% (%zu out of %zu)\n",
                   100.0 * Stats.first / std::max<uint64_t>(Stats.second, 1),
                   Stats.first, Stats.second);
}