//===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// BOLT often has to deal with profiles collected on binaries built from several
// revisions behind release. As a result, a certain percentage of functions is
// considered stale and not optimized. This file implements the ability to match
// profile data to functions that are not 100% binary identical, thus increasing
// the optimization coverage and boosting the performance of applications.
//
// The algorithm consists of two phases: matching and inference:
// - At the matching phase, we try to "guess" as many block and jump counts from
//   the stale profile as possible. To this end, the content of each basic block
//   is hashed and stored in the (yaml) profile. When BOLT optimizes a binary,
//   it computes block hashes and identifies the corresponding entries in the
//   stale profile. It yields a partial profile for every CFG in the binary.
// - At the inference phase, we employ a network flow-based algorithm (profi) to
//   reconstruct "realistic" block and jump counts from the partial profile
//   generated at the first stage. In practice, we don't always produce proper
//   profile data but the majority (e.g., >90%) of CFGs get the correct counts.
//
//===----------------------------------------------------------------------===//

#include "bolt/Core/HashUtilities.h"
#include "bolt/Profile/YAMLProfileReader.h"
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/MC/MCPseudoProbe.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/xxhash.h"
#include "llvm/Transforms/Utils/SampleProfileInference.h"

#include <queue>

using namespace llvm;

#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt-prof"

namespace opts {

extern cl::opt<bool> TimeRewrite;
extern cl::OptionCategory BoltOptCategory;

cl::opt<bool>
    InferStaleProfile("infer-stale-profile",
                      cl::desc("Infer counts from stale profile data."),
                      cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingMinMatchedBlock(
    "stale-matching-min-matched-block",
    cl::desc("Percentage threshold of matched basic blocks at which stale "
             "profile inference is executed."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingMaxFuncSize(
    "stale-matching-max-func-size",
    cl::desc("The maximum size of a function to consider for inference."),
    cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));

// Parameters of the profile inference algorithm. The default values are tuned
// on several benchmarks.
cl::opt<bool> StaleMatchingEvenFlowDistribution(
    "stale-matching-even-flow-distribution",
    cl::desc("Try to evenly distribute flow when there are multiple equally "
             "likely options."),
    cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingRebalanceUnknown(
    "stale-matching-rebalance-unknown",
    cl::desc("Evenly re-distribute flow among unknown subgraphs."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingJoinIslands(
    "stale-matching-join-islands",
    cl::desc("Join isolated components having positive flow."), cl::init(true),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockInc(
    "stale-matching-cost-block-inc",
    cl::desc("The cost of increasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockDec(
    "stale-matching-cost-block-dec",
    cl::desc("The cost of decreasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpInc(
    "stale-matching-cost-jump-inc",
    cl::desc("The cost of increasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpDec(
    "stale-matching-cost-jump-dec",
    cl::desc("The cost of decreasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
    "stale-matching-cost-block-unknown-inc",
    cl::desc("The cost of increasing an unknown block count by one."),
    cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
    "stale-matching-cost-jump-unknown-inc",
    cl::desc("The cost of increasing an unknown jump count by one."),
    cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
    "stale-matching-cost-jump-unknown-ft-inc",
    cl::desc(
        "The cost of increasing an unknown fall-through jump count by one."),
    cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingWithPseudoProbes(
    "stale-matching-with-pseudo-probes",
    cl::desc("Turns on stale matching with block pseudo probes."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));

} // namespace opts

namespace llvm {
namespace bolt {

/// An object wrapping several components of a basic block hash. The combined
/// (blended) hash is represented and stored as one uint64_t, while individual
/// components are of smaller size (e.g., uint16_t or uint8_t).
struct BlendedBlockHash {
private:
  using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
  using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
  using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
  using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
  using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;
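
  // Layout of the combined 64-bit hash, from the least to the most significant
  // bits (mirroring the Bitfield::Element definitions above):
  //   [ 0..15] Offset   [16..31] OpcodeHash   [32..47] InstrHash
  //   [48..55] PredHash [56..63] SuccHash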

public:
  explicit BlendedBlockHash() {}

  explicit BlendedBlockHash(uint64_t Hash) {
    Offset = Bitfield::get<ValueOffset>(Hash);
    OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
    InstrHash = Bitfield::get<ValueInstr>(Hash);
    PredHash = Bitfield::get<ValuePred>(Hash);
    SuccHash = Bitfield::get<ValueSucc>(Hash);
  }

  /// Combine the blended hash into uint64_t.
  uint64_t combine() const {
    uint64_t Hash = 0;
    Bitfield::set<ValueOffset>(Hash, Offset);
    Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
    Bitfield::set<ValueInstr>(Hash, InstrHash);
    Bitfield::set<ValuePred>(Hash, PredHash);
    Bitfield::set<ValueSucc>(Hash, SuccHash);
    return Hash;
  }
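
  // Since the five fields above cover all 64 bits, combine() and the uint64_t
  // constructor are inverses of each other:
  //   BlendedBlockHash(H).combine() == H for any uint64_t H.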

  /// Compute a distance between two given blended hashes. The smaller the
  /// distance, the more similar two blocks are. For identical basic blocks,
  /// the distance is zero.
  uint64_t distance(const BlendedBlockHash &BBH) const {
    assert(OpcodeHash == BBH.OpcodeHash &&
           "incorrect blended hash distance computation");
    uint64_t Dist = 0;
    // Account for NeighborHash
    Dist += SuccHash == BBH.SuccHash ? 0 : 1;
    Dist += PredHash == BBH.PredHash ? 0 : 1;
    Dist <<= 16;
    // Account for InstrHash
    Dist += InstrHash == BBH.InstrHash ? 0 : 1;
    Dist <<= 16;
    // Account for Offset
    Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
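    // The shifts make the comparison lexicographic: neighbor (Pred/Succ) hash
    // mismatches dominate, then an InstrHash mismatch, and the offset delta
    // only breaks ties (it fits in the low 16 bits since Offset is uint16_t).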
    return Dist;
  }

  /// The offset of the basic block from the function start.
  uint16_t Offset{0};
  /// (Loose) Hash of the basic block instructions, excluding operands.
  uint16_t OpcodeHash{0};
  /// (Strong) Hash of the basic block instructions, including opcodes and
  /// operands.
  uint16_t InstrHash{0};
  /// (Loose) Hashes of the predecessors of the basic block.
  uint8_t PredHash{0};
  /// (Loose) Hashes of the successors of the basic block.
  uint8_t SuccHash{0};
};

/// The object is used to identify and match basic blocks in a BinaryFunction
/// given their hashes computed on a binary built from several revisions behind
/// release.
class StaleMatcher {
public:
  /// Initialize stale matcher.
  void init(const std::vector<FlowBlock *> &Blocks,
            const std::vector<BlendedBlockHash> &Hashes,
            const std::vector<uint64_t> &CallHashes) {
    assert(Blocks.size() == Hashes.size() &&
           Hashes.size() == CallHashes.size() &&
           "incorrect matcher initialization");
    for (size_t I = 0; I < Blocks.size(); I++) {
      FlowBlock *Block = Blocks[I];
      uint16_t OpHash = Hashes[I].OpcodeHash;
      OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
      if (CallHashes[I])
        CallHashToBlocks[CallHashes[I]].push_back(
            std::make_pair(Hashes[I], Block));
    }
  }

  /// Creates a mapping from a pseudo probe to a flow block.
  void mapProbeToBB(const MCDecodedPseudoProbe *Probe, FlowBlock *Block) {
    BBPseudoProbeToBlock[Probe] = Block;
  }

  enum MatchMethod : char {
    MATCH_EXACT = 0,
    MATCH_PROBE_EXACT,
    MATCH_PROBE_LOOSE,
    MATCH_OPCODE,
    MATCH_CALL,
    NO_MATCH,
  };

  /// Find the most similar flow block for a profile block given blended hash.
  std::pair<const FlowBlock *, MatchMethod>
  matchBlockStrict(BlendedBlockHash BlendedHash) {
    const auto &[Block, ExactHash] = matchWithOpcodes(BlendedHash);
    if (Block && ExactHash)
      return {Block, MATCH_EXACT};
    return {nullptr, NO_MATCH};
  }

  /// Find the most similar flow block for a profile block given pseudo probes.
  std::pair<const FlowBlock *, MatchMethod> matchBlockProbe(
      const ArrayRef<yaml::bolt::PseudoProbeInfo> PseudoProbes,
      const YAMLProfileReader::InlineTreeNodeMapTy &InlineTreeNodeMap) {
    const auto &[ProbeBlock, ExactProbe] =
        matchWithPseudoProbes(PseudoProbes, InlineTreeNodeMap);
    if (ProbeBlock)
      return {ProbeBlock, ExactProbe ? MATCH_PROBE_EXACT : MATCH_PROBE_LOOSE};
    return {nullptr, NO_MATCH};
  }

  /// Find the most similar flow block for a profile block given its hashes.
  std::pair<const FlowBlock *, MatchMethod>
  matchBlockLoose(BlendedBlockHash BlendedHash, uint64_t CallHash) {
    if (const FlowBlock *CallBlock = matchWithCalls(BlendedHash, CallHash))
      return {CallBlock, MATCH_CALL};
    if (const FlowBlock *OpcodeBlock = matchWithOpcodes(BlendedHash).first)
      return {OpcodeBlock, MATCH_OPCODE};
    return {nullptr, NO_MATCH};
  }
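
  // The three entry points above are tried by the caller in decreasing order
  // of strictness: exact opcode+instruction hash first, then pseudo probes,
  // and finally the loose call-hash/opcode-hash fallbacks.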

  /// Returns true if the two basic blocks (in the binary and in the profile)
  /// corresponding to the given hashes are matched to each other with a high
  /// confidence.
  static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
                                    BlendedBlockHash Hash2) {
    return Hash1.InstrHash == Hash2.InstrHash;
  }

private:
  using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
  std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
  std::unordered_map<uint64_t, std::vector<HashBlockPairType>> CallHashToBlocks;
  DenseMap<const MCDecodedPseudoProbe *, FlowBlock *> BBPseudoProbeToBlock;

  // Uses OpcodeHash to find the most similar block for a given hash.
  std::pair<const FlowBlock *, bool>
  matchWithOpcodes(BlendedBlockHash BlendedHash) const {
    auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
    if (BlockIt == OpHashToBlocks.end())
      return {nullptr, false};
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    BlendedBlockHash BestHash;
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.distance(BlendedHash);
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
        BestHash = Hash;
      }
    }
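    // The boolean result reports whether the closest candidate also has an
    // identical InstrHash; callers treat only such matches as MATCH_EXACT.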
    return {BestBlock, isHighConfidenceMatch(BestHash, BlendedHash)};
  }

  // Uses CallHash to find the most similar block for a given hash.
  const FlowBlock *matchWithCalls(BlendedBlockHash BlendedHash,
                                  uint64_t CallHash) const {
    if (!CallHash)
      return nullptr;

    auto BlockIt = CallHashToBlocks.find(CallHash);
    if (BlockIt == CallHashToBlocks.end())
      return nullptr;
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.OpcodeHash > BlendedHash.OpcodeHash
                          ? Hash.OpcodeHash - BlendedHash.OpcodeHash
                          : BlendedHash.OpcodeHash - Hash.OpcodeHash;
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
      }
    }
    return BestBlock;
  }

  /// Matches a profile block with a binary block based on pseudo probes.
  /// Returns the best matching block (or nullptr) and whether the match is
  /// unambiguous.
  std::pair<const FlowBlock *, bool> matchWithPseudoProbes(
      const ArrayRef<yaml::bolt::PseudoProbeInfo> BlockPseudoProbes,
      const YAMLProfileReader::InlineTreeNodeMapTy &InlineTreeNodeMap) const {

    if (!opts::StaleMatchingWithPseudoProbes)
      return {nullptr, false};

    DenseMap<const FlowBlock *, uint32_t> FlowBlockMatchCount;

    auto matchProfileProbeToBlock = [&](uint32_t NodeId,
                                        uint64_t ProbeId) -> const FlowBlock * {
      const MCDecodedPseudoProbeInlineTree *BinaryNode =
          InlineTreeNodeMap.getInlineTreeNode(NodeId);
      if (!BinaryNode)
        return nullptr;
      const MCDecodedPseudoProbe Dummy(0, ProbeId, PseudoProbeType::Block, 0, 0,
                                       nullptr);
      ArrayRef<MCDecodedPseudoProbe> BinaryProbes = BinaryNode->getProbes();
      auto BinaryProbeIt = llvm::lower_bound(
          BinaryProbes, Dummy, [](const auto &LHS, const auto &RHS) {
            return LHS.getIndex() < RHS.getIndex();
          });
      if (BinaryProbeIt == BinaryNode->getProbes().end() ||
          BinaryProbeIt->getIndex() != ProbeId)
        return nullptr;
      auto It = BBPseudoProbeToBlock.find(&*BinaryProbeIt);
      if (It == BBPseudoProbeToBlock.end())
        return nullptr;
      return It->second;
    };

    auto matchPseudoProbeInfo = [&](const yaml::bolt::PseudoProbeInfo
                                        &ProfileProbe,
                                    uint32_t NodeId) {
      for (uint64_t Index = 0; Index < 64; ++Index)
        if (ProfileProbe.BlockMask & 1ull << Index)
          ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, Index + 1)];
      for (const auto &ProfileProbes :
           {ProfileProbe.BlockProbes, ProfileProbe.IndCallProbes,
            ProfileProbe.CallProbes})
        for (uint64_t ProfileProbe : ProfileProbes)
          ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, ProfileProbe)];
    };

    for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes) {
      if (!ProfileProbe.InlineTreeNodes.empty())
        for (uint32_t ProfileInlineTreeNode : ProfileProbe.InlineTreeNodes)
          matchPseudoProbeInfo(ProfileProbe, ProfileInlineTreeNode);
      else
        matchPseudoProbeInfo(ProfileProbe, ProfileProbe.InlineTreeIndex);
    }
    uint32_t BestMatchCount = 0;
    uint32_t TotalMatchCount = 0;
    const FlowBlock *BestMatchBlock = nullptr;
    for (const auto &[FlowBlock, Count] : FlowBlockMatchCount) {
      TotalMatchCount += Count;
      if (Count < BestMatchCount || (Count == BestMatchCount && BestMatchBlock))
        continue;
      BestMatchBlock = FlowBlock;
      BestMatchCount = Count;
    }
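    // The match is reported as exact only when every matched profile probe
    // voted for the same flow block, i.e. BestMatchCount == TotalMatchCount.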
    return {BestMatchBlock, BestMatchCount == TotalMatchCount};
  }
};

void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const {
  if (size() == 0)
    return;

  assert(hasCFG() && "the function is expected to have CFG");

  std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
  std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
  // Initialize hash components.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    assert(BB->getIndex() == I && "incorrect block index");
    BlendedHashes[I].Offset = BB->getOffset();
    // Hashing complete instructions.
    std::string InstrHashStr = hashBlock(
        BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
    if (HashFunction == HashFunction::StdHash) {
      uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
    } else if (HashFunction == HashFunction::XXH3) {
      uint64_t InstrHash = llvm::xxh3_64bits(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)InstrHash;
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
    // Hashing opcodes.
    std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
    if (HashFunction == HashFunction::StdHash) {
      OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
    } else if (HashFunction == HashFunction::XXH3) {
      OpcodeHashes[I] = llvm::xxh3_64bits(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)OpcodeHashes[I];
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
  }
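
  // Note that the neighbor (Pred/Succ) hashes below are derived from the loose
  // opcode hashes of adjacent blocks, so operand-only changes in a neighbor do
  // not perturb a block's PredHash/SuccHash.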

  // Initialize neighbor hash.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    // Append hashes of successors.
    uint64_t Hash = 0;
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].SuccHash = (uint8_t)Hash;
    }

    // Append hashes of predecessors.
    Hash = 0;
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].PredHash = (uint8_t)Hash;
    }
  }

  // Assign the combined hashes to the basic blocks.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    BB->setHash(BlendedHashes[I].combine());
  }
}

// TODO: mediate the difference between flow function construction here in BOLT
// and in the compiler by splitting blocks with exception throwing calls at the
// call and adding the landing pad as the successor.
/// Create a wrapper flow function to use with the profile inference algorithm,
/// and initialize its jumps and metadata.
FlowFunction
createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
  FlowFunction Func;

  // Add a special "dummy" source so that there is always a unique entry point.
  FlowBlock EntryBlock;
  EntryBlock.Index = 0;
  Func.Blocks.push_back(EntryBlock);

  // Create FlowBlock for every basic block in the binary function.
  for (const BinaryBasicBlock *BB : BlockOrder) {
    Func.Blocks.emplace_back();
    FlowBlock &Block = Func.Blocks.back();
    Block.Index = Func.Blocks.size() - 1;
    (void)BB;
    assert(Block.Index == BB->getIndex() + 1 &&
           "incorrectly assigned basic block index");
  }

  // Add a special "dummy" sink block so there is always a unique sink.
  FlowBlock SinkBlock;
  SinkBlock.Index = Func.Blocks.size();
  Func.Blocks.push_back(SinkBlock);
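
  // At this point the index mapping is: Func.Blocks[0] is the dummy source,
  // Func.Blocks[I + 1] corresponds to BlockOrder[I], and the last block is the
  // dummy sink.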

  // Create FlowJump for each jump between basic blocks in the binary function.
  std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
  for (const BinaryBasicBlock *SrcBB : BlockOrder) {
    std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
    // Collect regular jumps
    for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
    // TODO: set jump from exit block to landing pad to Unlikely.
    // If the block is an exit, add a dummy edge from it to the sink block.
    if (UniqueSuccs.empty()) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = Func.Blocks.size() - 1;
      InDegree[Jump.Target]++;
    }

    // Collect jumps to landing pads
    for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
  }

  // Add dummy edges to the extra sources. If there are multiple entry blocks,
  // add an unlikely edge from 0 to the subsequent ones. Skips the sink block.
  assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
  for (uint64_t I = 1; I < Func.Blocks.size() - 1; I++) {
    const BinaryBasicBlock *BB = BlockOrder[I - 1];
    if (BB->isEntryPoint() || InDegree[I] == 0) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = 0;
      Jump.Target = I;
      if (!BB->isEntryPoint())
        Jump.IsUnlikely = true;
    }
  }

  // Create necessary metadata for the flow function
  for (FlowJump &Jump : Func.Jumps) {
    assert(Jump.Source < Func.Blocks.size());
    Func.Blocks[Jump.Source].SuccJumps.push_back(&Jump);
    assert(Jump.Target < Func.Blocks.size());
    Func.Blocks[Jump.Target].PredJumps.push_back(&Jump);
  }
  return Func;
}

/// Assign initial block/jump weights based on the stale profile data. The goal
/// is to extract as much information from the stale profile as possible. Here
/// we assume that each basic block is specified via a hash value computed from
/// its content and the hashes of the unchanged basic blocks stay the same
/// across different revisions of the binary. Blocks may also have pseudo probe
/// information in the profile and the binary which is used for matching.
/// Whenever there is a count in the profile with the hash corresponding to one
/// of the basic blocks in the binary, the count is "matched" to the block.
/// Similarly, if both the source and the target of a count in the profile are
/// matched to a jump in the binary, the count is recorded in CFG.
size_t matchWeights(
    BinaryContext &BC, const BinaryFunction::BasicBlockOrderType &BlockOrder,
    const yaml::bolt::BinaryFunctionProfile &YamlBF, FlowFunction &Func,
    HashFunction HashFunction, YAMLProfileReader::ProfileLookupMap &IdToYamlBF,
    const BinaryFunction &BF,
    const ArrayRef<YAMLProfileReader::ProbeMatchSpec> ProbeMatchSpecs) {

  assert(Func.Blocks.size() == BlockOrder.size() + 2);

  std::vector<uint64_t> CallHashes;
  std::vector<FlowBlock *> Blocks;
  std::vector<BlendedBlockHash> BlendedHashes;
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I];
    assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");

    std::string CallHashStr = hashBlockCalls(BC, *BB);
    if (CallHashStr.empty()) {
      CallHashes.push_back(0);
    } else {
      if (HashFunction == HashFunction::StdHash)
        CallHashes.push_back(std::hash<std::string>{}(CallHashStr));
      else if (HashFunction == HashFunction::XXH3)
        CallHashes.push_back(llvm::xxh3_64bits(CallHashStr));
      else
        llvm_unreachable("Unhandled HashFunction");
    }

    Blocks.push_back(&Func.Blocks[I + 1]);
    BlendedBlockHash BlendedHash(BB->getHash());
    BlendedHashes.push_back(BlendedHash);
    LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                      << Twine::utohexstr(BB->getHash()) << "\n");
  }
  StaleMatcher Matcher;
  // Collects function pseudo probes for use in the StaleMatcher.
  if (opts::StaleMatchingWithPseudoProbes) {
    const MCPseudoProbeDecoder *Decoder = BC.getPseudoProbeDecoder();
    assert(Decoder &&
           "If pseudo probes are in use, pseudo probe decoder should exist");
    const AddressProbesMap &ProbeMap = Decoder->getAddress2ProbesMap();
    const uint64_t FuncAddr = BF.getAddress();
    for (const MCDecodedPseudoProbe &Probe :
         ProbeMap.find(FuncAddr, FuncAddr + BF.getSize()))
      if (const BinaryBasicBlock *BB =
              BF.getBasicBlockContainingOffset(Probe.getAddress() - FuncAddr))
        Matcher.mapProbeToBB(&Probe, Blocks[BB->getIndex()]);
  }
  Matcher.init(Blocks, BlendedHashes, CallHashes);

  using FlowBlockTy =
      std::pair<const FlowBlock *, const yaml::bolt::BinaryBasicBlockProfile *>;
  using ProfileBlockMatchMap = DenseMap<uint32_t, FlowBlockTy>;
  // Binary profile => block index => matched block + its block profile
  DenseMap<const yaml::bolt::BinaryFunctionProfile *, ProfileBlockMatchMap>
      MatchedBlocks;

  // Map of FlowBlock and matching method.
  DenseMap<const FlowBlock *, StaleMatcher::MatchMethod> MatchedFlowBlocks;

  auto addMatchedBlock =
      [&](std::pair<const FlowBlock *, StaleMatcher::MatchMethod> BlockMethod,
          const yaml::bolt::BinaryFunctionProfile &YamlBP,
          const yaml::bolt::BinaryBasicBlockProfile &YamlBB) {
        const auto &[MatchedBlock, Method] = BlockMethod;
        if (!MatchedBlock)
          return;
        // Don't override earlier matches
        if (MatchedFlowBlocks.contains(MatchedBlock))
          return;
        MatchedFlowBlocks.try_emplace(MatchedBlock, Method);
        MatchedBlocks[&YamlBP][YamlBB.Index] = {MatchedBlock, &YamlBB};
      };
  // Match blocks from the profile to the blocks in CFG by strict hash.
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    // Update matching stats.
    ++BC.Stats.NumStaleBlocks;
    BC.Stats.StaleSampleCount += YamlBB.ExecCount;

    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);
    addMatchedBlock(Matcher.matchBlockStrict(YamlHash), YamlBF, YamlBB);
  }
  // Match blocks from the profile to the blocks in CFG by pseudo probes.
  for (const auto &[InlineNodeMap, YamlBP] : ProbeMatchSpecs) {
    for (const yaml::bolt::BinaryBasicBlockProfile &BB : YamlBP.get().Blocks)
      if (!BB.PseudoProbes.empty())
        addMatchedBlock(Matcher.matchBlockProbe(BB.PseudoProbes, InlineNodeMap),
                        YamlBP, BB);
  }
  // Match blocks from the profile to the blocks in CFG with loose methods.
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);

    std::string CallHashStr = hashBlockCalls(IdToYamlBF, YamlBB);
    uint64_t CallHash = 0;
    if (!CallHashStr.empty()) {
      if (HashFunction == HashFunction::StdHash)
        CallHash = std::hash<std::string>{}(CallHashStr);
      else if (HashFunction == HashFunction::XXH3)
        CallHash = llvm::xxh3_64bits(CallHashStr);
      else
        llvm_unreachable("Unhandled HashFunction");
    }
    auto [MatchedBlock, Method] = Matcher.matchBlockLoose(YamlHash, CallHash);
    if (MatchedBlock == nullptr && YamlBB.Index == 0) {
      MatchedBlock = Blocks[0];
      // Report as loose match
      Method = StaleMatcher::MATCH_OPCODE;
    }
    if (!MatchedBlock) {
      LLVM_DEBUG(dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index
                        << ")" << " with hash " << Twine::utohexstr(YamlBB.Hash)
                        << "\n");
      continue;
    }
    addMatchedBlock({MatchedBlock, Method}, YamlBF, YamlBB);
  }

  // Match jumps from the profile to the jumps from CFG
  std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
  std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);

  for (const auto &[YamlBF, MatchMap] : MatchedBlocks) {
    for (const auto &[YamlBBIdx, FlowBlockProfile] : MatchMap) {
      const auto &[MatchedBlock, YamlBB] = FlowBlockProfile;
      StaleMatcher::MatchMethod Method = MatchedFlowBlocks.lookup(MatchedBlock);
      BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1];
      LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBBIdx << ")"
                        << " with hash " << Twine::utohexstr(YamlBB->Hash)
                        << " to BB (index = " << MatchedBlock->Index - 1 << ")"
                        << " with hash " << Twine::utohexstr(BinHash.combine())
                        << "\n");

      uint64_t ExecCount = YamlBB->ExecCount;
      // Update matching stats accounting for the matched block.
      switch (Method) {
      case StaleMatcher::MATCH_EXACT:
        ++BC.Stats.NumExactMatchedBlocks;
        BC.Stats.ExactMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " exact match\n");
        break;
      case StaleMatcher::MATCH_PROBE_EXACT:
        ++BC.Stats.NumPseudoProbeExactMatchedBlocks;
        BC.Stats.PseudoProbeExactMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " exact pseudo probe match\n");
        break;
      case StaleMatcher::MATCH_PROBE_LOOSE:
        ++BC.Stats.NumPseudoProbeLooseMatchedBlocks;
        BC.Stats.PseudoProbeLooseMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " loose pseudo probe match\n");
        break;
      case StaleMatcher::MATCH_CALL:
        ++BC.Stats.NumCallMatchedBlocks;
        BC.Stats.CallMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " call match\n");
        break;
      case StaleMatcher::MATCH_OPCODE:
        ++BC.Stats.NumLooseMatchedBlocks;
        BC.Stats.LooseMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << " loose match\n");
        break;
      case StaleMatcher::NO_MATCH:
        LLVM_DEBUG(dbgs() << " no match\n");
      }
    }

    for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF->Blocks) {
      for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
        if (YamlSI.Count == 0)
          continue;

        // Try to find the jump for a given (src, dst) pair from the profile
        // and assign the jump weight based on the profile count
        const uint64_t SrcIndex = YamlBB.Index;
        const uint64_t DstIndex = YamlSI.Index;

        const FlowBlock *MatchedSrcBlock = MatchMap.lookup(SrcIndex).first;
        const FlowBlock *MatchedDstBlock = MatchMap.lookup(DstIndex).first;

        if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
          // Find a jump between the two blocks
          FlowJump *Jump = nullptr;
          for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
            if (SuccJump->Target == MatchedDstBlock->Index) {
              Jump = SuccJump;
              break;
            }
          }
          // Assign the weight, if the corresponding jump is found
          if (Jump != nullptr) {
            Jump->Weight = YamlSI.Count;
            Jump->HasUnknownWeight = false;
          }
        }
        // Assign the weight for the src block, if it is found
        if (MatchedSrcBlock != nullptr)
          OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
        // Assign the weight for the dst block, if it is found
        if (MatchedDstBlock != nullptr)
          InWeight[MatchedDstBlock->Index] += YamlSI.Count;
      }
    }
  }
  // Assign block counts based on in-/out- jumps
  for (FlowBlock &Block : Func.Blocks) {
    if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
      assert(Block.HasUnknownWeight && "unmatched block with a positive count");
      continue;
    }
    Block.HasUnknownWeight = false;
    Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
  }

  return MatchedBlocks[&YamlBF].size();
}

/// The function finds all blocks that are (i) reachable from the Entry block
/// and (ii) do not have a path to an exit, and marks all such blocks 'cold'
/// so that profi does not send any flow to such blocks.
void preprocessUnreachableBlocks(FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();

  // Start bfs from the source
  std::queue<uint64_t> Queue;
  std::vector<bool> VisitedEntry(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      VisitedEntry[I] = true;
      Queue.push(I);
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
      const uint64_t Dst = Jump->Target;
      if (!VisitedEntry[Dst]) {
        VisitedEntry[Dst] = true;
        Queue.push(Dst);
      }
    }
  }
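
  // A symmetric backward BFS follows: together the two traversals identify the
  // blocks that are both reachable from an entry and able to reach an exit;
  // everything else is treated as cold below.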
  // Start bfs from all sinks
  std::vector<bool> VisitedExit(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isExit() && VisitedEntry[I]) {
      VisitedExit[I] = true;
      Queue.push(I);
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
      const uint64_t Dst = Jump->Source;
      if (!VisitedExit[Dst]) {
        VisitedExit[Dst] = true;
        Queue.push(Dst);
      }
    }
  }

  // Make all blocks of zero weight so that flow is not sent
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.Weight == 0)
      continue;
    if (!VisitedEntry[I] || !VisitedExit[I]) {
      Block.Weight = 0;
      Block.HasUnknownWeight = true;
      Block.IsUnlikely = true;
      for (FlowJump *Jump : Block.SuccJumps) {
        if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
          Jump->Weight = 0;
          Jump->HasUnknownWeight = true;
          Jump->IsUnlikely = true;
        }
      }
    }
  }
}

/// Decide if stale profile matching can be applied for a given function.
/// Currently we skip inference for (very) large instances and for instances
/// having "unexpected" control flow (e.g., having no sink basic blocks).
bool canApplyInference(const FlowFunction &Func,
                       const yaml::bolt::BinaryFunctionProfile &YamlBF,
                       const uint64_t &MatchedBlocks) {
  if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
    return false;

  if (MatchedBlocks * 100 <
      opts::StaleMatchingMinMatchedBlock * YamlBF.Blocks.size())
    return false;

  // Returns false if the artificial sink block has no predecessors meaning
  // there are no exit blocks.
  if (Func.Blocks[Func.Blocks.size() - 1].isEntry())
    return false;

  return true;
}

/// Apply the profile inference algorithm for a given flow function.
void applyInference(FlowFunction &Func) {
  ProfiParams Params;
  // Set the params from the command-line flags.
  Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
  Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
  Params.JoinIslands = opts::StaleMatchingJoinIslands;

  Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;

  Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
  Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;
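
  // ProfiParams and applyFlowInference come from SampleProfileInference.h,
  // i.e. the same flow-based inference (profi) used for compiler-side sample
  // profile inference.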

  applyFlowInference(Params, Func);
}

/// Collect inferred counts from the flow function and update annotations in
/// the binary function.
void assignProfile(BinaryFunction &BF,
                   const BinaryFunction::BasicBlockOrderType &BlockOrder,
                   FlowFunction &Func) {
  BinaryContext &BC = BF.getBinaryContext();

  assert(Func.Blocks.size() == BlockOrder.size() + 2);
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    FlowBlock &Block = Func.Blocks[I + 1];
    BinaryBasicBlock *BB = BlockOrder[I];

    // Update block's count
    BB->setExecutionCount(Block.Flow);

    // Update jump counts: (i) clean existing counts and then (ii) set new ones
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *DstBB : BB->successors()) {
      (void)DstBB;
      BI->Count = 0;
      BI->MispredictedCount = 0;
      ++BI;
    }
    for (FlowJump *Jump : Block.SuccJumps) {
      if (Jump->IsUnlikely)
        continue;
      if (Jump->Flow == 0)
        continue;

      // Skips the artificial sink block.
      if (Jump->Target == Func.Blocks.size() - 1)
        continue;
      BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
      // Check if the edge corresponds to a regular jump or a landing pad
      if (BB->getSuccessor(SuccBB.getLabel())) {
        BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
        BI.Count += Jump->Flow;
      } else {
        BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
        if (LP && LP->getKnownExecutionCount() < Jump->Flow)
          LP->setExecutionCount(Jump->Flow);
      }
    }

    // Update call-site annotations
    auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
                                     uint64_t Count) {
      if (BC.MIB->hasAnnotation(Instr, Name))
        BC.MIB->removeAnnotation(Instr, Name);
      // Do not add zero-count annotations
      if (Count == 0)
        return;
      BC.MIB->addAnnotation(Instr, Name, Count);
    };

    for (MCInst &Instr : *BB) {
      // Ignore pseudo instructions
      if (BC.MIB->isPseudo(Instr))
        continue;
      // Ignore jump tables
      const MCInst *LastInstr = BB->getLastNonPseudoInstr();
      if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
        continue;

      if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
        auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
            Instr, "CallProfile");
        if (!ICSP.empty()) {
          // Try to evenly distribute the counts among the call sites
          const uint64_t TotalCount = Block.Flow;
          const uint64_t NumSites = ICSP.size();
          for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
            IndirectCallProfile &CSP = ICSP[Idx];
            uint64_t CountPerSite = TotalCount / NumSites;
            // When counts cannot be exactly distributed, increase by 1 the
            // counts of the first (TotalCount % NumSites) call sites
            if (Idx < TotalCount % NumSites)
              CountPerSite++;
            CSP.Count = CountPerSite;
          }
        } else {
          ICSP.emplace_back(nullptr, Block.Flow, 0);
        }
      } else if (BC.MIB->getConditionalTailCall(Instr)) {
        // We don't know exactly the number of times the conditional tail call
        // is executed; conservatively, setting it to the count of the block
        setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
        BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
      } else if (BC.MIB->isCall(Instr)) {
        setOrUpdateAnnotation(Instr, "Count", Block.Flow);
      }
    }
  }

  // Update function's execution count and mark the function inferred.
  BF.setExecutionCount(Func.Blocks[0].Flow);
  BF.setHasInferredProfile(true);
}

bool YAMLProfileReader::inferStaleProfile(
    BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF,
    const ArrayRef<ProbeMatchSpec> ProbeMatchSpecs) {

  NamedRegionTimer T("inferStaleProfile", "stale profile inference", "rewrite",
                     "Rewrite passes", opts::TimeRewrite);

  if (!BF.hasCFG())
    return false;

  LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
                    << "\"" << BF.getPrintName() << "\"\n");

  // Make sure that block hashes are up to date.
  BF.computeBlockHashes(YamlBP.Header.HashFunction);

  const BinaryFunction::BasicBlockOrderType BlockOrder(
      BF.getLayout().block_begin(), BF.getLayout().block_end());

  // Create a wrapper flow function to use with the profile inference algorithm.
  FlowFunction Func = createFlowFunction(BlockOrder);

  // Match as many block/jump counts from the stale profile as possible;
  // MatchedBlocks tracks the number of matched blocks.
  size_t MatchedBlocks =
      matchWeights(BF.getBinaryContext(), BlockOrder, YamlBF, Func,
                   YamlBP.Header.HashFunction, IdToYamLBF, BF, ProbeMatchSpecs);

  // Adjust the flow function by marking unreachable blocks Unlikely so that
  // they don't get any counts assigned.
  preprocessUnreachableBlocks(Func);

  // Check if profile inference can be applied for the instance.
  if (!canApplyInference(Func, YamlBF, MatchedBlocks))
    return false;

  // Apply the profile inference algorithm.
  applyInference(Func);

  // Collect inferred counts and update function annotations.
  assignProfile(BF, BlockOrder, Func);

  // As of now, we always mark the binary function as having a "correct"
  // profile. In the future, we may discard the results for instances with poor
  // inference metrics and keep such functions un-optimized.
  return true;
}

} // end namespace bolt
} // end namespace llvm