1 //===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // BOLT often has to deal with profiles collected on binaries built from several
10 // revisions behind release. As a result, a certain percentage of functions are
11 // considered stale and are not optimized. This file implements the ability to
12 // match profile data to functions that are not 100% binary identical, thus
13 // increasing optimization coverage and boosting application performance.
15 // The algorithm consists of two phases: matching and inference:
16 // - At the matching phase, we try to "guess" as many block and jump counts from
17 // the stale profile as possible. To this end, the content of each basic block
18 // is hashed and stored in the (yaml) profile. When BOLT optimizes a binary,
19 // it computes block hashes and identifies the corresponding entries in the
20 // stale profile. It yields a partial profile for every CFG in the binary.
21 // - At the inference phase, we employ a network flow-based algorithm (profi) to
22 // reconstruct "realistic" block and jump counts from the partial profile
23 // generated at the first stage. In practice, we don't always produce proper
24 // profile data but the majority (e.g., >90%) of CFGs get the correct counts.
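//
// At a high level, YAMLProfileReader::inferStaleProfile() at the bottom of this
// file drives the process: it builds a FlowFunction via createFlowFunction(),
// matches stale counts onto it via matchWeightsByHashes(), marks unreachable
// and dead-end blocks cold via preprocessUnreachableBlocks(), and, if
// canApplyInference() succeeds, runs applyInference() and assignProfile() to
// write the inferred counts back into the BinaryFunction.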
26 //===----------------------------------------------------------------------===//
28 #include "bolt/Core/HashUtilities.h"
29 #include "bolt/Profile/YAMLProfileReader.h"
30 #include "llvm/ADT/Bitfields.h"
31 #include "llvm/ADT/Hashing.h"
32 #include "llvm/Support/CommandLine.h"
33 #include "llvm/Support/Timer.h"
34 #include "llvm/Support/xxhash.h"
35 #include "llvm/Transforms/Utils/SampleProfileInference.h"
37 #include <queue>
39 using namespace llvm;
41 #undef DEBUG_TYPE
42 #define DEBUG_TYPE "bolt-prof"
44 namespace opts {
46 extern cl::opt<bool> TimeRewrite;
47 extern cl::OptionCategory BoltOptCategory;
49 cl::opt<bool>
50 InferStaleProfile("infer-stale-profile",
51 cl::desc("Infer counts from stale profile data."),
52 cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));
54 cl::opt<unsigned> StaleMatchingMinMatchedBlock(
55 "stale-matching-min-matched-block",
56 cl::desc("Percentage threshold of matched basic blocks at which stale "
57 "profile inference is executed."),
58 cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
60 cl::opt<unsigned> StaleMatchingMaxFuncSize(
61 "stale-matching-max-func-size",
62 cl::desc("The maximum size of a function to consider for inference."),
63 cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));
65 // Parameters of the profile inference algorithm. The default values are tuned
66 // on several benchmarks.
67 cl::opt<bool> StaleMatchingEvenFlowDistribution(
68 "stale-matching-even-flow-distribution",
69 cl::desc("Try to evenly distribute flow when there are multiple equally "
70 "likely options."),
71 cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));
73 cl::opt<bool> StaleMatchingRebalanceUnknown(
74 "stale-matching-rebalance-unknown",
75 cl::desc("Evenly re-distribute flow among unknown subgraphs."),
76 cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));
78 cl::opt<bool> StaleMatchingJoinIslands(
79 "stale-matching-join-islands",
80 cl::desc("Join isolated components having positive flow."), cl::init(true),
81 cl::ReallyHidden, cl::cat(BoltOptCategory));
83 cl::opt<unsigned> StaleMatchingCostBlockInc(
84 "stale-matching-cost-block-inc",
85 cl::desc("The cost of increasing a block count by one."), cl::init(150),
86 cl::ReallyHidden, cl::cat(BoltOptCategory));
88 cl::opt<unsigned> StaleMatchingCostBlockDec(
89 "stale-matching-cost-block-dec",
90 cl::desc("The cost of decreasing a block count by one."), cl::init(150),
91 cl::ReallyHidden, cl::cat(BoltOptCategory));
93 cl::opt<unsigned> StaleMatchingCostJumpInc(
94 "stale-matching-cost-jump-inc",
95 cl::desc("The cost of increasing a jump count by one."), cl::init(150),
96 cl::ReallyHidden, cl::cat(BoltOptCategory));
98 cl::opt<unsigned> StaleMatchingCostJumpDec(
99 "stale-matching-cost-jump-dec",
100 cl::desc("The cost of decreasing a jump count by one."), cl::init(150),
101 cl::ReallyHidden, cl::cat(BoltOptCategory));
103 cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
104 "stale-matching-cost-block-unknown-inc",
105 cl::desc("The cost of increasing an unknown block count by one."),
106 cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory));
108 cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
109 "stale-matching-cost-jump-unknown-inc",
110 cl::desc("The cost of increasing an unknown jump count by one."),
111 cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory));
113 cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
114 "stale-matching-cost-jump-unknown-ft-inc",
115 cl::desc(
116 "The cost of increasing an unknown fall-through jump count by one."),
117 cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));
119 } // namespace opts
121 namespace llvm {
122 namespace bolt {
124 /// An object wrapping several components of a basic block hash. The combined
125 /// (blended) hash is represented and stored as one uint64_t, while individual
126 /// components are of smaller size (e.g., uint16_t or uint8_t).
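/// Bit layout of the combined hash, starting from the least-significant bits:
/// Offset [0, 16), OpcodeHash [16, 32), InstrHash [32, 48), PredHash [48, 56),
/// SuccHash [56, 64).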
127 struct BlendedBlockHash {
128 private:
129 using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
130 using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
131 using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
132 using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
133 using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;
135 public:
136 explicit BlendedBlockHash() {}
138 explicit BlendedBlockHash(uint64_t Hash) {
139 Offset = Bitfield::get<ValueOffset>(Hash);
140 OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
141 InstrHash = Bitfield::get<ValueInstr>(Hash);
142 PredHash = Bitfield::get<ValuePred>(Hash);
143 SuccHash = Bitfield::get<ValueSucc>(Hash);
144 }
146 /// Combine the blended hash into uint64_t.
147 uint64_t combine() const {
148 uint64_t Hash = 0;
149 Bitfield::set<ValueOffset>(Hash, Offset);
150 Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
151 Bitfield::set<ValueInstr>(Hash, InstrHash);
152 Bitfield::set<ValuePred>(Hash, PredHash);
153 Bitfield::set<ValueSucc>(Hash, SuccHash);
154 return Hash;
155 }
157 /// Compute a distance between two given blended hashes. The smaller the
158 /// distance, the more similar two blocks are. For identical basic blocks,
159 /// the distance is zero.
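/// The comparison is lexicographic: differences in the predecessor/successor
/// hashes dominate, then a difference in InstrHash, and finally the absolute
/// difference of the block offsets.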
160 uint64_t distance(const BlendedBlockHash &BBH) const {
161 assert(OpcodeHash == BBH.OpcodeHash &&
162 "incorrect blended hash distance computation");
163 uint64_t Dist = 0;
164 // Account for the neighbor hashes (PredHash and SuccHash).
165 Dist += SuccHash == BBH.SuccHash ? 0 : 1;
166 Dist += PredHash == BBH.PredHash ? 0 : 1;
167 Dist <<= 16;
168 // Account for InstrHash
169 Dist += InstrHash == BBH.InstrHash ? 0 : 1;
170 Dist <<= 16;
171 // Account for Offset
172 Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
173 return Dist;
174 }
176 /// The offset of the basic block from the function start.
177 uint16_t Offset{0};
178 /// (Loose) Hash of the basic block instructions, excluding operands.
179 uint16_t OpcodeHash{0};
180 /// (Strong) Hash of the basic block instructions, including opcodes and
181 /// operands.
182 uint16_t InstrHash{0};
183 /// (Loose) Hashes of the predecessors of the basic block.
184 uint8_t PredHash{0};
185 /// (Loose) Hashes of the successors of the basic block.
186 uint8_t SuccHash{0};
187 };
189 /// The object is used to identify and match basic blocks in a BinaryFunction
190 /// given their hashes computed on a binary built from several revisions behind
191 /// release.
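/// Typical usage (see matchWeightsByHashes() below): call init() once with the
/// binary's flow blocks and their hashes, then call matchBlock() for each block
/// recorded in the stale profile, e.g.:
///   StaleMatcher Matcher;
///   Matcher.init(Blocks, BlendedHashes, CallHashes);
///   const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash, CallHash);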
192 class StaleMatcher {
193 public:
194 /// Initialize stale matcher.
195 void init(const std::vector<FlowBlock *> &Blocks,
196 const std::vector<BlendedBlockHash> &Hashes,
197 const std::vector<uint64_t> &CallHashes) {
198 assert(Blocks.size() == Hashes.size() &&
199 Hashes.size() == CallHashes.size() &&
200 "incorrect matcher initialization");
201 for (size_t I = 0; I < Blocks.size(); I++) {
202 FlowBlock *Block = Blocks[I];
203 uint16_t OpHash = Hashes[I].OpcodeHash;
204 OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
205 if (CallHashes[I])
206 CallHashToBlocks[CallHashes[I]].push_back(
207 std::make_pair(Hashes[I], Block));
208 }
209 }
211 /// Find the most similar block for a given hash.
212 const FlowBlock *matchBlock(BlendedBlockHash BlendedHash,
213 uint64_t CallHash) const {
214 const FlowBlock *BestBlock = matchWithOpcodes(BlendedHash);
215 return BestBlock ? BestBlock : matchWithCalls(BlendedHash, CallHash);
216 }
218 /// Returns true if the two basic blocks (in the binary and in the profile)
219 /// corresponding to the given hashes are matched to each other with a high
220 /// confidence.
221 static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
222 BlendedBlockHash Hash2) {
223 return Hash1.InstrHash == Hash2.InstrHash;
224 }
226 private:
227 using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
228 std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
229 std::unordered_map<uint64_t, std::vector<HashBlockPairType>> CallHashToBlocks;
231 // Uses OpcodeHash to find the most similar block for a given hash.
232 const FlowBlock *matchWithOpcodes(BlendedBlockHash BlendedHash) const {
233 auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
234 if (BlockIt == OpHashToBlocks.end())
235 return nullptr;
236 FlowBlock *BestBlock = nullptr;
237 uint64_t BestDist = std::numeric_limits<uint64_t>::max();
238 for (const auto &[Hash, Block] : BlockIt->second) {
239 uint64_t Dist = Hash.distance(BlendedHash);
240 if (BestBlock == nullptr || Dist < BestDist) {
241 BestDist = Dist;
242 BestBlock = Block;
243 }
244 }
245 return BestBlock;
246 }
248 // Uses CallHash to find the most similar block for a given hash.
249 const FlowBlock *matchWithCalls(BlendedBlockHash BlendedHash,
250 uint64_t CallHash) const {
251 if (!CallHash)
252 return nullptr;
253 auto BlockIt = CallHashToBlocks.find(CallHash);
254 if (BlockIt == CallHashToBlocks.end())
255 return nullptr;
256 FlowBlock *BestBlock = nullptr;
257 uint64_t BestDist = std::numeric_limits<uint64_t>::max();
258 for (const auto &[Hash, Block] : BlockIt->second) {
259 uint64_t Dist = Hash.OpcodeHash > BlendedHash.OpcodeHash
260 ? Hash.OpcodeHash - BlendedHash.OpcodeHash
261 : BlendedHash.OpcodeHash - Hash.OpcodeHash;
262 if (BestBlock == nullptr || Dist < BestDist) {
263 BestDist = Dist;
264 BestBlock = Block;
265 }
266 }
267 return BestBlock;
268 }
269 };
271 void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const {
272 if (size() == 0)
273 return;
275 assert(hasCFG() && "the function is expected to have CFG");
277 std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
278 std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
279 // Initialize hash components.
280 for (size_t I = 0; I < BasicBlocks.size(); I++) {
281 const BinaryBasicBlock *BB = BasicBlocks[I];
282 assert(BB->getIndex() == I && "incorrect block index");
283 BlendedHashes[I].Offset = BB->getOffset();
284 // Hashing complete instructions.
285 std::string InstrHashStr = hashBlock(
286 BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
287 if (HashFunction == HashFunction::StdHash) {
288 uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
289 BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
290 } else if (HashFunction == HashFunction::XXH3) {
291 uint64_t InstrHash = llvm::xxh3_64bits(InstrHashStr);
292 BlendedHashes[I].InstrHash = (uint16_t)InstrHash;
293 } else {
294 llvm_unreachable("Unhandled HashFunction");
295 }
296 // Hashing opcodes.
297 std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
298 if (HashFunction == HashFunction::StdHash) {
299 OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
300 BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
301 } else if (HashFunction == HashFunction::XXH3) {
302 OpcodeHashes[I] = llvm::xxh3_64bits(OpcodeHashStr);
303 BlendedHashes[I].OpcodeHash = (uint16_t)OpcodeHashes[I];
304 } else {
305 llvm_unreachable("Unhandled HashFunction");
306 }
307 }
309 // Initialize neighbor hash.
310 for (size_t I = 0; I < BasicBlocks.size(); I++) {
311 const BinaryBasicBlock *BB = BasicBlocks[I];
312 // Append hashes of successors.
313 uint64_t Hash = 0;
314 for (BinaryBasicBlock *SuccBB : BB->successors()) {
315 uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
316 Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
317 }
318 if (HashFunction == HashFunction::StdHash) {
319 // Compatibility with old behavior.
320 BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);
321 } else {
322 BlendedHashes[I].SuccHash = (uint8_t)Hash;
323 }
325 // Append hashes of predecessors.
326 Hash = 0;
327 for (BinaryBasicBlock *PredBB : BB->predecessors()) {
328 uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
329 Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
330 }
331 if (HashFunction == HashFunction::StdHash) {
332 // Compatibility with old behavior.
333 BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
334 } else {
335 BlendedHashes[I].PredHash = (uint8_t)Hash;
336 }
337 }
339 // Assign hashes.
340 for (size_t I = 0; I < BasicBlocks.size(); I++) {
341 const BinaryBasicBlock *BB = BasicBlocks[I];
342 BB->setHash(BlendedHashes[I].combine());
343 }
344 }
345 // TODO: reconcile the difference between flow function construction here in BOLT
346 // and in the compiler by splitting blocks with exception throwing calls at the
347 // call and adding the landing pad as the successor.
348 /// Create a wrapper flow function to use with the profile inference algorithm,
349 /// and initialize its jumps and metadata.
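/// In the resulting FlowFunction, block 0 is an artificial source, block I + 1
/// corresponds to BlockOrder[I], and the last block is an artificial sink.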
350 FlowFunction
351 createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
352 FlowFunction Func;
354 // Add a special "dummy" source so that there is always a unique entry point.
355 FlowBlock EntryBlock;
356 EntryBlock.Index = 0;
357 Func.Blocks.push_back(EntryBlock);
359 // Create FlowBlock for every basic block in the binary function.
360 for (const BinaryBasicBlock *BB : BlockOrder) {
361 Func.Blocks.emplace_back();
362 FlowBlock &Block = Func.Blocks.back();
363 Block.Index = Func.Blocks.size() - 1;
364 (void)BB;
365 assert(Block.Index == BB->getIndex() + 1 &&
366 "incorrectly assigned basic block index");
367 }
369 // Add a special "dummy" sink block so there is always a unique sink.
370 FlowBlock SinkBlock;
371 SinkBlock.Index = Func.Blocks.size();
372 Func.Blocks.push_back(SinkBlock);
374 // Create FlowJump for each jump between basic blocks in the binary function.
375 std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
376 for (const BinaryBasicBlock *SrcBB : BlockOrder) {
377 std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
378 // Collect regular jumps
379 for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
380 // Ignoring parallel edges
381 if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
382 continue;
384 Func.Jumps.emplace_back();
385 FlowJump &Jump = Func.Jumps.back();
386 Jump.Source = SrcBB->getIndex() + 1;
387 Jump.Target = DstBB->getIndex() + 1;
388 InDegree[Jump.Target]++;
389 UniqueSuccs.insert(DstBB);
390 }
391 // TODO: set jump from exit block to landing pad to Unlikely.
392 // If the block is an exit, add a dummy edge from it to the sink block.
393 if (UniqueSuccs.empty()) {
394 Func.Jumps.emplace_back();
395 FlowJump &Jump = Func.Jumps.back();
396 Jump.Source = SrcBB->getIndex() + 1;
397 Jump.Target = Func.Blocks.size() - 1;
398 InDegree[Jump.Target]++;
399 }
401 // Collect jumps to landing pads
402 for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
403 // Ignoring parallel edges
404 if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
405 continue;
407 Func.Jumps.emplace_back();
408 FlowJump &Jump = Func.Jumps.back();
409 Jump.Source = SrcBB->getIndex() + 1;
410 Jump.Target = DstBB->getIndex() + 1;
411 InDegree[Jump.Target]++;
412 UniqueSuccs.insert(DstBB);
413 }
414 }
416 // Add dummy edges to the extra sources. If there are multiple entry blocks,
417 // add an unlikely edge from 0 to the subsequent ones. Skips the sink block.
418 assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
419 for (uint64_t I = 1; I < Func.Blocks.size() - 1; I++) {
420 const BinaryBasicBlock *BB = BlockOrder[I - 1];
421 if (BB->isEntryPoint() || InDegree[I] == 0) {
422 Func.Jumps.emplace_back();
423 FlowJump &Jump = Func.Jumps.back();
424 Jump.Source = 0;
425 Jump.Target = I;
426 if (!BB->isEntryPoint())
427 Jump.IsUnlikely = true;
428 }
429 }
431 // Create necessary metadata for the flow function
432 for (FlowJump &Jump : Func.Jumps) {
433 assert(Jump.Source < Func.Blocks.size());
434 Func.Blocks[Jump.Source].SuccJumps.push_back(&Jump);
435 assert(Jump.Target < Func.Blocks.size());
436 Func.Blocks[Jump.Target].PredJumps.push_back(&Jump);
437 }
438 return Func;
439 }
441 /// Assign initial block/jump weights based on the stale profile data. The goal
442 /// is to extract as much information from the stale profile as possible. Here
443 /// we assume that each basic block is specified via a hash value computed from
444 /// its content, and that the hashes of unchanged basic blocks stay the same
445 /// across different revisions of the binary.
446 /// Whenever there is a count in the profile with the hash corresponding to one
447 /// of the basic blocks in the binary, the count is "matched" to the block.
448 /// Similarly, if both the source and target blocks of a profile jump are
449 /// matched to blocks in the binary, the jump count is recorded in the CFG.
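/// Returns the number of profile blocks that were matched; a matched block's
/// weight is set to the maximum of its matched incoming and outgoing counts.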
450 size_t
451 matchWeightsByHashes(BinaryContext &BC,
452 const BinaryFunction::BasicBlockOrderType &BlockOrder,
453 const yaml::bolt::BinaryFunctionProfile &YamlBF,
454 FlowFunction &Func, HashFunction HashFunction,
455 YAMLProfileReader::ProfileLookupMap &IdToYamlBF) {
457 assert(Func.Blocks.size() == BlockOrder.size() + 2);
459 std::vector<uint64_t> CallHashes;
460 std::vector<FlowBlock *> Blocks;
461 std::vector<BlendedBlockHash> BlendedHashes;
462 for (uint64_t I = 0; I < BlockOrder.size(); I++) {
463 const BinaryBasicBlock *BB = BlockOrder[I];
464 assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");
466 std::string CallHashStr = hashBlockCalls(BC, *BB);
467 if (CallHashStr.empty()) {
468 CallHashes.push_back(0);
469 } else {
470 if (HashFunction == HashFunction::StdHash)
471 CallHashes.push_back(std::hash<std::string>{}(CallHashStr));
472 else if (HashFunction == HashFunction::XXH3)
473 CallHashes.push_back(llvm::xxh3_64bits(CallHashStr));
474 else
475 llvm_unreachable("Unhandled HashFunction");
476 }
478 Blocks.push_back(&Func.Blocks[I + 1]);
479 BlendedBlockHash BlendedHash(BB->getHash());
480 BlendedHashes.push_back(BlendedHash);
481 LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
482 << Twine::utohexstr(BB->getHash()) << "\n");
483 }
484 StaleMatcher Matcher;
485 Matcher.init(Blocks, BlendedHashes, CallHashes);
487 // Index in yaml profile => corresponding (matched) block
488 DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;
489 // Match blocks from the profile to the blocks in CFG
490 for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
491 assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
492 BlendedBlockHash YamlHash(YamlBB.Hash);
494 const FlowBlock *MatchedBlock = nullptr;
495 std::string CallHashStr = hashBlockCalls(IdToYamlBF, YamlBB);
496 uint64_t CallHash = 0;
497 if (!CallHashStr.empty()) {
498 if (HashFunction == HashFunction::StdHash)
499 CallHash = std::hash<std::string>{}(CallHashStr);
500 else if (HashFunction == HashFunction::XXH3)
501 CallHash = llvm::xxh3_64bits(CallHashStr);
502 else
503 llvm_unreachable("Unhandled HashFunction");
504 }
505 MatchedBlock = Matcher.matchBlock(YamlHash, CallHash);
506 if (MatchedBlock == nullptr && YamlBB.Index == 0)
507 MatchedBlock = Blocks[0];
508 if (MatchedBlock != nullptr) {
509 const BinaryBasicBlock *BB = BlockOrder[MatchedBlock->Index - 1];
510 MatchedBlocks[YamlBB.Index] = MatchedBlock;
511 BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1];
512 LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBB.Index << ")"
513 << " with hash " << Twine::utohexstr(YamlBB.Hash)
514 << " to BB (index = " << MatchedBlock->Index - 1 << ")"
515 << " with hash " << Twine::utohexstr(BinHash.combine())
516 << "\n");
517 // Update matching stats accounting for the matched block.
518 if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) {
519 ++BC.Stats.NumMatchedBlocks;
520 BC.Stats.MatchedSampleCount += YamlBB.ExecCount;
521 LLVM_DEBUG(dbgs() << " exact match\n");
522 } else {
523 LLVM_DEBUG(dbgs() << " loose match\n");
524 }
525 if (YamlBB.NumInstructions == BB->size())
526 ++BC.Stats.NumStaleBlocksWithEqualIcount;
527 } else {
528 LLVM_DEBUG(
529 dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index << ")"
530 << " with hash " << Twine::utohexstr(YamlBB.Hash) << "\n");
531 }
533 // Update matching stats.
534 ++BC.Stats.NumStaleBlocks;
535 BC.Stats.StaleSampleCount += YamlBB.ExecCount;
536 }
538 // Match jumps from the profile to the jumps from CFG
539 std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
540 std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);
541 for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
542 for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
543 if (YamlSI.Count == 0)
544 continue;
546 // Try to find the jump for a given (src, dst) pair from the profile and
547 // assign the jump weight based on the profile count
548 const uint64_t SrcIndex = YamlBB.Index;
549 const uint64_t DstIndex = YamlSI.Index;
551 const FlowBlock *MatchedSrcBlock = MatchedBlocks.lookup(SrcIndex);
552 const FlowBlock *MatchedDstBlock = MatchedBlocks.lookup(DstIndex);
554 if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
555 // Find a jump between the two blocks
556 FlowJump *Jump = nullptr;
557 for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
558 if (SuccJump->Target == MatchedDstBlock->Index) {
559 Jump = SuccJump;
560 break;
561 }
562 }
563 // Assign the weight, if the corresponding jump is found
564 if (Jump != nullptr) {
565 Jump->Weight = YamlSI.Count;
566 Jump->HasUnknownWeight = false;
567 }
568 }
569 // Assign the weight for the src block, if it is found
570 if (MatchedSrcBlock != nullptr)
571 OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
572 // Assign the weight for the dst block, if it is found
573 if (MatchedDstBlock != nullptr)
574 InWeight[MatchedDstBlock->Index] += YamlSI.Count;
575 }
576 }
578 // Assign block counts based on in-/out- jumps
579 for (FlowBlock &Block : Func.Blocks) {
580 if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
581 assert(Block.HasUnknownWeight && "unmatched block with a positive count");
582 continue;
583 }
584 Block.HasUnknownWeight = false;
585 Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
586 }
588 return MatchedBlocks.size();
589 }
591 /// The function finds all blocks that are either unreachable from the entry
592 /// block or lack a path to an exit, and marks all such blocks 'cold' so that
593 /// profi does not send any flow to them.
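/// This is done with two BFS traversals: a forward pass from the entry block
/// and a backward pass from the exit blocks; blocks missed by either pass have
/// their weights reset to zero and are marked unlikely.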
594 void preprocessUnreachableBlocks(FlowFunction &Func) {
595 const uint64_t NumBlocks = Func.Blocks.size();
597 // Start bfs from the source
598 std::queue<uint64_t> Queue;
599 std::vector<bool> VisitedEntry(NumBlocks, false);
600 for (uint64_t I = 0; I < NumBlocks; I++) {
601 FlowBlock &Block = Func.Blocks[I];
602 if (Block.isEntry()) {
603 Queue.push(I);
604 VisitedEntry[I] = true;
605 break;
606 }
607 }
608 while (!Queue.empty()) {
609 const uint64_t Src = Queue.front();
610 Queue.pop();
611 for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
612 const uint64_t Dst = Jump->Target;
613 if (!VisitedEntry[Dst]) {
614 Queue.push(Dst);
615 VisitedEntry[Dst] = true;
616 }
617 }
618 }
620 // Start bfs from all sinks
621 std::vector<bool> VisitedExit(NumBlocks, false);
622 for (uint64_t I = 0; I < NumBlocks; I++) {
623 FlowBlock &Block = Func.Blocks[I];
624 if (Block.isExit() && VisitedEntry[I]) {
625 Queue.push(I);
626 VisitedExit[I] = true;
627 }
628 }
629 while (!Queue.empty()) {
630 const uint64_t Src = Queue.front();
631 Queue.pop();
632 for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
633 const uint64_t Dst = Jump->Source;
634 if (!VisitedExit[Dst]) {
635 Queue.push(Dst);
636 VisitedExit[Dst] = true;
637 }
638 }
639 }
641 // Make all blocks of zero weight so that flow is not sent
642 for (uint64_t I = 0; I < NumBlocks; I++) {
643 FlowBlock &Block = Func.Blocks[I];
644 if (Block.Weight == 0)
645 continue;
646 if (!VisitedEntry[I] || !VisitedExit[I]) {
647 Block.Weight = 0;
648 Block.HasUnknownWeight = true;
649 Block.IsUnlikely = true;
650 for (FlowJump *Jump : Block.SuccJumps) {
651 if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
652 Jump->Weight = 0;
653 Jump->HasUnknownWeight = true;
654 Jump->IsUnlikely = true;
655 }
656 }
657 }
658 }
659 }
661 /// Decide if stale profile matching can be applied for a given function.
662 /// Currently we skip inference for (very) large instances and for instances
663 /// having "unexpected" control flow (e.g., having no sink basic blocks).
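/// For example, with -stale-matching-min-matched-block=<N>, inference is
/// applied only if at least N% of the blocks in the profile were matched.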
664 bool canApplyInference(const FlowFunction &Func,
665 const yaml::bolt::BinaryFunctionProfile &YamlBF,
666 const uint64_t &MatchedBlocks) {
667 if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
668 return false;
670 if (MatchedBlocks * 100 <
671 opts::StaleMatchingMinMatchedBlock * YamlBF.Blocks.size())
672 return false;
674 // Return false if the artificial sink block has no predecessors, meaning
675 // there are no exit blocks.
676 if (Func.Blocks[Func.Blocks.size() - 1].isEntry())
677 return false;
679 return true;
680 }
682 /// Apply the profile inference algorithm for a given flow function.
683 void applyInference(FlowFunction &Func) {
684 ProfiParams Params;
685 // Set the params from the command-line flags.
686 Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
687 Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
688 Params.JoinIslands = opts::StaleMatchingJoinIslands;
690 Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
691 Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
692 Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
693 Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
694 Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;
696 Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
697 Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
698 Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
699 Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
700 Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
701 Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;
703 applyFlowInference(Params, Func);
704 }
706 /// Collect inferred counts from the flow function and update annotations in
707 /// the binary function.
708 void assignProfile(BinaryFunction &BF,
709 const BinaryFunction::BasicBlockOrderType &BlockOrder,
710 FlowFunction &Func) {
711 BinaryContext &BC = BF.getBinaryContext();
713 assert(Func.Blocks.size() == BlockOrder.size() + 2);
714 for (uint64_t I = 0; I < BlockOrder.size(); I++) {
715 FlowBlock &Block = Func.Blocks[I + 1];
716 BinaryBasicBlock *BB = BlockOrder[I];
718 // Update block's count
719 BB->setExecutionCount(Block.Flow);
721 // Update jump counts: (i) clean existing counts and then (ii) set new ones
722 auto BI = BB->branch_info_begin();
723 for (const BinaryBasicBlock *DstBB : BB->successors()) {
724 (void)DstBB;
725 BI->Count = 0;
726 BI->MispredictedCount = 0;
727 ++BI;
728 }
729 for (FlowJump *Jump : Block.SuccJumps) {
730 if (Jump->IsUnlikely)
731 continue;
732 if (Jump->Flow == 0)
733 continue;
735 // Skips the artificial sink block.
736 if (Jump->Target == Func.Blocks.size() - 1)
737 continue;
738 BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
739 // Check if the edge corresponds to a regular jump or a landing pad
740 if (BB->getSuccessor(SuccBB.getLabel())) {
741 BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
742 BI.Count += Jump->Flow;
743 } else {
744 BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
745 if (LP && LP->getKnownExecutionCount() < Jump->Flow)
746 LP->setExecutionCount(Jump->Flow);
747 }
748 }
750 // Update call-site annotations
751 auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
752 uint64_t Count) {
753 if (BC.MIB->hasAnnotation(Instr, Name))
754 BC.MIB->removeAnnotation(Instr, Name);
755 // Do not add zero-count annotations
756 if (Count == 0)
757 return;
758 BC.MIB->addAnnotation(Instr, Name, Count);
759 };
761 for (MCInst &Instr : *BB) {
762 // Ignore pseudo instructions
763 if (BC.MIB->isPseudo(Instr))
764 continue;
765 // Ignore jump tables
766 const MCInst *LastInstr = BB->getLastNonPseudoInstr();
767 if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
768 continue;
770 if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
771 auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
772 Instr, "CallProfile");
773 if (!ICSP.empty()) {
774 // Try to evenly distribute the counts among the call sites
775 const uint64_t TotalCount = Block.Flow;
776 const uint64_t NumSites = ICSP.size();
777 for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
778 IndirectCallProfile &CSP = ICSP[Idx];
779 uint64_t CountPerSite = TotalCount / NumSites;
780 // When counts cannot be exactly distributed, increase by 1 the
781 // counts of the first (TotalCount % NumSites) call sites
782 if (Idx < TotalCount % NumSites)
783 CountPerSite++;
784 CSP.Count = CountPerSite;
785 }
786 } else {
787 ICSP.emplace_back(nullptr, Block.Flow, 0);
788 }
789 } else if (BC.MIB->getConditionalTailCall(Instr)) {
790 // We don't know exactly how many times the conditional tail call is
791 // executed; conservatively, set it to the execution count of the block.
792 setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
793 BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
794 } else if (BC.MIB->isCall(Instr)) {
795 setOrUpdateAnnotation(Instr, "Count", Block.Flow);
796 }
797 }
798 }
800 // Update function's execution count and mark the function inferred.
801 BF.setExecutionCount(Func.Blocks[0].Flow);
802 BF.setHasInferredProfile(true);
803 }
805 bool YAMLProfileReader::inferStaleProfile(
806 BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {
808 NamedRegionTimer T("inferStaleProfile", "stale profile inference", "rewrite",
809 "Rewrite passes", opts::TimeRewrite);
811 if (!BF.hasCFG())
812 return false;
814 LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
815 << "\"" << BF.getPrintName() << "\"\n");
817 // Make sure that block hashes are up to date.
818 BF.computeBlockHashes(YamlBP.Header.HashFunction);
820 const BinaryFunction::BasicBlockOrderType BlockOrder(
821 BF.getLayout().block_begin(), BF.getLayout().block_end());
823 // Tracks the number of matched blocks.
825 // Create a wrapper flow function to use with the profile inference algorithm.
826 FlowFunction Func = createFlowFunction(BlockOrder);
828 // Match as many block/jump counts from the stale profile as possible
829 size_t MatchedBlocks =
830 matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func,
831 YamlBP.Header.HashFunction, IdToYamLBF);
833 // Adjust the flow function by marking unreachable blocks Unlikely so that
834 // they don't get any counts assigned.
835 preprocessUnreachableBlocks(Func);
837 // Check if profile inference can be applied for the instance.
838 if (!canApplyInference(Func, YamlBF, MatchedBlocks))
839 return false;
841 // Apply the profile inference algorithm.
842 applyInference(Func);
844 // Collect inferred counts and update function annotations.
845 assignProfile(BF, BlockOrder, Func);
847 // As of now, we always mark the binary function as having a "correct" profile.
848 // In the future, we may discard the results for instances with poor inference
849 // metrics and keep such functions un-optimized.
850 return true;
851 }
853 } // end namespace bolt
854 } // end namespace llvm