//===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// BOLT often has to deal with profiles collected on binaries built from several
// revisions behind release. As a result, a certain percentage of functions is
// considered stale and not optimized. This file implements the ability to match
// a profile to functions that are not 100% binary identical, thus increasing
// the optimization coverage and boosting the performance of applications.
//
// The algorithm consists of two phases: matching and inference:
// - At the matching phase, we try to "guess" as many block and jump counts from
//   the stale profile as possible. To this end, the content of each basic block
//   is hashed and stored in the (yaml) profile. When BOLT optimizes a binary,
//   it computes block hashes and identifies the corresponding entries in the
//   stale profile. It yields a partial profile for every CFG in the binary.
// - At the inference phase, we employ a network flow-based algorithm (profi) to
//   reconstruct "realistic" block and jump counts from the partial profile
//   generated at the first stage. In practice, we don't always produce proper
//   profile data but the majority (e.g., >90%) of CFGs get the correct counts.
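//
// For illustration, a per-block entry in the YAML profile roughly looks like
// the following (field names follow BOLT's YAML profile writer; the values
// here are made up):
//
//   - bid: 3
//     hash: 0xACF01D482B6A5C0A
//     exec: 120
//     succ: [ { bid: 4, cnt: 80 }, { bid: 5, cnt: 40 } ]
//
// The "hash" field carries the blended block hash defined below; matching is
// performed by comparing it against the hashes recomputed from the binary.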
//
//===----------------------------------------------------------------------===//

#include "bolt/Core/HashUtilities.h"
#include "bolt/Profile/YAMLProfileReader.h"
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/xxhash.h"
#include "llvm/Transforms/Utils/SampleProfileInference.h"

#include <queue>
#include <unordered_map>
#include <unordered_set>

using namespace llvm;

#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt-prof"
namespace opts {

extern cl::OptionCategory BoltOptCategory;

cl::opt<bool>
    InferStaleProfile("infer-stale-profile",
                      cl::desc("Infer counts from stale profile data."),
                      cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingMaxFuncSize(
    "stale-matching-max-func-size",
    cl::desc("The maximum size of a function to consider for inference."),
    cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));

// Parameters of the profile inference algorithm. The default values are tuned
// on several benchmarks.
cl::opt<bool> StaleMatchingEvenFlowDistribution(
    "stale-matching-even-flow-distribution",
    cl::desc("Try to evenly distribute flow when there are multiple equally "
             "likely options."),
    cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingRebalanceUnknown(
    "stale-matching-rebalance-unknown",
    cl::desc("Evenly re-distribute flow among unknown subgraphs."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingJoinIslands(
    "stale-matching-join-islands",
    cl::desc("Join isolated components having positive flow."), cl::init(true),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockInc(
    "stale-matching-cost-block-inc",
    cl::desc("The cost of increasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockDec(
    "stale-matching-cost-block-dec",
    cl::desc("The cost of decreasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpInc(
    "stale-matching-cost-jump-inc",
    cl::desc("The cost of increasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpDec(
    "stale-matching-cost-jump-dec",
    cl::desc("The cost of decreasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
    "stale-matching-cost-block-unknown-inc",
    cl::desc("The cost of increasing an unknown block count by one."),
    cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
    "stale-matching-cost-jump-unknown-inc",
    cl::desc("The cost of increasing an unknown jump count by one."),
    cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
    "stale-matching-cost-jump-unknown-ft-inc",
    cl::desc(
        "The cost of increasing an unknown fall-through jump count by one."),
    cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));

} // namespace opts

namespace llvm {
namespace bolt {

/// An object wrapping several components of a basic block hash. The combined
/// (blended) hash is represented and stored as one uint64_t, while individual
/// components are of smaller size (e.g., uint16_t or uint8_t).
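/// Layout of the combined value, from the least-significant bit upwards:
/// Offset (16 bits), OpcodeHash (16), InstrHash (16), PredHash (8),
/// SuccHash (8).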
struct BlendedBlockHash {
private:
  using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
  using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
  using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
  using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
  using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;

public:
  explicit BlendedBlockHash() {}

  explicit BlendedBlockHash(uint64_t Hash) {
    Offset = Bitfield::get<ValueOffset>(Hash);
    OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
    InstrHash = Bitfield::get<ValueInstr>(Hash);
    PredHash = Bitfield::get<ValuePred>(Hash);
    SuccHash = Bitfield::get<ValueSucc>(Hash);
  }

  /// Combine the blended hash into uint64_t.
  uint64_t combine() const {
    uint64_t Hash = 0;
    Bitfield::set<ValueOffset>(Hash, Offset);
    Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
    Bitfield::set<ValueInstr>(Hash, InstrHash);
    Bitfield::set<ValuePred>(Hash, PredHash);
    Bitfield::set<ValueSucc>(Hash, SuccHash);
    return Hash;
  }

  /// Compute a distance between two given blended hashes. The smaller the
  /// distance, the more similar two blocks are. For identical basic blocks,
  /// the distance is zero.
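  /// The comparison is effectively lexicographic: mismatches in the
  /// predecessor/successor hashes dominate, then a mismatch in InstrHash,
  /// and finally the absolute difference between the block offsets.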
  uint64_t distance(const BlendedBlockHash &BBH) const {
    assert(OpcodeHash == BBH.OpcodeHash &&
           "incorrect blended hash distance computation");
    uint64_t Dist = 0;
    // Account for NeighborHash
    Dist += SuccHash == BBH.SuccHash ? 0 : 1;
    Dist += PredHash == BBH.PredHash ? 0 : 1;
    Dist <<= 16;
    // Account for InstrHash
    Dist += InstrHash == BBH.InstrHash ? 0 : 1;
    Dist <<= 16;
    // Account for Offset
    Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
    return Dist;
  }

  /// The offset of the basic block from the function start.
  uint16_t Offset{0};
  /// (Loose) Hash of the basic block instructions, excluding operands.
  uint16_t OpcodeHash{0};
  /// (Strong) Hash of the basic block instructions, including opcodes and
  /// operands.
  uint16_t InstrHash{0};
  /// (Loose) Hashes of the predecessors of the basic block.
  uint8_t PredHash{0};
  /// (Loose) Hashes of the successors of the basic block.
  uint8_t SuccHash{0};
};

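// Note that the components occupy 16 + 16 + 16 + 8 + 8 = 64 bits, so combine()
// and the uint64_t constructor above are exact inverses of each other: no
// information is lost when a blended hash is stored as a single value.
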
/// The object is used to identify and match basic blocks in a BinaryFunction
/// given their hashes computed on a binary built from several revisions behind
/// release.
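///
/// A typical use, mirroring matchWeightsByHashes() below:
/// \code
///   StaleMatcher Matcher;
///   Matcher.init(Blocks, BlendedHashes);
///   const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash);
/// \endcode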
class StaleMatcher {
public:
  /// Initialize stale matcher.
  void init(const std::vector<FlowBlock *> &Blocks,
            const std::vector<BlendedBlockHash> &Hashes) {
    assert(Blocks.size() == Hashes.size() &&
           "incorrect matcher initialization");
    for (size_t I = 0; I < Blocks.size(); I++) {
      FlowBlock *Block = Blocks[I];
      uint16_t OpHash = Hashes[I].OpcodeHash;
      OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
    }
  }

  /// Find the most similar block for a given hash.
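  /// Only blocks whose OpcodeHash equals that of the query are considered as
  /// candidates; among them, the one with the smallest distance() is returned,
  /// or nullptr if there are no candidates at all.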
  const FlowBlock *matchBlock(BlendedBlockHash BlendedHash) const {
    auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
    if (BlockIt == OpHashToBlocks.end())
      return nullptr;
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.distance(BlendedHash);
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
      }
    }
    return BestBlock;
  }

  /// Returns true if the two basic blocks (in the binary and in the profile)
  /// corresponding to the given hashes are matched to each other with a high
  /// confidence.
  static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
                                    BlendedBlockHash Hash2) {
    return Hash1.InstrHash == Hash2.InstrHash;
  }

private:
  using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
  std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
};

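/// Compute a blended hash for every basic block of the function and attach it
/// to the block via setHash(). The hash combines the block's offset, a loose
/// opcode hash, a strong instruction hash, and loose hashes of the block's
/// predecessors and successors.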
void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const {
  if (size() == 0)
    return;

  assert(hasCFG() && "the function is expected to have CFG");

  std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
  std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
  // Initialize hash components.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    assert(BB->getIndex() == I && "incorrect block index");
    BlendedHashes[I].Offset = BB->getOffset();
    // Hashing complete instructions.
    std::string InstrHashStr = hashBlock(
        BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
    if (HashFunction == HashFunction::StdHash) {
      uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
    } else if (HashFunction == HashFunction::XXH3) {
      uint64_t InstrHash = llvm::xxh3_64bits(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)InstrHash;
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
    // Hashing opcodes.
    std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
    if (HashFunction == HashFunction::StdHash) {
      OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
    } else if (HashFunction == HashFunction::XXH3) {
      OpcodeHashes[I] = llvm::xxh3_64bits(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)OpcodeHashes[I];
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
  }

  // Initialize neighbor hash.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    // Append hashes of successors.
    uint64_t Hash = 0;
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].SuccHash = (uint8_t)Hash;
    }

    // Append hashes of predecessors.
    Hash = 0;
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].PredHash = (uint8_t)Hash;
    }
  }

  // Assign hashes.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    BB->setHash(BlendedHashes[I].combine());
  }
}

/// Create a wrapper flow function to use with the profile inference algorithm,
/// and initialize its jumps and metadata.
FlowFunction
createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
  FlowFunction Func;

  // Add a special "dummy" source so that there is always a unique entry point.
  // Because of the extra source, for all other blocks in FlowFunction it holds
  // that Block.Index == BB->getIndex() + 1
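  // For example, a function with binary basic blocks B0, B1, B2 produces a
  // FlowFunction with blocks 0 (the dummy source), 1, 2, and 3.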
  FlowBlock EntryBlock;
  EntryBlock.Index = 0;
  Func.Blocks.push_back(EntryBlock);

  // Create FlowBlock for every basic block in the binary function
  for (const BinaryBasicBlock *BB : BlockOrder) {
    Func.Blocks.emplace_back();
    FlowBlock &Block = Func.Blocks.back();
    Block.Index = Func.Blocks.size() - 1;
    (void)BB;
    assert(Block.Index == BB->getIndex() + 1 &&
           "incorrectly assigned basic block index");
  }

  // Create FlowJump for each jump between basic blocks in the binary function
  std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
  for (const BinaryBasicBlock *SrcBB : BlockOrder) {
    std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
    // Collect regular jumps
    for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
    // Collect jumps to landing pads
    for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
  }

  // Add dummy edges to the extra sources. If there are multiple entry blocks,
  // add an unlikely edge from 0 to the subsequent ones
  assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
  for (uint64_t I = 1; I < Func.Blocks.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I - 1];
    if (BB->isEntryPoint() || InDegree[I] == 0) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = 0;
      Jump.Target = I;
      if (!BB->isEntryPoint())
        Jump.IsUnlikely = true;
    }
  }

  // Create necessary metadata for the flow function
  for (FlowJump &Jump : Func.Jumps) {
    Func.Blocks.at(Jump.Source).SuccJumps.push_back(&Jump);
    Func.Blocks.at(Jump.Target).PredJumps.push_back(&Jump);
  }
  return Func;
}

/// Assign initial block/jump weights based on the stale profile data. The goal
/// is to extract as much information from the stale profile as possible. Here
/// we assume that each basic block is specified via a hash value computed from
/// its content, and the hashes of unchanged basic blocks stay the same across
/// different revisions of the binary.
/// Whenever there is a count in the profile with the hash corresponding to one
/// of the basic blocks in the binary, the count is "matched" to the block.
/// Similarly, if both the source and the target of a count in the profile are
/// matched to a jump in the binary, the count is recorded in CFG.
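/// Note that a loose (low-confidence) match still contributes its counts; the
/// confidence check below only affects the reported matching statistics.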
void matchWeightsByHashes(BinaryContext &BC,
                          const BinaryFunction::BasicBlockOrderType &BlockOrder,
                          const yaml::bolt::BinaryFunctionProfile &YamlBF,
                          FlowFunction &Func) {
  assert(Func.Blocks.size() == BlockOrder.size() + 1);

  std::vector<FlowBlock *> Blocks;
  std::vector<BlendedBlockHash> BlendedHashes;
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I];
    assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");
    Blocks.push_back(&Func.Blocks[I + 1]);
    BlendedBlockHash BlendedHash(BB->getHash());
    BlendedHashes.push_back(BlendedHash);
    LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                      << Twine::utohexstr(BB->getHash()) << "\n");
  }
  StaleMatcher Matcher;
  Matcher.init(Blocks, BlendedHashes);

  // Index in yaml profile => corresponding (matched) block
  DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;
  // Match blocks from the profile to the blocks in CFG
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);
    const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash);
    // Always match the entry block.
    if (MatchedBlock == nullptr && YamlBB.Index == 0)
      MatchedBlock = Blocks[0];
    if (MatchedBlock != nullptr) {
      MatchedBlocks[YamlBB.Index] = MatchedBlock;
      BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1];
      LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBB.Index << ")"
                        << " with hash " << Twine::utohexstr(YamlBB.Hash)
                        << " to BB (index = " << MatchedBlock->Index - 1 << ")"
                        << " with hash " << Twine::utohexstr(BinHash.combine())
                        << "\n");
      // Update matching stats accounting for the matched block.
      if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) {
        ++BC.Stats.NumMatchedBlocks;
        BC.Stats.MatchedSampleCount += YamlBB.ExecCount;
        LLVM_DEBUG(dbgs() << "  exact match\n");
      } else {
        LLVM_DEBUG(dbgs() << "  loose match\n");
      }
    } else {
      LLVM_DEBUG(
          dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index << ")"
                 << " with hash " << Twine::utohexstr(YamlBB.Hash) << "\n");
    }

    // Update matching stats.
    ++BC.Stats.NumStaleBlocks;
    BC.Stats.StaleSampleCount += YamlBB.ExecCount;
  }

  // Match jumps from the profile to the jumps from CFG
  std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
  std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
      if (YamlSI.Count == 0)
        continue;

      // Try to find the jump for a given (src, dst) pair from the profile and
      // assign the jump weight based on the profile count
      const uint64_t SrcIndex = YamlBB.Index;
      const uint64_t DstIndex = YamlSI.Index;

      const FlowBlock *MatchedSrcBlock = MatchedBlocks.lookup(SrcIndex);
      const FlowBlock *MatchedDstBlock = MatchedBlocks.lookup(DstIndex);

      if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
        // Find a jump between the two blocks
        FlowJump *Jump = nullptr;
        for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
          if (SuccJump->Target == MatchedDstBlock->Index) {
            Jump = SuccJump;
            break;
          }
        }
        // Assign the weight, if the corresponding jump is found
        if (Jump != nullptr) {
          Jump->Weight = YamlSI.Count;
          Jump->HasUnknownWeight = false;
        }
      }
      // Assign the weight for the src block, if it is found
      if (MatchedSrcBlock != nullptr)
        OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
      // Assign the weight for the dst block, if it is found
      if (MatchedDstBlock != nullptr)
        InWeight[MatchedDstBlock->Index] += YamlSI.Count;
    }
  }

  // Assign block counts based on in-/out- jumps
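  // Use the larger of the matched incoming and outgoing totals as the initial
  // estimate of a block's execution count; the inference step may adjust it.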
  for (FlowBlock &Block : Func.Blocks) {
    if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
      assert(Block.HasUnknownWeight && "unmatched block with a positive count");
      continue;
    }
    Block.HasUnknownWeight = false;
    Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
  }
}

/// The function finds all blocks that are (i) reachable from the Entry block
/// and (ii) do not have a path to an exit, and marks all such blocks 'cold'
/// so that profi does not send any flow to such blocks.
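/// A typical example is a block ending in an infinite loop or a call to a
/// non-returning function: it may be reachable from the entry but has no path
/// to an exit.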
void preprocessUnreachableBlocks(FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();

  // Start bfs from the source
  std::queue<uint64_t> Queue;
  std::vector<bool> VisitedEntry(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      Queue.push(I);
      VisitedEntry[I] = true;
      break;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
      const uint64_t Dst = Jump->Target;
      if (!VisitedEntry[Dst]) {
        Queue.push(Dst);
        VisitedEntry[Dst] = true;
      }
    }
  }

  // Start bfs from all sinks
  std::vector<bool> VisitedExit(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isExit() && VisitedEntry[I]) {
      Queue.push(I);
      VisitedExit[I] = true;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
      const uint64_t Dst = Jump->Source;
      if (!VisitedExit[Dst]) {
        Queue.push(Dst);
        VisitedExit[Dst] = true;
      }
    }
  }

  // Make all blocks of zero weight so that flow is not sent
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.Weight == 0)
      continue;
    if (!VisitedEntry[I] || !VisitedExit[I]) {
      Block.Weight = 0;
      Block.HasUnknownWeight = true;
      Block.IsUnlikely = true;
      for (FlowJump *Jump : Block.SuccJumps) {
        if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
          Jump->Weight = 0;
          Jump->HasUnknownWeight = true;
          Jump->IsUnlikely = true;
        }
      }
    }
  }
}

/// Decide if stale profile matching can be applied for a given function.
/// Currently we skip inference for (very) large instances and for instances
/// having "unexpected" control flow (e.g., having no sink basic blocks).
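/// Without at least one exit block the flow network has no sink for the
/// counts to drain into, so the inference problem has no meaningful solution.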
bool canApplyInference(const FlowFunction &Func) {
  if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
    return false;

  bool HasExitBlocks = llvm::any_of(
      Func.Blocks, [&](const FlowBlock &Block) { return Block.isExit(); });
  if (!HasExitBlocks)
    return false;

  return true;
}

/// Apply the profile inference algorithm for a given flow function.
void applyInference(FlowFunction &Func) {
  ProfiParams Params;
  // Set the params from the command-line flags.
  Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
  Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
  Params.JoinIslands = opts::StaleMatchingJoinIslands;

  Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;

  Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
  Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;

  applyFlowInference(Params, Func);
}

/// Collect inferred counts from the flow function and update annotations in
/// the binary function.
void assignProfile(BinaryFunction &BF,
                   const BinaryFunction::BasicBlockOrderType &BlockOrder,
                   FlowFunction &Func) {
  BinaryContext &BC = BF.getBinaryContext();

  assert(Func.Blocks.size() == BlockOrder.size() + 1);
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    FlowBlock &Block = Func.Blocks[I + 1];
    BinaryBasicBlock *BB = BlockOrder[I];

    // Update block's count
    BB->setExecutionCount(Block.Flow);

    // Update jump counts: (i) clean existing counts and then (ii) set new ones
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *DstBB : BB->successors()) {
      (void)DstBB;
      BI->Count = 0;
      BI->MispredictedCount = 0;
      ++BI;
    }
    for (FlowJump *Jump : Block.SuccJumps) {
      if (Jump->IsUnlikely)
        continue;
      if (Jump->Flow == 0)
        continue;

      BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
      // Check if the edge corresponds to a regular jump or a landing pad
      if (BB->getSuccessor(SuccBB.getLabel())) {
        BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
        BI.Count += Jump->Flow;
      } else {
        BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
        if (LP && LP->getKnownExecutionCount() < Jump->Flow)
          LP->setExecutionCount(Jump->Flow);
      }
    }

    // Update call-site annotations
    auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
                                     uint64_t Count) {
      if (BC.MIB->hasAnnotation(Instr, Name))
        BC.MIB->removeAnnotation(Instr, Name);
      // Do not add zero-count annotations
      if (Count == 0)
        return;
      BC.MIB->addAnnotation(Instr, Name, Count);
    };

    for (MCInst &Instr : *BB) {
      // Ignore pseudo instructions
      if (BC.MIB->isPseudo(Instr))
        continue;
      // Ignore jump tables
      const MCInst *LastInstr = BB->getLastNonPseudoInstr();
      if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
        continue;

      if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
        auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
            Instr, "CallProfile");
        if (!ICSP.empty()) {
          // Try to evenly distribute the counts among the call sites
          const uint64_t TotalCount = Block.Flow;
          const uint64_t NumSites = ICSP.size();
          for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
            IndirectCallProfile &CSP = ICSP[Idx];
            uint64_t CountPerSite = TotalCount / NumSites;
            // When counts cannot be exactly distributed, increase by 1 the
            // counts of the first (TotalCount % NumSites) call sites
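            // (e.g., TotalCount = 10 over NumSites = 3 yields 4, 3, 3)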
            if (Idx < TotalCount % NumSites)
              CountPerSite++;
            CSP.Count = CountPerSite;
          }
        } else {
          ICSP.emplace_back(nullptr, Block.Flow, 0);
        }
      } else if (BC.MIB->getConditionalTailCall(Instr)) {
        // We don't know exactly the number of times the conditional tail call
        // is executed; conservatively, setting it to the count of the block
        setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
        BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
      } else if (BC.MIB->isCall(Instr)) {
        setOrUpdateAnnotation(Instr, "Count", Block.Flow);
      }
    }
  }

  // Update function's execution count and mark the function inferred.
  BF.setExecutionCount(Func.Blocks[0].Flow);
  BF.setHasInferredProfile(true);
}

bool YAMLProfileReader::inferStaleProfile(
    BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {
  LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
                    << "\"" << BF.getPrintName() << "\"\n");

  // Make sure that block hashes are up to date.
  BF.computeBlockHashes(YamlBP.Header.HashFunction);

  const BinaryFunction::BasicBlockOrderType BlockOrder(
      BF.getLayout().block_begin(), BF.getLayout().block_end());

  // Create a wrapper flow function to use with the profile inference algorithm.
  FlowFunction Func = createFlowFunction(BlockOrder);

  // Match as many block/jump counts from the stale profile as possible
  matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func);

  // Adjust the flow function by marking unreachable blocks Unlikely so that
  // they don't get any counts assigned.
  preprocessUnreachableBlocks(Func);

  // Check if profile inference can be applied for the instance.
  if (!canApplyInference(Func))
    return false;

  // Apply the profile inference algorithm.
  applyInference(Func);

  // Collect inferred counts and update function annotations.
  assignProfile(BF, BlockOrder, Func);

  // As of now, we always mark the binary function as having a "correct" profile.
  // In the future, we may discard the results for instances with poor inference
  // metrics and keep such functions un-optimized.
  return true;
}

} // end namespace bolt
} // end namespace llvm