//===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// BOLT often has to deal with profiles collected on binaries built from several
// revisions behind release. As a result, a certain percentage of functions is
// considered stale and not optimized. This file implements the ability to match
// a profile to functions that are not 100% binary identical, thus increasing
// the optimization coverage and boosting the performance of applications.
//
// The algorithm consists of two phases: matching and inference:
// - At the matching phase, we try to "guess" as many block and jump counts from
//   the stale profile as possible. To this end, the content of each basic block
//   is hashed and stored in the (yaml) profile. When BOLT optimizes a binary,
//   it computes block hashes and identifies the corresponding entries in the
//   stale profile. It yields a partial profile for every CFG in the binary.
// - At the inference phase, we employ a network flow-based algorithm (profi) to
//   reconstruct "realistic" block and jump counts from the partial profile
//   generated at the first stage. In practice, we don't always produce proper
//   profile data but the majority (e.g., >90%) of CFGs get the correct counts.
//
//===----------------------------------------------------------------------===//
#include "bolt/Core/HashUtilities.h"
#include "bolt/Profile/YAMLProfileReader.h"
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/SampleProfileInference.h"

#include <queue>
#include <unordered_map>
#include <unordered_set>

using namespace llvm;

#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt-prof"
namespace opts {

extern cl::OptionCategory BoltOptCategory;

cl::opt<bool>
    InferStaleProfile("infer-stale-profile",
                      cl::desc("Infer counts from stale profile data."),
                      cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));
cl::opt<unsigned> StaleMatchingMaxFuncSize(
    "stale-matching-max-func-size",
    cl::desc("The maximum size of a function to consider for inference."),
    cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));
// Parameters of the profile inference algorithm. The default values are tuned
// on several benchmarks.
cl::opt<bool> StaleMatchingEvenFlowDistribution(
    "stale-matching-even-flow-distribution",
    cl::desc("Try to evenly distribute flow when there are multiple equally "
             "likely options."),
    cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));
cl::opt<bool> StaleMatchingRebalanceUnknown(
    "stale-matching-rebalance-unknown",
    cl::desc("Evenly re-distribute flow among unknown subgraphs."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));
cl::opt<bool> StaleMatchingJoinIslands(
    "stale-matching-join-islands",
    cl::desc("Join isolated components having positive flow."), cl::init(true),
    cl::ReallyHidden, cl::cat(BoltOptCategory));
cl::opt<unsigned> StaleMatchingCostBlockInc(
    "stale-matching-cost-block-inc",
    cl::desc("The cost of increasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));
cl::opt<unsigned> StaleMatchingCostBlockDec(
    "stale-matching-cost-block-dec",
    cl::desc("The cost of decreasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));
cl::opt<unsigned> StaleMatchingCostJumpInc(
    "stale-matching-cost-jump-inc",
    cl::desc("The cost of increasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));
cl::opt<unsigned> StaleMatchingCostJumpDec(
    "stale-matching-cost-jump-dec",
    cl::desc("The cost of decreasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));
cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
    "stale-matching-cost-block-unknown-inc",
    cl::desc("The cost of increasing an unknown block count by one."),
    cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory));
cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
    "stale-matching-cost-jump-unknown-inc",
    cl::desc("The cost of increasing an unknown jump count by one."),
    cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory));
cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
    "stale-matching-cost-jump-unknown-ft-inc",
    cl::desc(
        "The cost of increasing an unknown fall-through jump count by one."),
    cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));

} // namespace opts
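
// Example (illustrative, not part of the pass): stale matching is off by
// default and is enabled explicitly on the command line, e.g.
//
//   llvm-bolt app -o app.bolt -data=profile.yaml -infer-stale-profile
//
// The remaining options above are tuning knobs for the inference algorithm and
// normally keep their default values.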

namespace llvm {
namespace bolt {

/// An object wrapping several components of a basic block hash. The combined
/// (blended) hash is represented and stored as one uint64_t, while individual
/// components are of smaller size (e.g., uint16_t or uint8_t).
struct BlendedBlockHash {
private:
  using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
  using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
  using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
  using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
  using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;

public:
  explicit BlendedBlockHash() {}

  explicit BlendedBlockHash(uint64_t Hash) {
    Offset = Bitfield::get<ValueOffset>(Hash);
    OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
    InstrHash = Bitfield::get<ValueInstr>(Hash);
    PredHash = Bitfield::get<ValuePred>(Hash);
    SuccHash = Bitfield::get<ValueSucc>(Hash);
  }

  /// Combine the blended hash into uint64_t.
  uint64_t combine() const {
    uint64_t Hash = 0;
    Bitfield::set<ValueOffset>(Hash, Offset);
    Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
    Bitfield::set<ValueInstr>(Hash, InstrHash);
    Bitfield::set<ValuePred>(Hash, PredHash);
    Bitfield::set<ValueSucc>(Hash, SuccHash);
    return Hash;
  }

  /// Compute a distance between two given blended hashes. The smaller the
  /// distance, the more similar two blocks are. For identical basic blocks,
  /// the distance is zero.
  uint64_t distance(const BlendedBlockHash &BBH) const {
    assert(OpcodeHash == BBH.OpcodeHash &&
           "incorrect blended hash distance computation");
    uint64_t Dist = 0;
    // Account for NeighborHash
    Dist += SuccHash == BBH.SuccHash ? 0 : 1;
    Dist += PredHash == BBH.PredHash ? 0 : 1;
    Dist <<= 16;
    // Account for InstrHash
    Dist += InstrHash == BBH.InstrHash ? 0 : 1;
    Dist <<= 16;
    // Account for Offset
    Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
    return Dist;
  }

  /// The offset of the basic block from the function start.
  uint16_t Offset{0};
  /// (Loose) Hash of the basic block instructions, excluding operands.
  uint16_t OpcodeHash{0};
  /// (Strong) Hash of the basic block instructions, including opcodes and
  /// operands.
  uint16_t InstrHash{0};
  /// (Loose) Hashes of the predecessors of the basic block.
  uint8_t PredHash{0};
  /// (Loose) Hashes of the successors of the basic block.
  uint8_t SuccHash{0};
};
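
// Layout of the blended hash (illustration only; the authoritative definition
// is the set of Bitfield::Element aliases above):
//
//   bits  0..15   Offset
//   bits 16..31   OpcodeHash
//   bits 32..47   InstrHash
//   bits 48..55   PredHash
//   bits 56..63   SuccHash
//
// Since the five components cover all 64 bits, packing and unpacking
// round-trip exactly:
//
//   BlendedBlockHash H(0x0102030400050006ULL);
//   assert(H.combine() == 0x0102030400050006ULL);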

/// The object is used to identify and match basic blocks in a BinaryFunction
/// given their hashes computed on a binary built from several revisions behind
/// release.
class StaleMatcher {
public:
  /// Initialize stale matcher.
  void init(const std::vector<FlowBlock *> &Blocks,
            const std::vector<BlendedBlockHash> &Hashes) {
    assert(Blocks.size() == Hashes.size() &&
           "incorrect matcher initialization");
    for (size_t I = 0; I < Blocks.size(); I++) {
      FlowBlock *Block = Blocks[I];
      uint16_t OpHash = Hashes[I].OpcodeHash;
      OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
    }
  }

  /// Find the most similar block for a given hash.
  const FlowBlock *matchBlock(BlendedBlockHash BlendedHash) const {
    auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
    if (BlockIt == OpHashToBlocks.end())
      return nullptr;
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.distance(BlendedHash);
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
      }
    }
    return BestBlock;
  }

  /// Returns true if the two basic blocks (in the binary and in the profile)
  /// corresponding to the given hashes are matched to each other with a high
  /// confidence.
  static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
                                    BlendedBlockHash Hash2) {
    return Hash1.InstrHash == Hash2.InstrHash;
  }

private:
  using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
  std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
};
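
// Usage sketch (illustrative): given flow blocks and their blended hashes
// collected from the binary,
//
//   StaleMatcher Matcher;
//   Matcher.init(Blocks, Hashes);  // index the blocks by OpcodeHash
//   if (const FlowBlock *FB =
//           Matcher.matchBlock(BlendedBlockHash(YamlBB.Hash)))
//     ...;  // FB is the closest binary block sharing the same opcode hash
//
// matchWeightsByHashes() below is the real client of this interface.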

void BinaryFunction::computeBlockHashes() const {
  if (size() == 0)
    return;

  assert(hasCFG() && "the function is expected to have CFG");

  std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
  std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
  // Initialize hash components.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    assert(BB->getIndex() == I && "incorrect block index");
    BlendedHashes[I].Offset = BB->getOffset();
    // Hashing complete instructions.
    std::string InstrHashStr = hashBlock(
        BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
    uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
    BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
    // Hashing opcodes only.
    std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
    OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
    BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
  }

  // Initialize neighbor hash.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    // Append hashes of successors.
    uint64_t Hash = 0;
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
    }
    BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);

    // Append hashes of predecessors.
    Hash = 0;
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
    }
    BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
  }

  // Assign the blended hashes to the basic blocks.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    BB->setHash(BlendedHashes[I].combine());
  }
}

/// Create a wrapper flow function to use with the profile inference algorithm,
/// and initialize its jumps and metadata.
FlowFunction
createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
  FlowFunction Func;

  // Add a special "dummy" source so that there is always a unique entry point.
  // Because of the extra source, for all other blocks in FlowFunction it holds
  // that Block.Index == BB->getIndex() + 1
  FlowBlock EntryBlock;
  EntryBlock.Index = 0;
  Func.Blocks.push_back(EntryBlock);

  // Create a FlowBlock for every basic block in the binary function
  for (const BinaryBasicBlock *BB : BlockOrder) {
    Func.Blocks.emplace_back();
    FlowBlock &Block = Func.Blocks.back();
    Block.Index = Func.Blocks.size() - 1;
    (void)BB;
    assert(Block.Index == BB->getIndex() + 1 &&
           "incorrectly assigned basic block index");
  }

  // Create a FlowJump for each jump between basic blocks in the binary function
  std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
  for (const BinaryBasicBlock *SrcBB : BlockOrder) {
    std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
    // Collect regular jumps
    for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
    // Collect jumps to landing pads
    for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
  }

  // Add dummy edges to the extra sources. If there are multiple entry blocks,
  // add an unlikely edge from 0 to the subsequent ones
  assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
  for (uint64_t I = 1; I < Func.Blocks.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I - 1];
    if (BB->isEntryPoint() || InDegree[I] == 0) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = 0;
      Jump.Target = I;
      if (!BB->isEntryPoint())
        Jump.IsUnlikely = true;
    }
  }

  // Create necessary metadata for the flow function
  for (FlowJump &Jump : Func.Jumps) {
    Func.Blocks.at(Jump.Source).SuccJumps.push_back(&Jump);
    Func.Blocks.at(Jump.Target).PredJumps.push_back(&Jump);
  }
  return Func;
}
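
// Illustration of the indexing convention above (not part of the code): for a
// binary function with blocks B0, B1, B2, the flow function contains four
// blocks and, at minimum, a dummy jump from the source:
//
//   Func.Blocks[0]      - dummy source
//   Func.Blocks[i + 1]  - wrapper for Bi (Block.Index == BB->getIndex() + 1)
//   jump 0 -> 1         - dummy edge to the entry block
//
// Secondary entry points get additional edges from the dummy source, and
// blocks with no predecessors get such edges marked as unlikely.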

/// Assign initial block/jump weights based on the stale profile data. The goal
/// is to extract as much information from the stale profile as possible. Here
/// we assume that each basic block is specified via a hash value computed from
/// its content and the hashes of the unchanged basic blocks stay the same
/// across different revisions of the binary.
/// Whenever there is a count in the profile with the hash corresponding to one
/// of the basic blocks in the binary, the count is "matched" to the block.
/// Similarly, if both the source and the target of a count in the profile are
/// matched to a jump in the binary, the count is recorded in CFG.
void matchWeightsByHashes(BinaryContext &BC,
                          const BinaryFunction::BasicBlockOrderType &BlockOrder,
                          const yaml::bolt::BinaryFunctionProfile &YamlBF,
                          FlowFunction &Func) {
  assert(Func.Blocks.size() == BlockOrder.size() + 1);

  std::vector<FlowBlock *> Blocks;
  std::vector<BlendedBlockHash> BlendedHashes;
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I];
    assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");
    Blocks.push_back(&Func.Blocks[I + 1]);
    BlendedBlockHash BlendedHash(BB->getHash());
    BlendedHashes.push_back(BlendedHash);
    LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                      << Twine::utohexstr(BB->getHash()) << "\n");
  }
  StaleMatcher Matcher;
  Matcher.init(Blocks, BlendedHashes);

  // Index in yaml profile => corresponding (matched) block
  DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;
  // Match blocks from the profile to the blocks in CFG
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);
    const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash);
    // Always match the entry block.
    if (MatchedBlock == nullptr && YamlBB.Index == 0)
      MatchedBlock = Blocks[0];
    if (MatchedBlock != nullptr) {
      MatchedBlocks[YamlBB.Index] = MatchedBlock;
      BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1];
      LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBB.Index << ")"
                        << " with hash " << Twine::utohexstr(YamlBB.Hash)
                        << " to BB (index = " << MatchedBlock->Index - 1 << ")"
                        << " with hash " << Twine::utohexstr(BinHash.combine())
                        << "\n");
      // Update matching stats accounting for the matched block.
      if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) {
        ++BC.Stats.NumMatchedBlocks;
        BC.Stats.MatchedSampleCount += YamlBB.ExecCount;
        LLVM_DEBUG(dbgs() << "  exact match\n");
      }
    } else {
      LLVM_DEBUG(
          dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index << ")"
                 << " with hash " << Twine::utohexstr(YamlBB.Hash) << "\n");
    }

    // Update matching stats.
    ++BC.Stats.NumStaleBlocks;
    BC.Stats.StaleSampleCount += YamlBB.ExecCount;
  }

  // Match jumps from the profile to the jumps from CFG
  std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
  std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
      if (YamlSI.Count == 0)
        continue;

      // Try to find the jump for a given (src, dst) pair from the profile and
      // assign the jump weight based on the profile count
      const uint64_t SrcIndex = YamlBB.Index;
      const uint64_t DstIndex = YamlSI.Index;

      const FlowBlock *MatchedSrcBlock = MatchedBlocks.lookup(SrcIndex);
      const FlowBlock *MatchedDstBlock = MatchedBlocks.lookup(DstIndex);

      if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
        // Find a jump between the two blocks
        FlowJump *Jump = nullptr;
        for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
          if (SuccJump->Target == MatchedDstBlock->Index) {
            Jump = SuccJump;
            break;
          }
        }
        // Assign the weight, if the corresponding jump is found
        if (Jump != nullptr) {
          Jump->Weight = YamlSI.Count;
          Jump->HasUnknownWeight = false;
        }
      }
      // Assign the weight for the src block, if it is found
      if (MatchedSrcBlock != nullptr)
        OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
      // Assign the weight for the dst block, if it is found
      if (MatchedDstBlock != nullptr)
        InWeight[MatchedDstBlock->Index] += YamlSI.Count;
    }
  }

  // Assign block counts based on in-/out- jumps
  for (FlowBlock &Block : Func.Blocks) {
    if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
      assert(Block.HasUnknownWeight && "unmatched block with a positive count");
      continue;
    }
    Block.HasUnknownWeight = false;
    Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
  }
}
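
// Example (illustrative): suppose a profiled block is matched to a flow block
// B, its two matched outgoing records carry counts 60 and 40, and its single
// matched incoming record carries 90. Then OutWeight[B] = 100 and
// InWeight[B] = 90, so the initial weight assigned above is
// max(100, 90) = 100; the inference stage later reconciles such partial
// weights into a consistent flow.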

/// The function finds all blocks that are (i) reachable from the Entry block
/// and (ii) do not have a path to an exit, and marks all such blocks 'cold'
/// so that profi does not send any flow to such blocks.
void preprocessUnreachableBlocks(FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();

  // Start bfs from the source
  std::queue<uint64_t> Queue;
  std::vector<bool> VisitedEntry(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      Queue.push(I);
      VisitedEntry[I] = true;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
      const uint64_t Dst = Jump->Target;
      if (!VisitedEntry[Dst]) {
        Queue.push(Dst);
        VisitedEntry[Dst] = true;
      }
    }
  }

  // Start bfs from all sinks
  std::vector<bool> VisitedExit(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isExit() && VisitedEntry[I]) {
      Queue.push(I);
      VisitedExit[I] = true;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
      const uint64_t Dst = Jump->Source;
      if (!VisitedExit[Dst]) {
        Queue.push(Dst);
        VisitedExit[Dst] = true;
      }
    }
  }

  // Make all unreachable blocks of zero weight so that flow is not sent to them
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.Weight == 0)
      continue;
    if (!VisitedEntry[I] || !VisitedExit[I]) {
      Block.Weight = 0;
      Block.HasUnknownWeight = true;
      Block.IsUnlikely = true;
      for (FlowJump *Jump : Block.SuccJumps) {
        if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
          Jump->Weight = 0;
          Jump->HasUnknownWeight = true;
          Jump->IsUnlikely = true;
        }
      }
    }
  }
}
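
// Example (illustrative): a block that is reachable from the entry but has no
// path to an exit (e.g., the head of an infinite loop introduced by a code
// change) ends up with VisitedEntry = true and VisitedExit = false, so any
// weight matched to it is dropped and the block is marked unlikely; profi then
// routes no flow through it.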

/// Decide if stale profile matching can be applied for a given function.
/// Currently we skip inference for (very) large instances and for instances
/// having "unexpected" control flow (e.g., having no sink basic blocks).
bool canApplyInference(const FlowFunction &Func) {
  if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
    return false;

  bool HasExitBlocks = llvm::any_of(
      Func.Blocks, [&](const FlowBlock &Block) { return Block.isExit(); });
  if (!HasExitBlocks)
    return false;

  return true;
}

/// Apply the profile inference algorithm for a given flow function.
void applyInference(FlowFunction &Func) {
  ProfiParams Params;
  // Set the params from the command-line flags.
  Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
  Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
  Params.JoinIslands = opts::StaleMatchingJoinIslands;

  Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;

  Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
  Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;

  applyFlowInference(Params, Func);
}

/// Collect inferred counts from the flow function and update annotations in
/// the binary function.
void assignProfile(BinaryFunction &BF,
                   const BinaryFunction::BasicBlockOrderType &BlockOrder,
                   FlowFunction &Func) {
  BinaryContext &BC = BF.getBinaryContext();

  assert(Func.Blocks.size() == BlockOrder.size() + 1);
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    FlowBlock &Block = Func.Blocks[I + 1];
    BinaryBasicBlock *BB = BlockOrder[I];

    // Update block's count
    BB->setExecutionCount(Block.Flow);

    // Update jump counts: (i) clean existing counts and then (ii) set new ones
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *DstBB : BB->successors()) {
      (void)DstBB;
      BI->Count = 0;
      BI->MispredictedCount = 0;
      ++BI;
    }
    for (FlowJump *Jump : Block.SuccJumps) {
      if (Jump->IsUnlikely)
        continue;
      if (Jump->Flow == 0)
        continue;

      BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
      // Check if the edge corresponds to a regular jump or a landing pad
      if (BB->getSuccessor(SuccBB.getLabel())) {
        BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
        BI.Count += Jump->Flow;
      } else {
        BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
        if (LP && LP->getKnownExecutionCount() < Jump->Flow)
          LP->setExecutionCount(Jump->Flow);
      }
    }

    // Update call-site annotations
    auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
                                     uint64_t Count) {
      if (BC.MIB->hasAnnotation(Instr, Name))
        BC.MIB->removeAnnotation(Instr, Name);
      // Do not add zero-count annotations
      if (Count == 0)
        return;
      BC.MIB->addAnnotation(Instr, Name, Count);
    };

    for (MCInst &Instr : *BB) {
      // Ignore pseudo instructions
      if (BC.MIB->isPseudo(Instr))
        continue;
      // Ignore jump tables
      const MCInst *LastInstr = BB->getLastNonPseudoInstr();
      if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
        continue;

      if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
        auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
            Instr, "CallProfile");
        if (!ICSP.empty()) {
          // Try to evenly distribute the counts among the call sites
          const uint64_t TotalCount = Block.Flow;
          const uint64_t NumSites = ICSP.size();
          for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
            IndirectCallProfile &CSP = ICSP[Idx];
            uint64_t CountPerSite = TotalCount / NumSites;
            // When counts cannot be exactly distributed, increase by 1 the
            // counts of the first (TotalCount % NumSites) call sites
            if (Idx < TotalCount % NumSites)
              CountPerSite++;
            CSP.Count = CountPerSite;
          }
        } else {
          ICSP.emplace_back(nullptr, Block.Flow, 0);
        }
      } else if (BC.MIB->getConditionalTailCall(Instr)) {
        // We don't know exactly how many times the conditional tail call is
        // executed; conservatively, set it to the count of the block
        setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
        BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
      } else if (BC.MIB->isCall(Instr)) {
        setOrUpdateAnnotation(Instr, "Count", Block.Flow);
      }
    }
  }

  // Update the function's execution count and mark the function inferred.
  BF.setExecutionCount(Func.Blocks[0].Flow);
  BF.setHasInferredProfile(true);
}

bool YAMLProfileReader::inferStaleProfile(
    BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {
  LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
                    << "\"" << BF.getPrintName() << "\"\n");

  // Make sure that block hashes are up to date.
  BF.computeBlockHashes();

  const BinaryFunction::BasicBlockOrderType BlockOrder(
      BF.getLayout().block_begin(), BF.getLayout().block_end());

  // Create a wrapper flow function to use with the profile inference algorithm.
  FlowFunction Func = createFlowFunction(BlockOrder);

  // Match as many block/jump counts from the stale profile as possible
  matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func);

  // Adjust the flow function by marking unreachable blocks Unlikely so that
  // they don't get any counts assigned.
  preprocessUnreachableBlocks(Func);

  // Check if profile inference can be applied for the instance.
  if (!canApplyInference(Func))
    return false;

  // Apply the profile inference algorithm.
  applyInference(Func);

  // Collect inferred counts and update function annotations.
  assignProfile(BF, BlockOrder, Func);

  // As of now, we always mark the binary function as having a "correct"
  // profile. In the future, we may discard the results for instances with poor
  // inference metrics and keep such functions un-optimized.
  return true;
}

} // end namespace bolt
} // end namespace llvm