//===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// BOLT often has to deal with profiles collected on binaries built from several
// revisions behind release. As a result, a certain percentage of functions is
// considered stale and not optimized. This file implements the ability to match
// a profile to functions that are not 100% binary identical, thus increasing
// the optimization coverage and boosting the performance of applications.
//
// The algorithm consists of two phases: matching and inference:
// - At the matching phase, we try to "guess" as many block and jump counts from
//   the stale profile as possible. To this end, the content of each basic block
//   is hashed and stored in the (yaml) profile. When BOLT optimizes a binary,
//   it computes block hashes and identifies the corresponding entries in the
//   stale profile. This yields a partial profile for every CFG in the binary.
// - At the inference phase, we employ a network flow-based algorithm (profi) to
//   reconstruct "realistic" block and jump counts from the partial profile
//   generated at the first stage. In practice, we don't always produce proper
//   profile data but the majority (e.g., >90%) of CFGs get the correct counts.
//
//===----------------------------------------------------------------------===//
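
// A rough sketch of how the helpers defined below fit together; it mirrors
// the driver in YAMLProfileReader::inferStaleProfile at the bottom of this
// file (all names referenced here are the ones defined in this file):
//
//   FlowFunction Func = createFlowFunction(BlockOrder);
//   size_t Matched = matchWeightsByHashes(BC, BlockOrder, YamlBF, Func,
//                                         HashFunction, IdToYamlBF);
//   preprocessUnreachableBlocks(Func);
//   if (canApplyInference(Func, YamlBF, Matched)) {
//     applyInference(Func);
//     assignProfile(BF, BlockOrder, Func);
//   }
//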
#include "bolt/Core/HashUtilities.h"
#include "bolt/Profile/YAMLProfileReader.h"
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/xxhash.h"
#include "llvm/Transforms/Utils/SampleProfileInference.h"

#include <queue>

using namespace llvm;

#define DEBUG_TYPE "bolt-prof"

namespace opts {

extern cl::opt<bool> TimeRewrite;
extern cl::OptionCategory BoltOptCategory;

cl::opt<bool>
    InferStaleProfile("infer-stale-profile",
                      cl::desc("Infer counts from stale profile data."),
                      cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingMinMatchedBlock(
    "stale-matching-min-matched-block",
    cl::desc("Percentage threshold of matched basic blocks at which stale "
             "profile inference is executed."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingMaxFuncSize(
    "stale-matching-max-func-size",
    cl::desc("The maximum size of a function to consider for inference."),
    cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));

// Parameters of the profile inference algorithm. The default values are tuned
// on several benchmarks.
cl::opt<bool> StaleMatchingEvenFlowDistribution(
    "stale-matching-even-flow-distribution",
    cl::desc("Try to evenly distribute flow when there are multiple equally "
             "likely options."),
    cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingRebalanceUnknown(
    "stale-matching-rebalance-unknown",
    cl::desc("Evenly re-distribute flow among unknown subgraphs."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingJoinIslands(
    "stale-matching-join-islands",
    cl::desc("Join isolated components having positive flow."), cl::init(true),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockInc(
    "stale-matching-cost-block-inc",
    cl::desc("The cost of increasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockDec(
    "stale-matching-cost-block-dec",
    cl::desc("The cost of decreasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpInc(
    "stale-matching-cost-jump-inc",
    cl::desc("The cost of increasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpDec(
    "stale-matching-cost-jump-dec",
    cl::desc("The cost of decreasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
    "stale-matching-cost-block-unknown-inc",
    cl::desc("The cost of increasing an unknown block count by one."),
    cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
    "stale-matching-cost-jump-unknown-inc",
    cl::desc("The cost of increasing an unknown jump count by one."),
    cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
    "stale-matching-cost-jump-unknown-ft-inc",
    cl::desc(
        "The cost of increasing an unknown fall-through jump count by one."),
    cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));

} // namespace opts

namespace llvm {
namespace bolt {

/// An object wrapping several components of a basic block hash. The combined
/// (blended) hash is represented and stored as one uint64_t, while individual
/// components are of smaller size (e.g., uint16_t or uint8_t).
struct BlendedBlockHash {
  using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
  using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
  using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
  using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
  using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;

  explicit BlendedBlockHash() {}

  explicit BlendedBlockHash(uint64_t Hash) {
    Offset = Bitfield::get<ValueOffset>(Hash);
    OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
    InstrHash = Bitfield::get<ValueInstr>(Hash);
    PredHash = Bitfield::get<ValuePred>(Hash);
    SuccHash = Bitfield::get<ValueSucc>(Hash);
  }

  /// Combine the blended hash into uint64_t.
  uint64_t combine() const {
    uint64_t Hash = 0;
    Bitfield::set<ValueOffset>(Hash, Offset);
    Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
    Bitfield::set<ValueInstr>(Hash, InstrHash);
    Bitfield::set<ValuePred>(Hash, PredHash);
    Bitfield::set<ValueSucc>(Hash, SuccHash);
    return Hash;
  }

  /// Compute a distance between two given blended hashes. The smaller the
  /// distance, the more similar two blocks are. For identical basic blocks,
  /// the distance is zero.
  uint64_t distance(const BlendedBlockHash &BBH) const {
    assert(OpcodeHash == BBH.OpcodeHash &&
           "incorrect blended hash distance computation");
    uint64_t Dist = 0;
    // Account for NeighborHash
    Dist += SuccHash == BBH.SuccHash ? 0 : 1;
    Dist += PredHash == BBH.PredHash ? 0 : 1;
    Dist <<= 1;
    // Account for InstrHash
    Dist += InstrHash == BBH.InstrHash ? 0 : 1;
    Dist <<= 1;
    // Account for Offset
    Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
    return Dist;
  }

  /// The offset of the basic block from the function start.
  uint16_t Offset{0};
  /// (Loose) Hash of the basic block instructions, excluding operands.
  uint16_t OpcodeHash{0};
  /// (Strong) Hash of the basic block instructions, including opcodes and
  /// operands.
  uint16_t InstrHash{0};
  /// (Loose) Hashes of the predecessors of the basic block.
  uint8_t PredHash{0};
  /// (Loose) Hashes of the successors of the basic block.
  uint8_t SuccHash{0};
};
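
// A small illustration of the layout (based on the Bitfield elements above):
// the packed hash stores Offset in bits [0, 16), OpcodeHash in [16, 32),
// InstrHash in [32, 48), PredHash in [48, 56) and SuccHash in [56, 64), so
// combine() and the uint64_t constructor round-trip each other:
//
//   BlendedBlockHash BBH;
//   BBH.Offset = 0x10;
//   BBH.OpcodeHash = 0xabcd;
//   BlendedBlockHash RoundTrip(BBH.combine());
//   // RoundTrip.Offset == 0x10 && RoundTrip.OpcodeHash == 0xabcd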

/// The object is used to identify and match basic blocks in a BinaryFunction
/// given their hashes computed on a binary built from several revisions behind
/// release.
class StaleMatcher {
public:
  /// Initialize stale matcher.
  void init(const std::vector<FlowBlock *> &Blocks,
            const std::vector<BlendedBlockHash> &Hashes,
            const std::vector<uint64_t> &CallHashes) {
    assert(Blocks.size() == Hashes.size() &&
           Hashes.size() == CallHashes.size() &&
           "incorrect matcher initialization");
    for (size_t I = 0; I < Blocks.size(); I++) {
      FlowBlock *Block = Blocks[I];
      uint16_t OpHash = Hashes[I].OpcodeHash;
      OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
      if (CallHashes[I])
        CallHashToBlocks[CallHashes[I]].push_back(
            std::make_pair(Hashes[I], Block));
    }
  }

  /// Find the most similar block for a given hash.
  const FlowBlock *matchBlock(BlendedBlockHash BlendedHash,
                              uint64_t CallHash) const {
    const FlowBlock *BestBlock = matchWithOpcodes(BlendedHash);
    return BestBlock ? BestBlock : matchWithCalls(BlendedHash, CallHash);
  }

  /// Returns true if the two basic blocks (in the binary and in the profile)
  /// corresponding to the given hashes are matched to each other with a high
  /// confidence.
  static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
                                    BlendedBlockHash Hash2) {
    return Hash1.InstrHash == Hash2.InstrHash;
  }

private:
  using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
  std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
  std::unordered_map<uint64_t, std::vector<HashBlockPairType>> CallHashToBlocks;

  // Uses OpcodeHash to find the most similar block for a given hash.
  const FlowBlock *matchWithOpcodes(BlendedBlockHash BlendedHash) const {
    auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
    if (BlockIt == OpHashToBlocks.end())
      return nullptr;
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.distance(BlendedHash);
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
      }
    }
    return BestBlock;
  }

  // Uses CallHash to find the most similar block for a given hash.
  const FlowBlock *matchWithCalls(BlendedBlockHash BlendedHash,
                                  uint64_t CallHash) const {
    if (!CallHash)
      return nullptr;
    auto BlockIt = CallHashToBlocks.find(CallHash);
    if (BlockIt == CallHashToBlocks.end())
      return nullptr;
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.OpcodeHash > BlendedHash.OpcodeHash
                          ? Hash.OpcodeHash - BlendedHash.OpcodeHash
                          : BlendedHash.OpcodeHash - Hash.OpcodeHash;
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
      }
    }
    return BestBlock;
  }
};
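
// A minimal usage sketch for StaleMatcher (matchWeightsByHashes below does
// exactly this, plus bookkeeping): initialize the matcher with the binary's
// blocks and their hashes, then query it with a hash from the stale profile:
//
//   StaleMatcher Matcher;
//   Matcher.init(Blocks, BlendedHashes, CallHashes);
//   const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash, CallHash);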

void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const {
  assert(hasCFG() && "the function is expected to have CFG");

  std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
  std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
  // Initialize hash components.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    assert(BB->getIndex() == I && "incorrect block index");
    BlendedHashes[I].Offset = BB->getOffset();
    // Hashing complete instructions.
    std::string InstrHashStr = hashBlock(
        BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
    if (HashFunction == HashFunction::StdHash) {
      uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
    } else if (HashFunction == HashFunction::XXH3) {
      uint64_t InstrHash = llvm::xxh3_64bits(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)InstrHash;
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
    // Hashing opcodes only (loose hash).
    std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
    if (HashFunction == HashFunction::StdHash) {
      OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
    } else if (HashFunction == HashFunction::XXH3) {
      OpcodeHashes[I] = llvm::xxh3_64bits(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)OpcodeHashes[I];
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
  }

  // Initialize neighbor hash.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    // Append hashes of successors.
    uint64_t Hash = 0;
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].SuccHash = (uint8_t)Hash;
    }

    // Append hashes of predecessors.
    Hash = 0;
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].PredHash = (uint8_t)Hash;
    }
  }

  // Assign the combined (blended) hashes to the basic blocks.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    BB->setHash(BlendedHashes[I].combine());
  }
}

// TODO: mediate the difference between flow function construction here in BOLT
// and in the compiler by splitting blocks with exception throwing calls at the
// call and adding the landing pad as the successor.
/// Create a wrapper flow function to use with the profile inference algorithm,
/// and initialize its jumps and metadata.
FlowFunction
createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
  FlowFunction Func;

  // Add a special "dummy" source so that there is always a unique entry point.
  FlowBlock EntryBlock;
  EntryBlock.Index = 0;
  Func.Blocks.push_back(EntryBlock);

  // Create FlowBlock for every basic block in the binary function.
  for (const BinaryBasicBlock *BB : BlockOrder) {
    Func.Blocks.emplace_back();
    FlowBlock &Block = Func.Blocks.back();
    Block.Index = Func.Blocks.size() - 1;
    (void)BB;
    assert(Block.Index == BB->getIndex() + 1 &&
           "incorrectly assigned basic block index");
  }

  // Add a special "dummy" sink block so there is always a unique sink.
  FlowBlock SinkBlock;
  SinkBlock.Index = Func.Blocks.size();
  Func.Blocks.push_back(SinkBlock);

  // Create FlowJump for each jump between basic blocks in the binary function.
  std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
  for (const BinaryBasicBlock *SrcBB : BlockOrder) {
    std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
    // Collect regular jumps
    for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
    // TODO: set jump from exit block to landing pad to Unlikely.
    // If the block is an exit, add a dummy edge from it to the sink block.
    if (UniqueSuccs.empty()) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = Func.Blocks.size() - 1;
      InDegree[Jump.Target]++;
    }

    // Collect jumps to landing pads
    for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
  }

  // Add dummy edges to the extra sources. If there are multiple entry blocks,
  // add an unlikely edge from 0 to the subsequent ones. Skips the sink block.
  assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
  for (uint64_t I = 1; I < Func.Blocks.size() - 1; I++) {
    const BinaryBasicBlock *BB = BlockOrder[I - 1];
    if (BB->isEntryPoint() || InDegree[I] == 0) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = 0;
      Jump.Target = I;
      if (!BB->isEntryPoint())
        Jump.IsUnlikely = true;
    }
  }

  // Create necessary metadata for the flow function
  for (FlowJump &Jump : Func.Jumps) {
    assert(Jump.Source < Func.Blocks.size());
    Func.Blocks[Jump.Source].SuccJumps.push_back(&Jump);
    assert(Jump.Target < Func.Blocks.size());
    Func.Blocks[Jump.Target].PredJumps.push_back(&Jump);
  }
  return Func;
}
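
// For illustration, a two-block function BB0 -> BB1 yields a flow function
// with four blocks: index 0 is the dummy source, indices 1 and 2 wrap BB0 and
// BB1 (the binary block index shifted by one), and index 3 is the dummy sink.
// The jumps are 0->1 (dummy entry edge), 1->2 (the original CFG edge), and
// 2->3 (the exit edge to the dummy sink).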

/// Assign initial block/jump weights based on the stale profile data. The goal
/// is to extract as much information from the stale profile as possible. Here
/// we assume that each basic block is specified via a hash value computed from
/// its content and the hashes of the unchanged basic blocks stay the same
/// across different revisions of the binary.
/// Whenever there is a count in the profile with the hash corresponding to one
/// of the basic blocks in the binary, the count is "matched" to the block.
/// Similarly, if both the source and the target of a count in the profile are
/// matched to a jump in the binary, the count is recorded in CFG.
size_t
matchWeightsByHashes(BinaryContext &BC,
                     const BinaryFunction::BasicBlockOrderType &BlockOrder,
                     const yaml::bolt::BinaryFunctionProfile &YamlBF,
                     FlowFunction &Func, HashFunction HashFunction,
                     YAMLProfileReader::ProfileLookupMap &IdToYamlBF) {
  assert(Func.Blocks.size() == BlockOrder.size() + 2);

  std::vector<uint64_t> CallHashes;
  std::vector<FlowBlock *> Blocks;
  std::vector<BlendedBlockHash> BlendedHashes;
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I];
    assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");

    std::string CallHashStr = hashBlockCalls(BC, *BB);
    if (CallHashStr.empty()) {
      CallHashes.push_back(0);
    } else {
      if (HashFunction == HashFunction::StdHash)
        CallHashes.push_back(std::hash<std::string>{}(CallHashStr));
      else if (HashFunction == HashFunction::XXH3)
        CallHashes.push_back(llvm::xxh3_64bits(CallHashStr));
      else
        llvm_unreachable("Unhandled HashFunction");
    }

    Blocks.push_back(&Func.Blocks[I + 1]);
    BlendedBlockHash BlendedHash(BB->getHash());
    BlendedHashes.push_back(BlendedHash);
    LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                      << Twine::utohexstr(BB->getHash()) << "\n");
  }
  StaleMatcher Matcher;
  Matcher.init(Blocks, BlendedHashes, CallHashes);

  // Index in yaml profile => corresponding (matched) block
  DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;
  // Match blocks from the profile to the blocks in CFG
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);

    const FlowBlock *MatchedBlock = nullptr;
    std::string CallHashStr = hashBlockCalls(IdToYamlBF, YamlBB);
    uint64_t CallHash = 0;
    if (!CallHashStr.empty()) {
      if (HashFunction == HashFunction::StdHash)
        CallHash = std::hash<std::string>{}(CallHashStr);
      else if (HashFunction == HashFunction::XXH3)
        CallHash = llvm::xxh3_64bits(CallHashStr);
      else
        llvm_unreachable("Unhandled HashFunction");
    }
    MatchedBlock = Matcher.matchBlock(YamlHash, CallHash);
    if (MatchedBlock == nullptr && YamlBB.Index == 0)
      MatchedBlock = Blocks[0];
    if (MatchedBlock != nullptr) {
      const BinaryBasicBlock *BB = BlockOrder[MatchedBlock->Index - 1];
      MatchedBlocks[YamlBB.Index] = MatchedBlock;
      BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1];
      LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBB.Index << ")"
                        << " with hash " << Twine::utohexstr(YamlBB.Hash)
                        << " to BB (index = " << MatchedBlock->Index - 1 << ")"
                        << " with hash " << Twine::utohexstr(BinHash.combine())
                        << "\n");
      // Update matching stats accounting for the matched block.
      if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) {
        ++BC.Stats.NumMatchedBlocks;
        BC.Stats.MatchedSampleCount += YamlBB.ExecCount;
        LLVM_DEBUG(dbgs() << "  exact match\n");
      } else {
        LLVM_DEBUG(dbgs() << "  loose match\n");
      }
      if (YamlBB.NumInstructions == BB->size())
        ++BC.Stats.NumStaleBlocksWithEqualIcount;
    } else {
      LLVM_DEBUG(
          dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index << ")"
                 << " with hash " << Twine::utohexstr(YamlBB.Hash) << "\n");
    }

    // Update matching stats.
    ++BC.Stats.NumStaleBlocks;
    BC.Stats.StaleSampleCount += YamlBB.ExecCount;
  }

  // Match jumps from the profile to the jumps from CFG
  std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
  std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
      if (YamlSI.Count == 0)
        continue;

      // Try to find the jump for a given (src, dst) pair from the profile and
      // assign the jump weight based on the profile count
      const uint64_t SrcIndex = YamlBB.Index;
      const uint64_t DstIndex = YamlSI.Index;

      const FlowBlock *MatchedSrcBlock = MatchedBlocks.lookup(SrcIndex);
      const FlowBlock *MatchedDstBlock = MatchedBlocks.lookup(DstIndex);

      if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
        // Find a jump between the two blocks
        FlowJump *Jump = nullptr;
        for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
          if (SuccJump->Target == MatchedDstBlock->Index) {
            Jump = SuccJump;
            break;
          }
        }
        // Assign the weight, if the corresponding jump is found
        if (Jump != nullptr) {
          Jump->Weight = YamlSI.Count;
          Jump->HasUnknownWeight = false;
        }
      }
      // Assign the weight for the src block, if it is found
      if (MatchedSrcBlock != nullptr)
        OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
      // Assign the weight for the dst block, if it is found
      if (MatchedDstBlock != nullptr)
        InWeight[MatchedDstBlock->Index] += YamlSI.Count;
    }
  }

  // Assign block counts based on in-/out- jumps
  for (FlowBlock &Block : Func.Blocks) {
    if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
      assert(Block.HasUnknownWeight && "unmatched block with a positive count");
      continue;
    }
    Block.HasUnknownWeight = false;
    Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
  }

  return MatchedBlocks.size();
}
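
// Note on the weight rule above: a matched block receives
// max(sum of matched outgoing counts, sum of matched incoming counts).
// For example, matched in-counts of 40 and 60 with a single matched out-count
// of 90 give an initial weight of max(100, 90) = 100; reconciling the
// remaining imbalance is left to the inference phase.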

/// The function finds all blocks that are (i) reachable from the Entry block
/// and (ii) do not have a path to an exit, and marks all such blocks 'cold'
/// so that profi does not send any flow to such blocks.
void preprocessUnreachableBlocks(FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();

  // Start bfs from the source
  std::queue<uint64_t> Queue;
  std::vector<bool> VisitedEntry(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      Queue.push(I);
      VisitedEntry[I] = true;
      break;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
      const uint64_t Dst = Jump->Target;
      if (!VisitedEntry[Dst]) {
        Queue.push(Dst);
        VisitedEntry[Dst] = true;
      }
    }
  }

  // Start bfs from all sinks
  std::vector<bool> VisitedExit(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isExit() && VisitedEntry[I]) {
      Queue.push(I);
      VisitedExit[I] = true;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
      const uint64_t Dst = Jump->Source;
      if (!VisitedExit[Dst]) {
        Queue.push(Dst);
        VisitedExit[Dst] = true;
      }
    }
  }

  // Make all blocks of zero weight so that flow is not sent
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.Weight == 0)
      continue;
    if (!VisitedEntry[I] || !VisitedExit[I]) {
      Block.Weight = 0;
      Block.HasUnknownWeight = true;
      Block.IsUnlikely = true;
      for (FlowJump *Jump : Block.SuccJumps) {
        if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
          Jump->Weight = 0;
          Jump->HasUnknownWeight = true;
          Jump->IsUnlikely = true;
        }
      }
    }
  }
}

/// Decide if stale profile matching can be applied for a given function.
/// Currently we skip inference for (very) large instances and for instances
/// having "unexpected" control flow (e.g., having no sink basic blocks).
bool canApplyInference(const FlowFunction &Func,
                       const yaml::bolt::BinaryFunctionProfile &YamlBF,
                       const uint64_t &MatchedBlocks) {
  if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
    return false;

  if (MatchedBlocks * 100 <
      opts::StaleMatchingMinMatchedBlock * YamlBF.Blocks.size())
    return false;

  // Returns false if the artificial sink block has no predecessors, meaning
  // there are no exit blocks.
  if (Func.Blocks[Func.Blocks.size() - 1].isEntry())
    return false;

  return true;
}
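
// Example of the percentage check above: with
// -stale-matching-min-matched-block=50 and a profile containing 20 blocks,
// MatchedBlocks * 100 < 50 * 20 rejects any function with fewer than 10
// matched blocks. The default threshold of 0 accepts every function.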

/// Apply the profile inference algorithm for a given flow function.
void applyInference(FlowFunction &Func) {
  ProfiParams Params;
  // Set the params from the command-line flags.
  Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
  Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
  Params.JoinIslands = opts::StaleMatchingJoinIslands;

  Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;

  Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
  Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;

  applyFlowInference(Params, Func);
}

/// Collect inferred counts from the flow function and update annotations in
/// the binary function.
void assignProfile(BinaryFunction &BF,
                   const BinaryFunction::BasicBlockOrderType &BlockOrder,
                   FlowFunction &Func) {
  BinaryContext &BC = BF.getBinaryContext();

  assert(Func.Blocks.size() == BlockOrder.size() + 2);
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    FlowBlock &Block = Func.Blocks[I + 1];
    BinaryBasicBlock *BB = BlockOrder[I];

    // Update block's count
    BB->setExecutionCount(Block.Flow);

    // Update jump counts: (i) clean existing counts and then (ii) set new ones
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *DstBB : BB->successors()) {
      (void)DstBB;
      BI->Count = 0;
      BI->MispredictedCount = 0;
      ++BI;
    }
    for (FlowJump *Jump : Block.SuccJumps) {
      if (Jump->IsUnlikely)
        continue;
      if (Jump->Flow == 0)
        continue;

      // Skips the artificial sink block.
      if (Jump->Target == Func.Blocks.size() - 1)
        continue;
      BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
      // Check if the edge corresponds to a regular jump or a landing pad
      if (BB->getSuccessor(SuccBB.getLabel())) {
        BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
        BI.Count += Jump->Flow;
      } else {
        BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
        if (LP && LP->getKnownExecutionCount() < Jump->Flow)
          LP->setExecutionCount(Jump->Flow);
      }
    }

    // Update call-site annotations
    auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
                                     uint64_t Count) {
      if (BC.MIB->hasAnnotation(Instr, Name))
        BC.MIB->removeAnnotation(Instr, Name);
      // Do not add zero-count annotations
      if (Count == 0)
        return;
      BC.MIB->addAnnotation(Instr, Name, Count);
    };

    for (MCInst &Instr : *BB) {
      // Ignore pseudo instructions
      if (BC.MIB->isPseudo(Instr))
        continue;
      // Ignore jump tables
      const MCInst *LastInstr = BB->getLastNonPseudoInstr();
      if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
        continue;

      if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
        auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
            Instr, "CallProfile");
        if (!ICSP.empty()) {
          // Try to evenly distribute the counts among the call sites
          const uint64_t TotalCount = Block.Flow;
          const uint64_t NumSites = ICSP.size();
          for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
            IndirectCallProfile &CSP = ICSP[Idx];
            uint64_t CountPerSite = TotalCount / NumSites;
            // When counts cannot be exactly distributed, increase by 1 the
            // counts of the first (TotalCount % NumSites) call sites
            if (Idx < TotalCount % NumSites)
              CountPerSite++;
            CSP.Count = CountPerSite;
          }
        } else {
          ICSP.emplace_back(nullptr, Block.Flow, 0);
        }
      } else if (BC.MIB->getConditionalTailCall(Instr)) {
        // We don't know exactly the number of times the conditional tail call
        // is executed; conservatively, set it to the count of the block.
        setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
        BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
      } else if (BC.MIB->isCall(Instr)) {
        setOrUpdateAnnotation(Instr, "Count", Block.Flow);
      }
    }
  }

  // Update function's execution count and mark the function inferred.
  BF.setExecutionCount(Func.Blocks[0].Flow);
  BF.setHasInferredProfile(true);
}

bool YAMLProfileReader::inferStaleProfile(
    BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {

  NamedRegionTimer T("inferStaleProfile", "stale profile inference", "rewrite",
                     "Rewrite passes", opts::TimeRewrite);

  if (!BF.hasCFG())
    return false;

  LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
                    << "\"" << BF.getPrintName() << "\"\n");

  // Make sure that block hashes are up to date.
  BF.computeBlockHashes(YamlBP.Header.HashFunction);

  const BinaryFunction::BasicBlockOrderType BlockOrder(
      BF.getLayout().block_begin(), BF.getLayout().block_end());

  // Create a wrapper flow function to use with the profile inference algorithm.
  FlowFunction Func = createFlowFunction(BlockOrder);

  // Match as many block/jump counts from the stale profile as possible;
  // MatchedBlocks tracks the number of matched blocks.
  size_t MatchedBlocks =
      matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func,
                           YamlBP.Header.HashFunction, IdToYamLBF);

  // Adjust the flow function by marking unreachable blocks Unlikely so that
  // they don't get any counts assigned.
  preprocessUnreachableBlocks(Func);

  // Check if profile inference can be applied for the instance.
  if (!canApplyInference(Func, YamlBF, MatchedBlocks))
    return false;

  // Apply the profile inference algorithm.
  applyInference(Func);

  // Collect inferred counts and update function annotations.
  assignProfile(BF, BlockOrder, Func);

  // As of now, we always mark the binary function as having a "correct"
  // profile. In the future, we may discard the results for instances with poor
  // inference metrics and keep such functions un-optimized.
  return true;
}

} // end namespace bolt
} // end namespace llvm