//===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// BOLT often has to deal with profiles collected on binaries built from several
// revisions behind release. As a result, a certain percentage of functions is
// considered stale and not optimized. This file implements the ability to match
// a profile to functions that are not 100% binary identical, thus increasing
// optimization coverage and boosting application performance.
//
// The algorithm consists of two phases, matching and inference:
// - In the matching phase, we try to "guess" as many block and jump counts from
//   the stale profile as possible. To this end, the content of each basic block
//   is hashed and stored in the (YAML) profile. When BOLT optimizes a binary,
//   it computes block hashes and identifies the corresponding entries in the
//   stale profile. This yields a partial profile for every CFG in the binary.
// - In the inference phase, we employ a network flow-based algorithm (profi) to
//   reconstruct "realistic" block and jump counts from the partial profile
//   generated in the first stage. In practice, we don't always produce proper
//   profile data, but the majority (e.g., >90%) of CFGs get correct counts.
//
//===----------------------------------------------------------------------===//
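//
// Stale matching is disabled by default and is gated by the
// -infer-stale-profile option defined below. A typical invocation looks
// something like
//   llvm-bolt <input-binary> -o <output-binary> -data=<profile>.yaml \
//       -infer-stale-profile
// (this example assumes a YAML profile; the exact flags depend on how the
// profile was collected).
//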
#include "bolt/Core/HashUtilities.h"
#include "bolt/Profile/YAMLProfileReader.h"
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/xxhash.h"
#include "llvm/Transforms/Utils/SampleProfileInference.h"

#include <queue>

#define DEBUG_TYPE "bolt-prof"

using namespace llvm;

namespace opts {

extern cl::OptionCategory BoltOptCategory;

cl::opt<bool>
    InferStaleProfile("infer-stale-profile",
                      cl::desc("Infer counts from stale profile data."),
                      cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingMaxFuncSize(
    "stale-matching-max-func-size",
    cl::desc("The maximum size of a function to consider for inference."),
    cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));

// Parameters of the profile inference algorithm. The default values are tuned
// on several benchmarks.
cl::opt<bool> StaleMatchingEvenFlowDistribution(
    "stale-matching-even-flow-distribution",
    cl::desc("Try to evenly distribute flow when there are multiple equally "
             "likely options."),
    cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingRebalanceUnknown(
    "stale-matching-rebalance-unknown",
    cl::desc("Evenly re-distribute flow among unknown subgraphs."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingJoinIslands(
    "stale-matching-join-islands",
    cl::desc("Join isolated components having positive flow."), cl::init(true),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockInc(
    "stale-matching-cost-block-inc",
    cl::desc("The cost of increasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockDec(
    "stale-matching-cost-block-dec",
    cl::desc("The cost of decreasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpInc(
    "stale-matching-cost-jump-inc",
    cl::desc("The cost of increasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpDec(
    "stale-matching-cost-jump-dec",
    cl::desc("The cost of decreasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
    "stale-matching-cost-block-unknown-inc",
    cl::desc("The cost of increasing an unknown block count by one."),
    cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
    "stale-matching-cost-jump-unknown-inc",
    cl::desc("The cost of increasing an unknown jump count by one."),
    cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
    "stale-matching-cost-jump-unknown-ft-inc",
    cl::desc(
        "The cost of increasing an unknown fall-through jump count by one."),
    cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));

} // namespace opts

namespace llvm {
namespace bolt {

/// An object wrapping several components of a basic block hash. The combined
/// (blended) hash is represented and stored as one uint64_t, while individual
/// components are of smaller size (e.g., uint16_t or uint8_t).
struct BlendedBlockHash {
  using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
  using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
  using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
  using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
  using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;
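
  // Layout of the combined 64-bit hash, from least- to most-significant bits:
  // [0, 16) Offset, [16, 32) OpcodeHash, [32, 48) InstrHash,
  // [48, 56) PredHash, [56, 64) SuccHash.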

  explicit BlendedBlockHash() {}

  explicit BlendedBlockHash(uint64_t Hash) {
    Offset = Bitfield::get<ValueOffset>(Hash);
    OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
    InstrHash = Bitfield::get<ValueInstr>(Hash);
    PredHash = Bitfield::get<ValuePred>(Hash);
    SuccHash = Bitfield::get<ValueSucc>(Hash);
  }
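
  // Note that the five components cover all 64 bits, so unpacking a hash with
  // the constructor above and re-packing it with combine() below yields the
  // original value.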

  /// Combine the blended hash into uint64_t.
  uint64_t combine() const {
    uint64_t Hash = 0;
    Bitfield::set<ValueOffset>(Hash, Offset);
    Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
    Bitfield::set<ValueInstr>(Hash, InstrHash);
    Bitfield::set<ValuePred>(Hash, PredHash);
    Bitfield::set<ValueSucc>(Hash, SuccHash);
    return Hash;
  }

  /// Compute a distance between two given blended hashes. The smaller the
  /// distance, the more similar the two blocks are. For identical basic
  /// blocks, the distance is zero.
  uint64_t distance(const BlendedBlockHash &BBH) const {
    assert(OpcodeHash == BBH.OpcodeHash &&
           "incorrect blended hash distance computation");
    uint64_t Dist = 0;
    // Account for NeighborHash
    Dist += SuccHash == BBH.SuccHash ? 0 : 1;
    Dist += PredHash == BBH.PredHash ? 0 : 1;
    Dist <<= 1;
    // Account for InstrHash
    Dist += InstrHash == BBH.InstrHash ? 0 : 1;
    Dist <<= 1;
    // Account for Offset
    Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
    return Dist;
  }

  /// The offset of the basic block from the function start.
  uint16_t Offset{0};
  /// (Loose) Hash of the basic block instructions, excluding operands.
  uint16_t OpcodeHash{0};
  /// (Strong) Hash of the basic block instructions, including opcodes and
  /// operands.
  uint16_t InstrHash{0};
  /// (Loose) Hashes of the predecessors of the basic block.
  uint8_t PredHash{0};
  /// (Loose) Hashes of the successors of the basic block.
  uint8_t SuccHash{0};
};

/// The object is used to identify and match basic blocks in a BinaryFunction
/// given their hashes computed on a binary built from several revisions behind
/// release.
class StaleMatcher {
public:
  /// Initialize stale matcher.
  void init(const std::vector<FlowBlock *> &Blocks,
            const std::vector<BlendedBlockHash> &Hashes) {
    assert(Blocks.size() == Hashes.size() &&
           "incorrect matcher initialization");
    for (size_t I = 0; I < Blocks.size(); I++) {
      FlowBlock *Block = Blocks[I];
      uint16_t OpHash = Hashes[I].OpcodeHash;
      OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
    }
  }

  /// Find the most similar block for a given hash.
  const FlowBlock *matchBlock(BlendedBlockHash BlendedHash) const {
    auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
    if (BlockIt == OpHashToBlocks.end())
      return nullptr;
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.distance(BlendedHash);
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
      }
    }
    return BestBlock;
  }

  /// Returns true if the two basic blocks (in the binary and in the profile)
  /// corresponding to the given hashes are matched to each other with a high
  /// confidence.
  static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
                                    BlendedBlockHash Hash2) {
    return Hash1.InstrHash == Hash2.InstrHash;
  }

private:
  using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
  std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
};

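// Compute a blended hash for every basic block of the function and attach the
// combined value to the block via setHash(). The same hashing scheme is used
// when block hashes are emitted into the YAML profile, which is what allows
// StaleMatcher to compare blocks of the current binary against blocks recorded
// in a stale profile.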
void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const {
  assert(hasCFG() && "the function is expected to have CFG");

  std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
  std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
  // Initialize hash components.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    assert(BB->getIndex() == I && "incorrect block index");
    BlendedHashes[I].Offset = BB->getOffset();
    // Hashing complete instructions.
    std::string InstrHashStr = hashBlock(
        BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
    if (HashFunction == HashFunction::StdHash) {
      uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
    } else if (HashFunction == HashFunction::XXH3) {
      uint64_t InstrHash = llvm::xxh3_64bits(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)InstrHash;
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
    // Hashing opcodes.
    std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
    if (HashFunction == HashFunction::StdHash) {
      OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
    } else if (HashFunction == HashFunction::XXH3) {
      OpcodeHashes[I] = llvm::xxh3_64bits(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)OpcodeHashes[I];
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
  }

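  // At this point OpcodeHash (the loose, operand-free hash) and InstrHash (the
  // strong hash over complete instructions) are set for every block; the loop
  // below derives the neighbor components, PredHash and SuccHash, from the
  // opcode hashes of adjacent blocks.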
  // Initialize neighbor hash.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    // Append hashes of successors.
    uint64_t Hash = 0;
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].SuccHash = (uint8_t)Hash;
    }

    // Append hashes of predecessors.
    Hash = 0;
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].PredHash = (uint8_t)Hash;
    }
  }

  // Assign the computed hashes to the basic blocks.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    BB->setHash(BlendedHashes[I].combine());
  }
}

/// Create a wrapper flow function to use with the profile inference algorithm,
/// and initialize its jumps and metadata.
FlowFunction
createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
  FlowFunction Func;

  // Add a special "dummy" source so that there is always a unique entry point.
  // Because of the extra source, for all other blocks in FlowFunction it holds
  // that Block.Index == BB->getIndex() + 1.
  FlowBlock EntryBlock;
  EntryBlock.Index = 0;
  Func.Blocks.push_back(EntryBlock);

  // Create a FlowBlock for every basic block in the binary function.
  for (const BinaryBasicBlock *BB : BlockOrder) {
    Func.Blocks.emplace_back();
    FlowBlock &Block = Func.Blocks.back();
    Block.Index = Func.Blocks.size() - 1;
    (void)BB;
    assert(Block.Index == BB->getIndex() + 1 &&
           "incorrectly assigned basic block index");
  }

  // Create a FlowJump for each jump between basic blocks in the binary
  // function.
  std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
  for (const BinaryBasicBlock *SrcBB : BlockOrder) {
    std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
    // Collect regular jumps.
    for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
      // Ignore parallel edges.
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
    // Collect jumps to landing pads.
    for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
      // Ignore parallel edges.
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
  }

  // Add dummy edges to the extra sources. If there are multiple entry blocks,
  // add an unlikely edge from 0 to the subsequent ones.
  assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
  for (uint64_t I = 1; I < Func.Blocks.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I - 1];
    if (BB->isEntryPoint() || InDegree[I] == 0) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = 0;
      Jump.Target = I;
      if (!BB->isEntryPoint())
        Jump.IsUnlikely = true;
    }
  }

  // Create necessary metadata for the flow function.
  for (FlowJump &Jump : Func.Jumps) {
    Func.Blocks.at(Jump.Source).SuccJumps.push_back(&Jump);
    Func.Blocks.at(Jump.Target).PredJumps.push_back(&Jump);
  }
  return Func;
}

/// Assign initial block/jump weights based on the stale profile data. The goal
/// is to extract as much information from the stale profile as possible. Here
/// we assume that each basic block is specified via a hash value computed from
/// its content, and that the hashes of unchanged basic blocks stay the same
/// across different revisions of the binary.
/// Whenever there is a count in the profile with a hash corresponding to one
/// of the basic blocks in the binary, the count is "matched" to the block.
/// Similarly, if both the source and the target of a count in the profile are
/// matched to a jump in the binary, the count is recorded in the CFG.
void matchWeightsByHashes(BinaryContext &BC,
                          const BinaryFunction::BasicBlockOrderType &BlockOrder,
                          const yaml::bolt::BinaryFunctionProfile &YamlBF,
                          FlowFunction &Func) {
  assert(Func.Blocks.size() == BlockOrder.size() + 1);

  std::vector<FlowBlock *> Blocks;
  std::vector<BlendedBlockHash> BlendedHashes;
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I];
    assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");
    Blocks.push_back(&Func.Blocks[I + 1]);
    BlendedBlockHash BlendedHash(BB->getHash());
    BlendedHashes.push_back(BlendedHash);
    LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                      << Twine::utohexstr(BB->getHash()) << "\n");
  }
  StaleMatcher Matcher;
  Matcher.init(Blocks, BlendedHashes);

  // Index in yaml profile => corresponding (matched) block
  DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;
  // Match blocks from the profile to the blocks in CFG
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);
    const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash);
    // Always match the entry block.
    if (MatchedBlock == nullptr && YamlBB.Index == 0)
      MatchedBlock = Blocks[0];
    if (MatchedBlock != nullptr) {
      MatchedBlocks[YamlBB.Index] = MatchedBlock;
      BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1];
      LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBB.Index << ")"
                        << " with hash " << Twine::utohexstr(YamlBB.Hash)
                        << " to BB (index = " << MatchedBlock->Index - 1 << ")"
                        << " with hash " << Twine::utohexstr(BinHash.combine())
                        << "\n");
      // Update matching stats accounting for the matched block.
      if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) {
        ++BC.Stats.NumMatchedBlocks;
        BC.Stats.MatchedSampleCount += YamlBB.ExecCount;
        LLVM_DEBUG(dbgs() << "  exact match\n");
      } else {
        LLVM_DEBUG(dbgs() << "  loose match\n");
      }
    } else {
      LLVM_DEBUG(
          dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index << ")"
                 << " with hash " << Twine::utohexstr(YamlBB.Hash) << "\n");
    }

    // Update matching stats.
    ++BC.Stats.NumStaleBlocks;
    BC.Stats.StaleSampleCount += YamlBB.ExecCount;
  }

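  // Note: NumStaleBlocks/StaleSampleCount above are updated for every block in
  // the stale profile, whereas NumMatchedBlocks/MatchedSampleCount only account
  // for high-confidence (InstrHash) matches.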
  // Match jumps from the profile to the jumps from CFG
  std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
  std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
      if (YamlSI.Count == 0)
        continue;

      // Try to find the jump for a given (src, dst) pair from the profile and
      // assign the jump weight based on the profile count
      const uint64_t SrcIndex = YamlBB.Index;
      const uint64_t DstIndex = YamlSI.Index;

      const FlowBlock *MatchedSrcBlock = MatchedBlocks.lookup(SrcIndex);
      const FlowBlock *MatchedDstBlock = MatchedBlocks.lookup(DstIndex);

      if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
        // Find a jump between the two blocks
        FlowJump *Jump = nullptr;
        for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
          if (SuccJump->Target == MatchedDstBlock->Index) {
            Jump = SuccJump;
            break;
          }
        }
        // Assign the weight, if the corresponding jump is found
        if (Jump != nullptr) {
          Jump->Weight = YamlSI.Count;
          Jump->HasUnknownWeight = false;
        }
      }
      // Assign the weight for the src block, if it is found
      if (MatchedSrcBlock != nullptr)
        OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
      // Assign the weight for the dst block, if it is found
      if (MatchedDstBlock != nullptr)
        InWeight[MatchedDstBlock->Index] += YamlSI.Count;
    }
  }

  // Assign block counts based on in-/out- jumps
  for (FlowBlock &Block : Func.Blocks) {
    if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
      assert(Block.HasUnknownWeight && "unmatched block with a positive count");
      continue;
    }
    Block.HasUnknownWeight = false;
    Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
  }
}

/// The function finds all blocks that are (i) not reachable from the entry
/// block or (ii) do not have a path to an exit, and marks all such blocks
/// 'cold' so that profi does not send any flow to them.
void preprocessUnreachableBlocks(FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();

  // Start bfs from the source.
  std::queue<uint64_t> Queue;
  std::vector<bool> VisitedEntry(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      Queue.push(I);
      VisitedEntry[I] = true;
      break;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
      const uint64_t Dst = Jump->Target;
      if (!VisitedEntry[Dst]) {
        Queue.push(Dst);
        VisitedEntry[Dst] = true;
      }
    }
  }

  // Start bfs from all sinks.
  std::vector<bool> VisitedExit(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isExit() && VisitedEntry[I]) {
      Queue.push(I);
      VisitedExit[I] = true;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
      const uint64_t Dst = Jump->Source;
      if (!VisitedExit[Dst]) {
        Queue.push(Dst);
        VisitedExit[Dst] = true;
      }
    }
  }

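  // A block is kept "live" only if it is both reachable from the entry
  // (VisitedEntry) and able to reach an exit (VisitedExit); any other block
  // carrying a positive weight is zeroed out and marked unlikely below.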
  // Make all blocks of zero weight so that flow is not sent
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.Weight == 0)
      continue;
    if (!VisitedEntry[I] || !VisitedExit[I]) {
      Block.Weight = 0;
      Block.HasUnknownWeight = true;
      Block.IsUnlikely = true;
      for (FlowJump *Jump : Block.SuccJumps) {
        if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
          Jump->Weight = 0;
          Jump->HasUnknownWeight = true;
          Jump->IsUnlikely = true;
        }
      }
    }
  }
}

/// Decide if stale profile matching can be applied for a given function.
/// Currently we skip inference for (very) large instances and for instances
/// having "unexpected" control flow (e.g., having no sink basic blocks).
bool canApplyInference(const FlowFunction &Func) {
  if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
    return false;

  bool HasExitBlocks = llvm::any_of(
      Func.Blocks, [&](const FlowBlock &Block) { return Block.isExit(); });
  if (!HasExitBlocks)
    return false;

  return true;
}

/// Apply the profile inference algorithm for a given flow function.
void applyInference(FlowFunction &Func) {
  ProfiParams Params;
  // Set the params from the command-line flags.
  Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
  Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
  Params.JoinIslands = opts::StaleMatchingJoinIslands;

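  // The entry-specific and fall-through-specific costs below reuse the generic
  // block/jump costs; no separate command-line knobs are exposed for them.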
  Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;

  Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
  Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;

  applyFlowInference(Params, Func);
}

/// Collect inferred counts from the flow function and update annotations in
/// the binary function.
void assignProfile(BinaryFunction &BF,
                   const BinaryFunction::BasicBlockOrderType &BlockOrder,
                   FlowFunction &Func) {
  BinaryContext &BC = BF.getBinaryContext();

  assert(Func.Blocks.size() == BlockOrder.size() + 1);
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    FlowBlock &Block = Func.Blocks[I + 1];
    BinaryBasicBlock *BB = BlockOrder[I];

    // Update block's count
    BB->setExecutionCount(Block.Flow);

    // Update jump counts: (i) clean existing counts and then (ii) set new ones
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *DstBB : BB->successors()) {
      (void)DstBB;
      BI->Count = 0;
      BI->MispredictedCount = 0;
      ++BI;
    }
    for (FlowJump *Jump : Block.SuccJumps) {
      if (Jump->IsUnlikely)
        continue;

      BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
      // Check if the edge corresponds to a regular jump or a landing pad
      if (BB->getSuccessor(SuccBB.getLabel())) {
        BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
        BI.Count += Jump->Flow;
      } else {
        BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
        if (LP && LP->getKnownExecutionCount() < Jump->Flow)
          LP->setExecutionCount(Jump->Flow);
      }
    }

    // Update call-site annotations
    auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
                                     uint64_t Count) {
      if (BC.MIB->hasAnnotation(Instr, Name))
        BC.MIB->removeAnnotation(Instr, Name);
      // Do not add zero-count annotations
      if (Count == 0)
        return;
      BC.MIB->addAnnotation(Instr, Name, Count);
    };

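    // Visit all instructions of the block and refresh the profile annotations
    // on calls, indirect calls/branches, and conditional tail calls.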
    for (MCInst &Instr : *BB) {
      // Ignore pseudo instructions
      if (BC.MIB->isPseudo(Instr))
        continue;
      // Ignore jump tables
      const MCInst *LastInstr = BB->getLastNonPseudoInstr();
      if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
        continue;

      if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
        auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
            Instr, "CallProfile");
        if (!ICSP.empty()) {
          // Try to evenly distribute the counts among the call sites
          const uint64_t TotalCount = Block.Flow;
          const uint64_t NumSites = ICSP.size();
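          // For example, with TotalCount = 10 and NumSites = 3, the loop below
          // assigns counts 4, 3, and 3.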
          for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
            IndirectCallProfile &CSP = ICSP[Idx];
            uint64_t CountPerSite = TotalCount / NumSites;
            // When counts cannot be exactly distributed, increase by 1 the
            // counts of the first (TotalCount % NumSites) call sites
            if (Idx < TotalCount % NumSites)
              CountPerSite++;
            CSP.Count = CountPerSite;
          }
        } else {
          ICSP.emplace_back(nullptr, Block.Flow, 0);
        }
      } else if (BC.MIB->getConditionalTailCall(Instr)) {
        // We don't know exactly the number of times the conditional tail call
        // is executed; conservatively, set it to the count of the block.
        setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
        BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
      } else if (BC.MIB->isCall(Instr)) {
        setOrUpdateAnnotation(Instr, "Count", Block.Flow);
      }
    }
  }

  // Update function's execution count and mark the function inferred.
  BF.setExecutionCount(Func.Blocks[0].Flow);
  BF.setHasInferredProfile(true);
}

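// Entry point of stale matching: builds the flow function, matches counts from
// the stale YAML profile, runs the inference, and writes the resulting counts
// back into the binary function. Returns false if inference is not applicable.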
bool YAMLProfileReader::inferStaleProfile(
    BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {
  LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
                    << "\"" << BF.getPrintName() << "\"\n");

  // Make sure that block hashes are up to date.
  BF.computeBlockHashes(YamlBP.Header.HashFunction);

  const BinaryFunction::BasicBlockOrderType BlockOrder(
      BF.getLayout().block_begin(), BF.getLayout().block_end());

  // Create a wrapper flow function to use with the profile inference algorithm.
  FlowFunction Func = createFlowFunction(BlockOrder);

  // Match as many block/jump counts from the stale profile as possible.
  matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func);

  // Adjust the flow function by marking unreachable blocks Unlikely so that
  // they don't get any counts assigned.
  preprocessUnreachableBlocks(Func);

  // Check if profile inference can be applied for the instance.
  if (!canApplyInference(Func))
    return false;

  // Apply the profile inference algorithm.
  applyInference(Func);

  // Collect inferred counts and update function annotations.
  assignProfile(BF, BlockOrder, Func);

  // As of now, we always mark the binary function as having a "correct"
  // profile. In the future, we may discard the results for instances with poor
  // inference metrics and keep such functions unoptimized.
  return true;
}

} // end namespace bolt
} // end namespace llvm