//===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// BOLT often has to deal with profiles collected on binaries built from several
// revisions behind release. As a result, a certain percentage of functions is
// considered stale and not optimized. This file implements the ability to match
// a profile to functions that are not 100% binary identical, thus increasing
// the optimization coverage and boosting the performance of applications.
//
// The algorithm consists of two phases: matching and inference:
// - At the matching phase, we try to "guess" as many block and jump counts from
//   the stale profile as possible. To this end, the content of each basic block
//   is hashed and stored in the (yaml) profile. When BOLT optimizes a binary,
//   it computes block hashes and identifies the corresponding entries in the
//   stale profile. This yields a partial profile for every CFG in the binary.
// - At the inference phase, we employ a network flow-based algorithm (profi) to
//   reconstruct "realistic" block and jump counts from the partial profile
//   generated at the first stage. In practice, we don't always produce proper
//   profile data but the majority (e.g., >90%) of CFGs get the correct counts.
//
//===----------------------------------------------------------------------===//
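
// A rough sketch of how the helpers defined below fit together; it mirrors
// the driver in YAMLProfileReader::inferStaleProfile at the bottom of this
// file (all names referenced here are the ones defined in this file):
//
//   FlowFunction Func = createFlowFunction(BlockOrder);
//   size_t Matched = matchWeightsByHashes(BC, BlockOrder, YamlBF, Func,
//                                         HashFunction, IdToYamlBF);
//   preprocessUnreachableBlocks(Func);
//   if (canApplyInference(Func, YamlBF, Matched)) {
//     applyInference(Func);
//     assignProfile(BF, BlockOrder, Func);
//   }
//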
#include "bolt/Core/HashUtilities.h"
#include "bolt/Profile/YAMLProfileReader.h"
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/xxhash.h"
#include "llvm/Transforms/Utils/SampleProfileInference.h"

#include <queue>

using namespace llvm;

#define DEBUG_TYPE "bolt-prof"

namespace opts {

extern cl::opt<bool> TimeRewrite;
extern cl::OptionCategory BoltOptCategory;

cl::opt<bool>
    InferStaleProfile("infer-stale-profile",
                      cl::desc("Infer counts from stale profile data."),
                      cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingMinMatchedBlock(
    "stale-matching-min-matched-block",
    cl::desc("Percentage threshold of matched basic blocks at which stale "
             "profile inference is executed."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingMaxFuncSize(
    "stale-matching-max-func-size",
    cl::desc("The maximum size of a function to consider for inference."),
    cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));

// Parameters of the profile inference algorithm. The default values are tuned
// on several benchmarks.
cl::opt<bool> StaleMatchingEvenFlowDistribution(
    "stale-matching-even-flow-distribution",
    cl::desc("Try to evenly distribute flow when there are multiple equally "
             "likely options."),
    cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingRebalanceUnknown(
    "stale-matching-rebalance-unknown",
    cl::desc("Evenly re-distribute flow among unknown subgraphs."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingJoinIslands(
    "stale-matching-join-islands",
    cl::desc("Join isolated components having positive flow."), cl::init(true),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockInc(
    "stale-matching-cost-block-inc",
    cl::desc("The cost of increasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockDec(
    "stale-matching-cost-block-dec",
    cl::desc("The cost of decreasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpInc(
    "stale-matching-cost-jump-inc",
    cl::desc("The cost of increasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpDec(
    "stale-matching-cost-jump-dec",
    cl::desc("The cost of decreasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
    "stale-matching-cost-block-unknown-inc",
    cl::desc("The cost of increasing an unknown block count by one."),
    cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
    "stale-matching-cost-jump-unknown-inc",
    cl::desc("The cost of increasing an unknown jump count by one."),
    cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
    "stale-matching-cost-jump-unknown-ft-inc",
    cl::desc(
        "The cost of increasing an unknown fall-through jump count by one."),
    cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));

} // namespace opts

namespace llvm {
namespace bolt {

/// An object wrapping several components of a basic block hash. The combined
/// (blended) hash is represented and stored as one uint64_t, while individual
/// components are of smaller size (e.g., uint16_t or uint8_t).
struct BlendedBlockHash {
  using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
  using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
  using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
  using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
  using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;

  explicit BlendedBlockHash() {}

  explicit BlendedBlockHash(uint64_t Hash) {
    Offset = Bitfield::get<ValueOffset>(Hash);
    OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
    InstrHash = Bitfield::get<ValueInstr>(Hash);
    PredHash = Bitfield::get<ValuePred>(Hash);
    SuccHash = Bitfield::get<ValueSucc>(Hash);
  }

  /// Combine the blended hash into uint64_t.
  uint64_t combine() const {
    uint64_t Hash = 0;
    Bitfield::set<ValueOffset>(Hash, Offset);
    Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
    Bitfield::set<ValueInstr>(Hash, InstrHash);
    Bitfield::set<ValuePred>(Hash, PredHash);
    Bitfield::set<ValueSucc>(Hash, SuccHash);
    return Hash;
  }

  /// Compute a distance between two given blended hashes. The smaller the
  /// distance, the more similar two blocks are. For identical basic blocks,
  /// the distance is zero.
  uint64_t distance(const BlendedBlockHash &BBH) const {
    assert(OpcodeHash == BBH.OpcodeHash &&
           "incorrect blended hash distance computation");
    uint64_t Dist = 0;
    // Account for NeighborHash
    Dist += SuccHash == BBH.SuccHash ? 0 : 1;
    Dist += PredHash == BBH.PredHash ? 0 : 1;
    Dist <<= 1;
    // Account for InstrHash
    Dist += InstrHash == BBH.InstrHash ? 0 : 1;
    Dist <<= 1;
    // Account for Offset
    Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
    return Dist;
  }

  /// The offset of the basic block from the function start.
  uint16_t Offset{0};
  /// (Loose) Hash of the basic block instructions, excluding operands.
  uint16_t OpcodeHash{0};
  /// (Strong) Hash of the basic block instructions, including opcodes and
  /// operands.
  uint16_t InstrHash{0};
  /// (Loose) Hashes of the predecessors of the basic block.
  uint8_t PredHash{0};
  /// (Loose) Hashes of the successors of the basic block.
  uint8_t SuccHash{0};
};
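
// A small illustration of the layout (based on the Bitfield elements above):
// the packed hash stores Offset in bits [0, 16), OpcodeHash in [16, 32),
// InstrHash in [32, 48), PredHash in [48, 56) and SuccHash in [56, 64), so
// combine() and the uint64_t constructor round-trip each other:
//
//   BlendedBlockHash BBH;
//   BBH.Offset = 0x10;
//   BBH.OpcodeHash = 0xabcd;
//   BlendedBlockHash RoundTrip(BBH.combine());
//   // RoundTrip.Offset == 0x10 && RoundTrip.OpcodeHash == 0xabcd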

/// The object is used to identify and match basic blocks in a BinaryFunction
/// given their hashes computed on a binary built from several revisions behind
/// release.
class StaleMatcher {
public:
  /// Initialize stale matcher.
  void init(const std::vector<FlowBlock *> &Blocks,
            const std::vector<BlendedBlockHash> &Hashes,
            const std::vector<uint64_t> &CallHashes) {
    assert(Blocks.size() == Hashes.size() &&
           Hashes.size() == CallHashes.size() &&
           "incorrect matcher initialization");
    for (size_t I = 0; I < Blocks.size(); I++) {
      FlowBlock *Block = Blocks[I];
      uint16_t OpHash = Hashes[I].OpcodeHash;
      OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
      if (CallHashes[I])
        CallHashToBlocks[CallHashes[I]].push_back(
            std::make_pair(Hashes[I], Block));
    }
  }

  /// Find the most similar block for a given hash.
  const FlowBlock *matchBlock(BlendedBlockHash BlendedHash,
                              uint64_t CallHash) const {
    const FlowBlock *BestBlock = matchWithOpcodes(BlendedHash);
    return BestBlock ? BestBlock : matchWithCalls(BlendedHash, CallHash);
  }

  /// Returns true if the two basic blocks (in the binary and in the profile)
  /// corresponding to the given hashes are matched to each other with a high
  /// confidence.
  static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
                                    BlendedBlockHash Hash2) {
    return Hash1.InstrHash == Hash2.InstrHash;
  }

private:
  using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
  std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
  std::unordered_map<uint64_t, std::vector<HashBlockPairType>> CallHashToBlocks;

  // Uses OpcodeHash to find the most similar block for a given hash.
  const FlowBlock *matchWithOpcodes(BlendedBlockHash BlendedHash) const {
    auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
    if (BlockIt == OpHashToBlocks.end())
      return nullptr;
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.distance(BlendedHash);
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
      }
    }
    return BestBlock;
  }

  // Uses CallHash to find the most similar block for a given hash.
  const FlowBlock *matchWithCalls(BlendedBlockHash BlendedHash,
                                  uint64_t CallHash) const {
    if (!CallHash)
      return nullptr;
    auto BlockIt = CallHashToBlocks.find(CallHash);
    if (BlockIt == CallHashToBlocks.end())
      return nullptr;
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.OpcodeHash > BlendedHash.OpcodeHash
                          ? Hash.OpcodeHash - BlendedHash.OpcodeHash
                          : BlendedHash.OpcodeHash - Hash.OpcodeHash;
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
      }
    }
    return BestBlock;
  }
};
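
// A minimal usage sketch for StaleMatcher (matchWeightsByHashes below does
// exactly this, plus bookkeeping): initialize the matcher with the binary's
// blocks and their hashes, then query it with a hash from the stale profile:
//
//   StaleMatcher Matcher;
//   Matcher.init(Blocks, BlendedHashes, CallHashes);
//   const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash, CallHash);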

void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const {
  assert(hasCFG() && "the function is expected to have CFG");

  std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
  std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
  // Initialize hash components.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    assert(BB->getIndex() == I && "incorrect block index");
    BlendedHashes[I].Offset = BB->getOffset();
    // Hashing complete instructions.
    std::string InstrHashStr = hashBlock(
        BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
    if (HashFunction == HashFunction::StdHash) {
      uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
    } else if (HashFunction == HashFunction::XXH3) {
      uint64_t InstrHash = llvm::xxh3_64bits(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)InstrHash;
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
    // Hashing opcodes only (loose hash).
    std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
    if (HashFunction == HashFunction::StdHash) {
      OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
    } else if (HashFunction == HashFunction::XXH3) {
      OpcodeHashes[I] = llvm::xxh3_64bits(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)OpcodeHashes[I];
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
  }

  // Initialize neighbor hash.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    // Append hashes of successors.
    uint64_t Hash = 0;
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].SuccHash = (uint8_t)Hash;
    }

    // Append hashes of predecessors.
    Hash = 0;
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].PredHash = (uint8_t)Hash;
    }
  }

  // Assign the combined (blended) hashes to the basic blocks.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    BB->setHash(BlendedHashes[I].combine());
  }
}

// TODO: mediate the difference between flow function construction here in BOLT
// and in the compiler by splitting blocks with exception throwing calls at the
// call and adding the landing pad as the successor.
/// Create a wrapper flow function to use with the profile inference algorithm,
/// and initialize its jumps and metadata.
FlowFunction
createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
  FlowFunction Func;

  // Add a special "dummy" source so that there is always a unique entry point.
  FlowBlock EntryBlock;
  EntryBlock.Index = 0;
  Func.Blocks.push_back(EntryBlock);

  // Create FlowBlock for every basic block in the binary function.
  for (const BinaryBasicBlock *BB : BlockOrder) {
    Func.Blocks.emplace_back();
    FlowBlock &Block = Func.Blocks.back();
    Block.Index = Func.Blocks.size() - 1;
    (void)BB;
    assert(Block.Index == BB->getIndex() + 1 &&
           "incorrectly assigned basic block index");
  }

  // Add a special "dummy" sink block so there is always a unique sink.
  FlowBlock SinkBlock;
  SinkBlock.Index = Func.Blocks.size();
  Func.Blocks.push_back(SinkBlock);

  // Create FlowJump for each jump between basic blocks in the binary function.
  std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
  for (const BinaryBasicBlock *SrcBB : BlockOrder) {
    std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
    // Collect regular jumps
    for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
    // TODO: set jump from exit block to landing pad to Unlikely.
    // If the block is an exit, add a dummy edge from it to the sink block.
    if (UniqueSuccs.empty()) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = Func.Blocks.size() - 1;
      InDegree[Jump.Target]++;
    }

    // Collect jumps to landing pads
    for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
  }

  // Add dummy edges to the extra sources. If there are multiple entry blocks,
  // add an unlikely edge from 0 to the subsequent ones. Skips the sink block.
  assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
  for (uint64_t I = 1; I < Func.Blocks.size() - 1; I++) {
    const BinaryBasicBlock *BB = BlockOrder[I - 1];
    if (BB->isEntryPoint() || InDegree[I] == 0) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = 0;
      Jump.Target = I;
      if (!BB->isEntryPoint())
        Jump.IsUnlikely = true;
    }
  }

  // Create necessary metadata for the flow function
  for (FlowJump &Jump : Func.Jumps) {
    assert(Jump.Source < Func.Blocks.size());
    Func.Blocks[Jump.Source].SuccJumps.push_back(&Jump);
    assert(Jump.Target < Func.Blocks.size());
    Func.Blocks[Jump.Target].PredJumps.push_back(&Jump);
  }
  return Func;
}
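
// For illustration, a two-block function BB0 -> BB1 yields a flow function
// with four blocks: index 0 is the dummy source, indices 1 and 2 wrap BB0 and
// BB1 (the binary block index shifted by one), and index 3 is the dummy sink.
// The jumps are 0->1 (dummy entry edge), 1->2 (the original CFG edge), and
// 2->3 (the exit edge to the dummy sink).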

/// Assign initial block/jump weights based on the stale profile data. The goal
/// is to extract as much information from the stale profile as possible. Here
/// we assume that each basic block is specified via a hash value computed from
/// its content and the hashes of the unchanged basic blocks stay the same
/// across different revisions of the binary.
/// Whenever there is a count in the profile with the hash corresponding to one
/// of the basic blocks in the binary, the count is "matched" to the block.
/// Similarly, if both the source and the target of a count in the profile are
/// matched to a jump in the binary, the count is recorded in CFG.
size_t
matchWeightsByHashes(BinaryContext &BC,
                     const BinaryFunction::BasicBlockOrderType &BlockOrder,
                     const yaml::bolt::BinaryFunctionProfile &YamlBF,
                     FlowFunction &Func, HashFunction HashFunction,
                     YAMLProfileReader::ProfileLookupMap &IdToYamlBF) {
  assert(Func.Blocks.size() == BlockOrder.size() + 2);

  std::vector<uint64_t> CallHashes;
  std::vector<FlowBlock *> Blocks;
  std::vector<BlendedBlockHash> BlendedHashes;
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I];
    assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");

    std::string CallHashStr = hashBlockCalls(BC, *BB);
    if (CallHashStr.empty()) {
      CallHashes.push_back(0);
    } else {
      if (HashFunction == HashFunction::StdHash)
        CallHashes.push_back(std::hash<std::string>{}(CallHashStr));
      else if (HashFunction == HashFunction::XXH3)
        CallHashes.push_back(llvm::xxh3_64bits(CallHashStr));
      else
        llvm_unreachable("Unhandled HashFunction");
    }

    Blocks.push_back(&Func.Blocks[I + 1]);
    BlendedBlockHash BlendedHash(BB->getHash());
    BlendedHashes.push_back(BlendedHash);
    LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                      << Twine::utohexstr(BB->getHash()) << "\n");
  }
  StaleMatcher Matcher;
  Matcher.init(Blocks, BlendedHashes, CallHashes);

  // Index in yaml profile => corresponding (matched) block
  DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;
  // Match blocks from the profile to the blocks in CFG
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);

    const FlowBlock *MatchedBlock = nullptr;
    std::string CallHashStr = hashBlockCalls(IdToYamlBF, YamlBB);
    uint64_t CallHash = 0;
    if (!CallHashStr.empty()) {
      if (HashFunction == HashFunction::StdHash)
        CallHash = std::hash<std::string>{}(CallHashStr);
      else if (HashFunction == HashFunction::XXH3)
        CallHash = llvm::xxh3_64bits(CallHashStr);
      else
        llvm_unreachable("Unhandled HashFunction");
    }
    MatchedBlock = Matcher.matchBlock(YamlHash, CallHash);
    if (MatchedBlock == nullptr && YamlBB.Index == 0)
      MatchedBlock = Blocks[0];
    if (MatchedBlock != nullptr) {
      const BinaryBasicBlock *BB = BlockOrder[MatchedBlock->Index - 1];
      MatchedBlocks[YamlBB.Index] = MatchedBlock;
      BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1];
      LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBB.Index << ")"
                        << " with hash " << Twine::utohexstr(YamlBB.Hash)
                        << " to BB (index = " << MatchedBlock->Index - 1 << ")"
                        << " with hash " << Twine::utohexstr(BinHash.combine())
                        << "\n");
      // Update matching stats accounting for the matched block.
      if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) {
        ++BC.Stats.NumMatchedBlocks;
        BC.Stats.MatchedSampleCount += YamlBB.ExecCount;
        LLVM_DEBUG(dbgs() << "  exact match\n");
      } else {
        LLVM_DEBUG(dbgs() << "  loose match\n");
      }
      if (YamlBB.NumInstructions == BB->size())
        ++BC.Stats.NumStaleBlocksWithEqualIcount;
    } else {
      LLVM_DEBUG(
          dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index << ")"
                 << " with hash " << Twine::utohexstr(YamlBB.Hash) << "\n");
    }

    // Update matching stats.
    ++BC.Stats.NumStaleBlocks;
    BC.Stats.StaleSampleCount += YamlBB.ExecCount;
  }

  // Match jumps from the profile to the jumps from CFG
  std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
  std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
      if (YamlSI.Count == 0)
        continue;

      // Try to find the jump for a given (src, dst) pair from the profile and
      // assign the jump weight based on the profile count
      const uint64_t SrcIndex = YamlBB.Index;
      const uint64_t DstIndex = YamlSI.Index;

      const FlowBlock *MatchedSrcBlock = MatchedBlocks.lookup(SrcIndex);
      const FlowBlock *MatchedDstBlock = MatchedBlocks.lookup(DstIndex);

      if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
        // Find a jump between the two blocks
        FlowJump *Jump = nullptr;
        for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
          if (SuccJump->Target == MatchedDstBlock->Index) {
            Jump = SuccJump;
            break;
          }
        }
        // Assign the weight, if the corresponding jump is found
        if (Jump != nullptr) {
          Jump->Weight = YamlSI.Count;
          Jump->HasUnknownWeight = false;
        }
      }
      // Assign the weight for the src block, if it is found
      if (MatchedSrcBlock != nullptr)
        OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
      // Assign the weight for the dst block, if it is found
      if (MatchedDstBlock != nullptr)
        InWeight[MatchedDstBlock->Index] += YamlSI.Count;
    }
  }

  // Assign block counts based on in-/out- jumps
  for (FlowBlock &Block : Func.Blocks) {
    if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
      assert(Block.HasUnknownWeight && "unmatched block with a positive count");
      continue;
    }
    Block.HasUnknownWeight = false;
    Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
  }

  return MatchedBlocks.size();
}
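
// Note on the weight rule above: a matched block receives
// max(sum of matched outgoing counts, sum of matched incoming counts).
// For example, matched in-counts of 40 and 60 with a single matched out-count
// of 90 give an initial weight of max(100, 90) = 100; reconciling the
// remaining imbalance is left to the inference phase.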

/// The function finds all blocks that are (i) reachable from the Entry block
/// and (ii) do not have a path to an exit, and marks all such blocks 'cold'
/// so that profi does not send any flow to such blocks.
void preprocessUnreachableBlocks(FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();

  // Start bfs from the source
  std::queue<uint64_t> Queue;
  std::vector<bool> VisitedEntry(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      Queue.push(I);
      VisitedEntry[I] = true;
      break;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
      const uint64_t Dst = Jump->Target;
      if (!VisitedEntry[Dst]) {
        Queue.push(Dst);
        VisitedEntry[Dst] = true;
      }
    }
  }

  // Start bfs from all sinks
  std::vector<bool> VisitedExit(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isExit() && VisitedEntry[I]) {
      Queue.push(I);
      VisitedExit[I] = true;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
      const uint64_t Dst = Jump->Source;
      if (!VisitedExit[Dst]) {
        Queue.push(Dst);
        VisitedExit[Dst] = true;
      }
    }
  }

  // Make all blocks of zero weight so that flow is not sent
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.Weight == 0)
      continue;
    if (!VisitedEntry[I] || !VisitedExit[I]) {
      Block.Weight = 0;
      Block.HasUnknownWeight = true;
      Block.IsUnlikely = true;
      for (FlowJump *Jump : Block.SuccJumps) {
        if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
          Jump->Weight = 0;
          Jump->HasUnknownWeight = true;
          Jump->IsUnlikely = true;
        }
      }
    }
  }
}

/// Decide if stale profile matching can be applied for a given function.
/// Currently we skip inference for (very) large instances and for instances
/// having "unexpected" control flow (e.g., having no sink basic blocks).
bool canApplyInference(const FlowFunction &Func,
                       const yaml::bolt::BinaryFunctionProfile &YamlBF,
                       const uint64_t &MatchedBlocks) {
  if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
    return false;

  if (MatchedBlocks * 100 <
      opts::StaleMatchingMinMatchedBlock * YamlBF.Blocks.size())
    return false;

  // Returns false if the artificial sink block has no predecessors, meaning
  // there are no exit blocks.
  if (Func.Blocks[Func.Blocks.size() - 1].isEntry())
    return false;

  return true;
}
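
// Example of the percentage check above: with
// -stale-matching-min-matched-block=50 and a profile containing 20 blocks,
// MatchedBlocks * 100 < 50 * 20 rejects any function with fewer than 10
// matched blocks. The default threshold of 0 accepts every function.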

/// Apply the profile inference algorithm for a given flow function.
void applyInference(FlowFunction &Func) {
  ProfiParams Params;
  // Set the params from the command-line flags.
  Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
  Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
  Params.JoinIslands = opts::StaleMatchingJoinIslands;

  Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;

  Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
  Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;

  applyFlowInference(Params, Func);
}

/// Collect inferred counts from the flow function and update annotations in
/// the binary function.
void assignProfile(BinaryFunction &BF,
                   const BinaryFunction::BasicBlockOrderType &BlockOrder,
                   FlowFunction &Func) {
  BinaryContext &BC = BF.getBinaryContext();

  assert(Func.Blocks.size() == BlockOrder.size() + 2);
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    FlowBlock &Block = Func.Blocks[I + 1];
    BinaryBasicBlock *BB = BlockOrder[I];

    // Update block's count
    BB->setExecutionCount(Block.Flow);

    // Update jump counts: (i) clean existing counts and then (ii) set new ones
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *DstBB : BB->successors()) {
      (void)DstBB;
      BI->Count = 0;
      BI->MispredictedCount = 0;
      ++BI;
    }
    for (FlowJump *Jump : Block.SuccJumps) {
      if (Jump->IsUnlikely)
        continue;
      if (Jump->Flow == 0)
        continue;

      // Skips the artificial sink block.
      if (Jump->Target == Func.Blocks.size() - 1)
        continue;
      BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
      // Check if the edge corresponds to a regular jump or a landing pad
      if (BB->getSuccessor(SuccBB.getLabel())) {
        BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
        BI.Count += Jump->Flow;
      } else {
        BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
        if (LP && LP->getKnownExecutionCount() < Jump->Flow)
          LP->setExecutionCount(Jump->Flow);
      }
    }

    // Update call-site annotations
    auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
                                     uint64_t Count) {
      if (BC.MIB->hasAnnotation(Instr, Name))
        BC.MIB->removeAnnotation(Instr, Name);
      // Do not add zero-count annotations
      if (Count == 0)
        return;
      BC.MIB->addAnnotation(Instr, Name, Count);
    };

    for (MCInst &Instr : *BB) {
      // Ignore pseudo instructions
      if (BC.MIB->isPseudo(Instr))
        continue;
      // Ignore jump tables
      const MCInst *LastInstr = BB->getLastNonPseudoInstr();
      if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
        continue;

      if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
        auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
            Instr, "CallProfile");
        if (!ICSP.empty()) {
          // Try to evenly distribute the counts among the call sites
          const uint64_t TotalCount = Block.Flow;
          const uint64_t NumSites = ICSP.size();
          for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
            IndirectCallProfile &CSP = ICSP[Idx];
            uint64_t CountPerSite = TotalCount / NumSites;
            // When counts cannot be exactly distributed, increase by 1 the
            // counts of the first (TotalCount % NumSites) call sites
            if (Idx < TotalCount % NumSites)
              CountPerSite++;
            CSP.Count = CountPerSite;
          }
        } else {
          ICSP.emplace_back(nullptr, Block.Flow, 0);
        }
      } else if (BC.MIB->getConditionalTailCall(Instr)) {
        // We don't know exactly the number of times the conditional tail call
        // is executed; conservatively, set it to the count of the block.
        setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
        BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
      } else if (BC.MIB->isCall(Instr)) {
        setOrUpdateAnnotation(Instr, "Count", Block.Flow);
      }
    }
  }

  // Update function's execution count and mark the function inferred.
  BF.setExecutionCount(Func.Blocks[0].Flow);
  BF.setHasInferredProfile(true);
}

bool YAMLProfileReader::inferStaleProfile(
    BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {

  NamedRegionTimer T("inferStaleProfile", "stale profile inference", "rewrite",
                     "Rewrite passes", opts::TimeRewrite);

  if (!BF.hasCFG())
    return false;

  LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
                    << "\"" << BF.getPrintName() << "\"\n");

  // Make sure that block hashes are up to date.
  BF.computeBlockHashes(YamlBP.Header.HashFunction);

  const BinaryFunction::BasicBlockOrderType BlockOrder(
      BF.getLayout().block_begin(), BF.getLayout().block_end());

  // Create a wrapper flow function to use with the profile inference algorithm.
  FlowFunction Func = createFlowFunction(BlockOrder);

  // Match as many block/jump counts from the stale profile as possible;
  // MatchedBlocks tracks the number of matched blocks.
  size_t MatchedBlocks =
      matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func,
                           YamlBP.Header.HashFunction, IdToYamLBF);

  // Adjust the flow function by marking unreachable blocks Unlikely so that
  // they don't get any counts assigned.
  preprocessUnreachableBlocks(Func);

  // Check if profile inference can be applied for the instance.
  if (!canApplyInference(Func, YamlBF, MatchedBlocks))
    return false;

  // Apply the profile inference algorithm.
  applyInference(Func);

  // Collect inferred counts and update function annotations.
  assignProfile(BF, BlockOrder, Func);

  // As of now, we always mark the binary function as having a "correct"
  // profile. In the future, we may discard the results for instances with poor
  // inference metrics and keep such functions un-optimized.
  return true;
}

} // end namespace bolt
} // end namespace llvm