//===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// BOLT often has to deal with profiles collected on binaries built from several
// revisions behind release. As a result, a certain percentage of functions are
// considered stale and are not optimized. This file implements the ability to
// match a stale profile to functions that are not 100% binary identical, thus
// increasing optimization coverage and boosting application performance.
//
// The algorithm consists of two phases: matching and inference:
// - In the matching phase, we try to "guess" as many block and jump counts from
//   the stale profile as possible. To this end, the content of each basic block
//   is hashed and stored in the (yaml) profile. When BOLT optimizes a binary,
//   it computes block hashes and identifies the corresponding entries in the
//   stale profile. This yields a partial profile for every CFG in the binary.
// - In the inference phase, we employ a network flow-based algorithm (profi) to
//   reconstruct "realistic" block and jump counts from the partial profile
//   generated at the first stage. In practice, the reconstruction is not always
//   exact, but the majority (e.g., >90%) of CFGs get correct counts.
//
//===----------------------------------------------------------------------===//
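
// A simplified sketch of the pipeline implemented in this file; the actual
// driver is YAMLProfileReader::inferStaleProfile at the bottom, and stale
// matching is enabled with the -infer-stale-profile flag defined below:
//
//   BF.computeBlockHashes();                             // hash basic blocks
//   FlowFunction Func = createFlowFunction(BlockOrder);
//   matchWeightsByHashes(BC, BlockOrder, YamlBF, Func);  // matching phase
//   preprocessUnreachableBlocks(Func);
//   if (canApplyInference(Func))
//     applyInference(Func);                              // inference (profi)
//   assignProfile(BF, BlockOrder, Func);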
28 #include "bolt/Core/HashUtilities.h"
29 #include "bolt/Profile/YAMLProfileReader.h"
30 #include "llvm/ADT/Bitfields.h"
31 #include "llvm/ADT/Hashing.h"
32 #include "llvm/Support/CommandLine.h"
33 #include "llvm/Transforms/Utils/SampleProfileInference.h"
35 #include <queue>
37 using namespace llvm;
39 #undef DEBUG_TYPE
40 #define DEBUG_TYPE "bolt-prof"
42 namespace opts {
44 extern cl::OptionCategory BoltOptCategory;
46 cl::opt<bool>
47 InferStaleProfile("infer-stale-profile",
48 cl::desc("Infer counts from stale profile data."),
49 cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));
51 cl::opt<unsigned> StaleMatchingMaxFuncSize(
52 "stale-matching-max-func-size",
53 cl::desc("The maximum size of a function to consider for inference."),
54 cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));
56 // Parameters of the profile inference algorithm. The default values are tuned
57 // on several benchmarks.
58 cl::opt<bool> StaleMatchingEvenFlowDistribution(
59 "stale-matching-even-flow-distribution",
60 cl::desc("Try to evenly distribute flow when there are multiple equally "
61 "likely options."),
62 cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));
64 cl::opt<bool> StaleMatchingRebalanceUnknown(
65 "stale-matching-rebalance-unknown",
66 cl::desc("Evenly re-distribute flow among unknown subgraphs."),
67 cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));
69 cl::opt<bool> StaleMatchingJoinIslands(
70 "stale-matching-join-islands",
71 cl::desc("Join isolated components having positive flow."), cl::init(true),
72 cl::ReallyHidden, cl::cat(BoltOptCategory));
74 cl::opt<unsigned> StaleMatchingCostBlockInc(
75 "stale-matching-cost-block-inc",
76 cl::desc("The cost of increasing a block count by one."), cl::init(150),
77 cl::ReallyHidden, cl::cat(BoltOptCategory));
79 cl::opt<unsigned> StaleMatchingCostBlockDec(
80 "stale-matching-cost-block-dec",
81 cl::desc("The cost of decreasing a block count by one."), cl::init(150),
82 cl::ReallyHidden, cl::cat(BoltOptCategory));
84 cl::opt<unsigned> StaleMatchingCostJumpInc(
85 "stale-matching-cost-jump-inc",
86 cl::desc("The cost of increasing a jump count by one."), cl::init(150),
87 cl::ReallyHidden, cl::cat(BoltOptCategory));
89 cl::opt<unsigned> StaleMatchingCostJumpDec(
90 "stale-matching-cost-jump-dec",
91 cl::desc("The cost of decreasing a jump count by one."), cl::init(150),
92 cl::ReallyHidden, cl::cat(BoltOptCategory));
94 cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
95 "stale-matching-cost-block-unknown-inc",
96 cl::desc("The cost of increasing an unknown block count by one."),
97 cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory));
99 cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
100 "stale-matching-cost-jump-unknown-inc",
101 cl::desc("The cost of increasing an unknown jump count by one."),
102 cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory));
104 cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
105 "stale-matching-cost-jump-unknown-ft-inc",
106 cl::desc(
107 "The cost of increasing an unknown fall-through jump count by one."),
108 cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));
110 } // namespace opts

namespace llvm {
namespace bolt {

/// An object wrapping several components of a basic block hash. The combined
/// (blended) hash is represented and stored as one uint64_t, while individual
/// components are of smaller size (e.g., uint16_t or uint8_t).
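///
/// A minimal usage sketch (the values below are illustrative only; the exact
/// field layout is defined by the Bitfield elements inside the struct):
/// \code
///   BlendedBlockHash Hash;
///   Hash.Offset = 0x10;
///   Hash.OpcodeHash = 0xabcd;
///   uint64_t Packed = Hash.combine();
///   BlendedBlockHash Unpacked(Packed); // Unpacked.OpcodeHash == 0xabcd
/// \endcode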
struct BlendedBlockHash {
private:
  using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
  using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
  using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
  using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
  using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;

public:
  explicit BlendedBlockHash() {}

  explicit BlendedBlockHash(uint64_t Hash) {
    Offset = Bitfield::get<ValueOffset>(Hash);
    OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
    InstrHash = Bitfield::get<ValueInstr>(Hash);
    PredHash = Bitfield::get<ValuePred>(Hash);
    SuccHash = Bitfield::get<ValueSucc>(Hash);
  }

  /// Combine the blended hash into uint64_t.
  uint64_t combine() const {
    uint64_t Hash = 0;
    Bitfield::set<ValueOffset>(Hash, Offset);
    Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
    Bitfield::set<ValueInstr>(Hash, InstrHash);
    Bitfield::set<ValuePred>(Hash, PredHash);
    Bitfield::set<ValueSucc>(Hash, SuccHash);
    return Hash;
  }

  /// Compute a distance between two given blended hashes. The smaller the
  /// distance, the more similar two blocks are. For identical basic blocks,
  /// the distance is zero.
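  /// For example (illustrative values): two hashes that differ only in Offset
  /// by 2 bytes have distance 2, a mismatch in InstrHash alone contributes
  /// 1 << 16, and a mismatch in SuccHash or PredHash alone contributes 1 << 32.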
  uint64_t distance(const BlendedBlockHash &BBH) const {
    assert(OpcodeHash == BBH.OpcodeHash &&
           "incorrect blended hash distance computation");
    uint64_t Dist = 0;
    // Account for NeighborHash
    Dist += SuccHash == BBH.SuccHash ? 0 : 1;
    Dist += PredHash == BBH.PredHash ? 0 : 1;
    Dist <<= 16;
    // Account for InstrHash
    Dist += InstrHash == BBH.InstrHash ? 0 : 1;
    Dist <<= 16;
    // Account for Offset
    Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
    return Dist;
  }

  /// The offset of the basic block from the function start.
  uint16_t Offset{0};
  /// (Loose) Hash of the basic block instructions, excluding operands.
  uint16_t OpcodeHash{0};
  /// (Strong) Hash of the basic block instructions, including opcodes and
  /// operands.
  uint16_t InstrHash{0};
  /// (Loose) Hashes of the predecessors of the basic block.
  uint8_t PredHash{0};
  /// (Loose) Hashes of the successors of the basic block.
  uint8_t SuccHash{0};
};

/// The object is used to identify and match basic blocks in a BinaryFunction
/// given their hashes computed on a binary built from several revisions behind
/// release.
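///
/// A minimal sketch of the intended use (mirroring matchWeightsByHashes below;
/// Blocks and Hashes are parallel vectors):
/// \code
///   StaleMatcher Matcher;
///   Matcher.init(Blocks, Hashes);
///   if (const FlowBlock *Block = Matcher.matchBlock(SomeBlendedHash)) {
///     // Block is the most similar binary block for the profiled hash.
///   }
/// \endcode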
class StaleMatcher {
public:
  /// Initialize stale matcher.
  void init(const std::vector<FlowBlock *> &Blocks,
            const std::vector<BlendedBlockHash> &Hashes) {
    assert(Blocks.size() == Hashes.size() &&
           "incorrect matcher initialization");
    for (size_t I = 0; I < Blocks.size(); I++) {
      FlowBlock *Block = Blocks[I];
      uint16_t OpHash = Hashes[I].OpcodeHash;
      OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
    }
  }

  /// Find the most similar block for a given hash.
  const FlowBlock *matchBlock(BlendedBlockHash BlendedHash) const {
    auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
    if (BlockIt == OpHashToBlocks.end())
      return nullptr;
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.distance(BlendedHash);
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
      }
    }
    return BestBlock;
  }

  /// Returns true if the two basic blocks (in the binary and in the profile)
  /// corresponding to the given hashes are matched to each other with a high
  /// confidence.
  static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
                                    BlendedBlockHash Hash2) {
    return Hash1.InstrHash == Hash2.InstrHash;
  }

private:
  using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
  std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
};

void BinaryFunction::computeBlockHashes() const {
  if (size() == 0)
    return;

  assert(hasCFG() && "the function is expected to have CFG");

  std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
  std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
  // Initialize hash components.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    assert(BB->getIndex() == I && "incorrect block index");
    BlendedHashes[I].Offset = BB->getOffset();
    // Hashing complete instructions.
    std::string InstrHashStr = hashBlock(
        BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
    uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
    BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
    // Hashing opcodes.
    std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
    OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
    BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
  }

  // Initialize neighbor hash.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    // Append hashes of successors.
    uint64_t Hash = 0;
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
    }
    BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);

    // Append hashes of predecessors.
    Hash = 0;
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
    }
    BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
  }

  // Assign hashes.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    BB->setHash(BlendedHashes[I].combine());
  }
}

/// Create a wrapper flow function to use with the profile inference algorithm,
/// and initialize its jumps and metadata.
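///
/// For example, a function with basic blocks B0, B1, B2 yields Func.Blocks =
/// {Dummy, B0, B1, B2} with indices 0, 1, 2, 3, i.e., each non-dummy block
/// satisfies Block.Index == BB->getIndex() + 1.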
FlowFunction
createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
  FlowFunction Func;

  // Add a special "dummy" source so that there is always a unique entry point.
  // Because of the extra source, for all other blocks in FlowFunction it holds
  // that Block.Index == BB->getIndex() + 1
  FlowBlock EntryBlock;
  EntryBlock.Index = 0;
  Func.Blocks.push_back(EntryBlock);

  // Create FlowBlock for every basic block in the binary function
  for (const BinaryBasicBlock *BB : BlockOrder) {
    Func.Blocks.emplace_back();
    FlowBlock &Block = Func.Blocks.back();
    Block.Index = Func.Blocks.size() - 1;
    (void)BB;
    assert(Block.Index == BB->getIndex() + 1 &&
           "incorrectly assigned basic block index");
  }

  // Create FlowJump for each jump between basic blocks in the binary function
  std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
  for (const BinaryBasicBlock *SrcBB : BlockOrder) {
    std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
    // Collect regular jumps
    for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
    // Collect jumps to landing pads
    for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
  }

  // Add dummy edges to the extra sources. If there are multiple entry blocks,
  // add an unlikely edge from 0 to the subsequent ones
  assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
  for (uint64_t I = 1; I < Func.Blocks.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I - 1];
    if (BB->isEntryPoint() || InDegree[I] == 0) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = 0;
      Jump.Target = I;
      if (!BB->isEntryPoint())
        Jump.IsUnlikely = true;
    }
  }

  // Create necessary metadata for the flow function
  for (FlowJump &Jump : Func.Jumps) {
    Func.Blocks.at(Jump.Source).SuccJumps.push_back(&Jump);
    Func.Blocks.at(Jump.Target).PredJumps.push_back(&Jump);
  }
  return Func;
}

/// Assign initial block/jump weights based on the stale profile data. The goal
/// is to extract as much information from the stale profile as possible. Here
/// we assume that each basic block is specified via a hash value computed from
/// its content and the hashes of the unchanged basic blocks stay the same
/// across different revisions of the binary.
/// Whenever there is a count in the profile with the hash corresponding to one
/// of the basic blocks in the binary, the count is "matched" to the block.
/// Similarly, if both the source and the target of a count in the profile are
/// matched to a jump in the binary, the count is recorded in CFG.
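///
/// For example (illustrative scenario): if a profile block with hash H is
/// matched to FlowBlock B, the successor counts recorded for that profile
/// block seed the weights of B's outgoing jumps when both endpoints are
/// matched, and B.Weight is set to the maximum of its total matched outgoing
/// and incoming counts (see the loops below).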
void matchWeightsByHashes(BinaryContext &BC,
                          const BinaryFunction::BasicBlockOrderType &BlockOrder,
                          const yaml::bolt::BinaryFunctionProfile &YamlBF,
                          FlowFunction &Func) {
  assert(Func.Blocks.size() == BlockOrder.size() + 1);

  std::vector<FlowBlock *> Blocks;
  std::vector<BlendedBlockHash> BlendedHashes;
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I];
    assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");
    Blocks.push_back(&Func.Blocks[I + 1]);
    BlendedBlockHash BlendedHash(BB->getHash());
    BlendedHashes.push_back(BlendedHash);
    LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                      << Twine::utohexstr(BB->getHash()) << "\n");
  }
  StaleMatcher Matcher;
  Matcher.init(Blocks, BlendedHashes);

  // Index in yaml profile => corresponding (matched) block
  DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;
  // Match blocks from the profile to the blocks in CFG
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);
    const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash);
    // Always match the entry block.
    if (MatchedBlock == nullptr && YamlBB.Index == 0)
      MatchedBlock = Blocks[0];
    if (MatchedBlock != nullptr) {
      MatchedBlocks[YamlBB.Index] = MatchedBlock;
      BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1];
      LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBB.Index << ")"
                        << " with hash " << Twine::utohexstr(YamlBB.Hash)
                        << " to BB (index = " << MatchedBlock->Index - 1 << ")"
                        << " with hash " << Twine::utohexstr(BinHash.combine())
                        << "\n");
      // Update matching stats accounting for the matched block.
      if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) {
        ++BC.Stats.NumMatchedBlocks;
        BC.Stats.MatchedSampleCount += YamlBB.ExecCount;
        LLVM_DEBUG(dbgs() << "  exact match\n");
      }
    } else {
      LLVM_DEBUG(
          dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index << ")"
                 << " with hash " << Twine::utohexstr(YamlBB.Hash) << "\n");
    }

    // Update matching stats.
    ++BC.Stats.NumStaleBlocks;
    BC.Stats.StaleSampleCount += YamlBB.ExecCount;
  }

  // Match jumps from the profile to the jumps from CFG
  std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
  std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
      if (YamlSI.Count == 0)
        continue;

      // Try to find the jump for a given (src, dst) pair from the profile and
      // assign the jump weight based on the profile count
      const uint64_t SrcIndex = YamlBB.Index;
      const uint64_t DstIndex = YamlSI.Index;

      const FlowBlock *MatchedSrcBlock = MatchedBlocks.lookup(SrcIndex);
      const FlowBlock *MatchedDstBlock = MatchedBlocks.lookup(DstIndex);

      if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
        // Find a jump between the two blocks
        FlowJump *Jump = nullptr;
        for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
          if (SuccJump->Target == MatchedDstBlock->Index) {
            Jump = SuccJump;
            break;
          }
        }
        // Assign the weight, if the corresponding jump is found
        if (Jump != nullptr) {
          Jump->Weight = YamlSI.Count;
          Jump->HasUnknownWeight = false;
        }
      }
      // Assign the weight for the src block, if it is found
      if (MatchedSrcBlock != nullptr)
        OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
      // Assign the weight for the dst block, if it is found
      if (MatchedDstBlock != nullptr)
        InWeight[MatchedDstBlock->Index] += YamlSI.Count;
    }
  }

  // Assign block counts based on in-/out- jumps
  for (FlowBlock &Block : Func.Blocks) {
    if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
      assert(Block.HasUnknownWeight && "unmatched block with a positive count");
      continue;
    }
    Block.HasUnknownWeight = false;
    Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
  }
}

/// The function finds all blocks that are (i) reachable from the Entry block
/// and (ii) do not have a path to an exit, and marks all such blocks 'cold'
/// so that profi does not send any flow to such blocks.
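///
/// For example, in the CFG Entry -> A -> Exit with an extra block B reachable
/// only via Entry -> B -> B (a self-loop with no outgoing edge to an exit),
/// B cannot reach an exit and is therefore zeroed out and marked unlikely.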
void preprocessUnreachableBlocks(FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();

  // Start bfs from the source
  std::queue<uint64_t> Queue;
  std::vector<bool> VisitedEntry(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      Queue.push(I);
      VisitedEntry[I] = true;
      break;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
      const uint64_t Dst = Jump->Target;
      if (!VisitedEntry[Dst]) {
        Queue.push(Dst);
        VisitedEntry[Dst] = true;
      }
    }
  }

  // Start bfs from all sinks
  std::vector<bool> VisitedExit(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isExit() && VisitedEntry[I]) {
      Queue.push(I);
      VisitedExit[I] = true;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
      const uint64_t Dst = Jump->Source;
      if (!VisitedExit[Dst]) {
        Queue.push(Dst);
        VisitedExit[Dst] = true;
      }
    }
  }

  // Zero the weight of blocks that are unreachable from the entry or cannot
  // reach an exit, so that no flow is sent to them
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.Weight == 0)
      continue;
    if (!VisitedEntry[I] || !VisitedExit[I]) {
      Block.Weight = 0;
      Block.HasUnknownWeight = true;
      Block.IsUnlikely = true;
      for (FlowJump *Jump : Block.SuccJumps) {
        if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
          Jump->Weight = 0;
          Jump->HasUnknownWeight = true;
          Jump->IsUnlikely = true;
        }
      }
    }
  }
}

/// Decide if stale profile matching can be applied for a given function.
/// Currently we skip inference for (very) large instances and for instances
/// having "unexpected" control flow (e.g., having no sink basic blocks).
bool canApplyInference(const FlowFunction &Func) {
  if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
    return false;

  bool HasExitBlocks = llvm::any_of(
      Func.Blocks, [&](const FlowBlock &Block) { return Block.isExit(); });
  if (!HasExitBlocks)
    return false;

  return true;
}

/// Apply the profile inference algorithm for a given flow function.
void applyInference(FlowFunction &Func) {
  ProfiParams Params;
  // Set the params from the command-line flags.
  Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
  Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
  Params.JoinIslands = opts::StaleMatchingJoinIslands;

  Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;

  Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
  Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;

  applyFlowInference(Params, Func);
}

/// Collect inferred counts from the flow function and update annotations in
/// the binary function.
void assignProfile(BinaryFunction &BF,
                   const BinaryFunction::BasicBlockOrderType &BlockOrder,
                   FlowFunction &Func) {
  BinaryContext &BC = BF.getBinaryContext();

  assert(Func.Blocks.size() == BlockOrder.size() + 1);
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    FlowBlock &Block = Func.Blocks[I + 1];
    BinaryBasicBlock *BB = BlockOrder[I];

    // Update block's count
    BB->setExecutionCount(Block.Flow);

    // Update jump counts: (i) clean existing counts and then (ii) set new ones
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *DstBB : BB->successors()) {
      (void)DstBB;
      BI->Count = 0;
      BI->MispredictedCount = 0;
      ++BI;
    }
    for (FlowJump *Jump : Block.SuccJumps) {
      if (Jump->IsUnlikely)
        continue;
      if (Jump->Flow == 0)
        continue;

      BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
      // Check if the edge corresponds to a regular jump or a landing pad
      if (BB->getSuccessor(SuccBB.getLabel())) {
        BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
        BI.Count += Jump->Flow;
      } else {
        BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
        if (LP && LP->getKnownExecutionCount() < Jump->Flow)
          LP->setExecutionCount(Jump->Flow);
      }
    }

    // Update call-site annotations
    auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
                                     uint64_t Count) {
      if (BC.MIB->hasAnnotation(Instr, Name))
        BC.MIB->removeAnnotation(Instr, Name);
      // Do not add zero-count annotations
      if (Count == 0)
        return;
      BC.MIB->addAnnotation(Instr, Name, Count);
    };

    for (MCInst &Instr : *BB) {
      // Ignore pseudo instructions
      if (BC.MIB->isPseudo(Instr))
        continue;
      // Ignore jump tables
      const MCInst *LastInstr = BB->getLastNonPseudoInstr();
      if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
        continue;

      if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
        auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
            Instr, "CallProfile");
        if (!ICSP.empty()) {
          // Try to evenly distribute the counts among the call sites
          const uint64_t TotalCount = Block.Flow;
          const uint64_t NumSites = ICSP.size();
          for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
            IndirectCallProfile &CSP = ICSP[Idx];
            uint64_t CountPerSite = TotalCount / NumSites;
            // When counts cannot be exactly distributed, increase by 1 the
            // counts of the first (TotalCount % NumSites) call sites
            if (Idx < TotalCount % NumSites)
              CountPerSite++;
            CSP.Count = CountPerSite;
          }
        } else {
          ICSP.emplace_back(nullptr, Block.Flow, 0);
        }
      } else if (BC.MIB->getConditionalTailCall(Instr)) {
        // We don't know exactly the number of times the conditional tail call
        // is executed; conservatively, set it to the count of the block.
        setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
        BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
      } else if (BC.MIB->isCall(Instr)) {
        setOrUpdateAnnotation(Instr, "Count", Block.Flow);
      }
    }
  }

  // Update function's execution count and mark the function inferred.
  BF.setExecutionCount(Func.Blocks[0].Flow);
  BF.setHasInferredProfile(true);
}

bool YAMLProfileReader::inferStaleProfile(
    BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {
  LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
                    << "\"" << BF.getPrintName() << "\"\n");

  // Make sure that block hashes are up to date.
  BF.computeBlockHashes();

  const BinaryFunction::BasicBlockOrderType BlockOrder(
      BF.getLayout().block_begin(), BF.getLayout().block_end());

  // Create a wrapper flow function to use with the profile inference algorithm.
  FlowFunction Func = createFlowFunction(BlockOrder);

  // Match as many block/jump counts from the stale profile as possible.
  matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func);

  // Adjust the flow function by marking unreachable blocks Unlikely so that
  // they don't get any counts assigned.
  preprocessUnreachableBlocks(Func);

  // Check if profile inference can be applied for the instance.
  if (!canApplyInference(Func))
    return false;

  // Apply the profile inference algorithm.
  applyInference(Func);

  // Collect inferred counts and update function annotations.
  assignProfile(BF, BlockOrder, Func);

  // As of now, we always mark the binary function as having a "correct"
  // profile. In the future, we may discard the results for instances with poor
  // inference metrics and keep such functions un-optimized.
  return true;
}

} // end namespace bolt
} // end namespace llvm