bolt/lib/Profile/StaleProfileMatching.cpp

   1 //===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching   ----===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // BOLT often has to deal with profiles collected on binaries built from several
  10 // revisions behind release. As a result, a certain percentage of functions is
  11 // considered stale and not optimized. This file implements an ability to match
  12 // profile to functions that are not 100% binary identical, and thus, increasing
  13 // the optimization coverage and boost the performance of applications.
  14 //
  15 // The algorithm consists of two phases: matching and inference:
  16 // - At the matching phase, we try to "guess" as many block and jump counts from
  17 //   the stale profile as possible. To this end, the content of each basic block
  18 //   is hashed and stored in the (yaml) profile. When BOLT optimizes a binary,
  19 //   it computes block hashes and identifies the corresponding entries in the
  20 //   stale profile. It yields a partial profile for every CFG in the binary.
  21 // - At the inference phase, we employ a network flow-based algorithm (profi) to
  22 //   reconstruct "realistic" block and jump counts from the partial profile
  23 //   generated at the first stage. In practice, we don't always produce proper
  24 //   profile data but the majority (e.g., >90%) of CFGs get the correct counts.
  25 //
  26 //===----------------------------------------------------------------------===//
  27
  28 #include "bolt/Core/HashUtilities.h"
  29 #include "bolt/Profile/YAMLProfileReader.h"
  30 #include "llvm/ADT/Bitfields.h"
  31 #include "llvm/ADT/Hashing.h"
  32 #include "llvm/Support/CommandLine.h"
  33 #include "llvm/Support/xxhash.h"
  34 #include "llvm/Transforms/Utils/SampleProfileInference.h"
  35
  36 #include <queue>
  37
  38 using namespace llvm;
  39
  40 #undef DEBUG_TYPE
  41 #define DEBUG_TYPE "bolt-prof"
  42
  43 namespace opts {
  44
  45 extern cl::OptionCategory BoltOptCategory;
  46
  47 cl::opt<bool>
  48     InferStaleProfile("infer-stale-profile",
  49                       cl::desc("Infer counts from stale profile data."),
  50                       cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));
  51
  52 cl::opt<unsigned> StaleMatchingMaxFuncSize(
  53     "stale-matching-max-func-size",
  54     cl::desc("The maximum size of a function to consider for inference."),
  55     cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));
  56
  57 // Parameters of the profile inference algorithm. The default values are tuned
  58 // on several benchmarks.
  59 cl::opt<bool> StaleMatchingEvenFlowDistribution(
  60     "stale-matching-even-flow-distribution",
  61     cl::desc("Try to evenly distribute flow when there are multiple equally "
  62              "likely options."),
  63     cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));
  64
  65 cl::opt<bool> StaleMatchingRebalanceUnknown(
  66     "stale-matching-rebalance-unknown",
  67     cl::desc("Evenly re-distribute flow among unknown subgraphs."),
  68     cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));
  69
  70 cl::opt<bool> StaleMatchingJoinIslands(
  71     "stale-matching-join-islands",
  72     cl::desc("Join isolated components having positive flow."), cl::init(true),
  73     cl::ReallyHidden, cl::cat(BoltOptCategory));
  74
  75 cl::opt<unsigned> StaleMatchingCostBlockInc(
  76     "stale-matching-cost-block-inc",
  77     cl::desc("The cost of increasing a block count by one."), cl::init(150),
  78     cl::ReallyHidden, cl::cat(BoltOptCategory));
  79
  80 cl::opt<unsigned> StaleMatchingCostBlockDec(
  81     "stale-matching-cost-block-dec",
  82     cl::desc("The cost of decreasing a block count by one."), cl::init(150),
  83     cl::ReallyHidden, cl::cat(BoltOptCategory));
  84
  85 cl::opt<unsigned> StaleMatchingCostJumpInc(
  86     "stale-matching-cost-jump-inc",
  87     cl::desc("The cost of increasing a jump count by one."), cl::init(150),
  88     cl::ReallyHidden, cl::cat(BoltOptCategory));
  89
  90 cl::opt<unsigned> StaleMatchingCostJumpDec(
  91     "stale-matching-cost-jump-dec",
  92     cl::desc("The cost of decreasing a jump count by one."), cl::init(150),
  93     cl::ReallyHidden, cl::cat(BoltOptCategory));
  94
  95 cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
  96     "stale-matching-cost-block-unknown-inc",
  97     cl::desc("The cost of increasing an unknown block count by one."),
  98     cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory));
  99
 100 cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
 101     "stale-matching-cost-jump-unknown-inc",
 102     cl::desc("The cost of increasing an unknown jump count by one."),
 103     cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory));
 104
 105 cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
 106     "stale-matching-cost-jump-unknown-ft-inc",
 107     cl::desc(
 108         "The cost of increasing an unknown fall-through jump count by one."),
 109     cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));
 110
 111 } // namespace opts
 112
 113 namespace llvm {
 114 namespace bolt {
 115
 116 /// An object wrapping several components of a basic block hash. The combined
 117 /// (blended) hash is represented and stored as one uint64_t, while individual
 118 /// components are of smaller size (e.g., uint16_t or uint8_t).
 119 struct BlendedBlockHash {
 120 private:
 121   using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
 122   using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
 123   using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
 124   using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
 125   using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;
 126
 127 public:
 128   explicit BlendedBlockHash() {}
 129
 130   explicit BlendedBlockHash(uint64_t Hash) {
 131     Offset = Bitfield::get<ValueOffset>(Hash);
 132     OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
 133     InstrHash = Bitfield::get<ValueInstr>(Hash);
 134     PredHash = Bitfield::get<ValuePred>(Hash);
 135     SuccHash = Bitfield::get<ValueSucc>(Hash);
 136   }
 137
 138   /// Combine the blended hash into uint64_t.
 139   uint64_t combine() const {
 140     uint64_t Hash = 0;
 141     Bitfield::set<ValueOffset>(Hash, Offset);
 142     Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
 143     Bitfield::set<ValueInstr>(Hash, InstrHash);
 144     Bitfield::set<ValuePred>(Hash, PredHash);
 145     Bitfield::set<ValueSucc>(Hash, SuccHash);
 146     return Hash;
 147   }
 148
 149   /// Compute a distance between two given blended hashes. The smaller the
 150   /// distance, the more similar two blocks are. For identical basic blocks,
 151   /// the distance is zero.
 152   uint64_t distance(const BlendedBlockHash &BBH) const {
 153     assert(OpcodeHash == BBH.OpcodeHash &&
 154            "incorrect blended hash distance computation");
 155     uint64_t Dist = 0;
 156     // Account for NeighborHash
 157     Dist += SuccHash == BBH.SuccHash ? 0 : 1;
 158     Dist += PredHash == BBH.PredHash ? 0 : 1;
 159     Dist <<= 16;
 160     // Account for InstrHash
 161     Dist += InstrHash == BBH.InstrHash ? 0 : 1;
 162     Dist <<= 16;
 163     // Account for Offset
 164     Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
 165     return Dist;
 166   }
 167
 168   /// The offset of the basic block from the function start.
 169   uint16_t Offset{0};
 170   /// (Loose) Hash of the basic block instructions, excluding operands.
 171   uint16_t OpcodeHash{0};
 172   /// (Strong) Hash of the basic block instructions, including opcodes and
 173   /// operands.
 174   uint16_t InstrHash{0};
 175   /// (Loose) Hashes of the predecessors of the basic block.
 176   uint8_t PredHash{0};
 177   /// (Loose) Hashes of the successors of the basic block.
 178   uint8_t SuccHash{0};
 179 };
 180
 181 /// The object is used to identify and match basic blocks in a BinaryFunction
 182 /// given their hashes computed on a binary built from several revisions behind
 183 /// release.
 184 class StaleMatcher {
 185 public:
 186   /// Initialize stale matcher.
 187   void init(const std::vector<FlowBlock *> &Blocks,
 188             const std::vector<BlendedBlockHash> &Hashes) {
 189     assert(Blocks.size() == Hashes.size() &&
 190            "incorrect matcher initialization");
 191     for (size_t I = 0; I < Blocks.size(); I++) {
 192       FlowBlock *Block = Blocks[I];
 193       uint16_t OpHash = Hashes[I].OpcodeHash;
 194       OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
 195     }
 196   }
 197
 198   /// Find the most similar block for a given hash.
 199   const FlowBlock *matchBlock(BlendedBlockHash BlendedHash) const {
 200     auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
 201     if (BlockIt == OpHashToBlocks.end())
 202       return nullptr;
 203     FlowBlock *BestBlock = nullptr;
 204     uint64_t BestDist = std::numeric_limits<uint64_t>::max();
 205     for (const auto &[Hash, Block] : BlockIt->second) {
 206       uint64_t Dist = Hash.distance(BlendedHash);
 207       if (BestBlock == nullptr || Dist < BestDist) {
 208         BestDist = Dist;
 209         BestBlock = Block;
 210       }
 211     }
 212     return BestBlock;
 213   }
 214
 215   /// Returns true if the two basic blocks (in the binary and in the profile)
 216   /// corresponding to the given hashes are matched to each other with a high
 217   /// confidence.
 218   static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
 219                                     BlendedBlockHash Hash2) {
 220     return Hash1.InstrHash == Hash2.InstrHash;
 221   }
 222
 223 private:
 224   using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
 225   std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
 226 };
 227
 228 void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const {
 229   if (size() == 0)
 230     return;
 231
 232   assert(hasCFG() && "the function is expected to have CFG");
 233
 234   std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
 235   std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
 236   // Initialize hash components.
 237   for (size_t I = 0; I < BasicBlocks.size(); I++) {
 238     const BinaryBasicBlock *BB = BasicBlocks[I];
 239     assert(BB->getIndex() == I && "incorrect block index");
 240     BlendedHashes[I].Offset = BB->getOffset();
 241     // Hashing complete instructions.
 242     std::string InstrHashStr = hashBlock(
 243         BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
 244     if (HashFunction == HashFunction::StdHash) {
 245       uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
 246       BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
 247     } else if (HashFunction == HashFunction::XXH3) {
 248       uint64_t InstrHash = llvm::xxh3_64bits(InstrHashStr);
 249       BlendedHashes[I].InstrHash = (uint16_t)InstrHash;
 250     } else {
 251       llvm_unreachable("Unhandled HashFunction");
 252     }
 253     // Hashing opcodes.
 254     std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
 255     if (HashFunction == HashFunction::StdHash) {
 256       OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
 257       BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
 258     } else if (HashFunction == HashFunction::XXH3) {
 259       OpcodeHashes[I] = llvm::xxh3_64bits(OpcodeHashStr);
 260       BlendedHashes[I].OpcodeHash = (uint16_t)OpcodeHashes[I];
 261     } else {
 262       llvm_unreachable("Unhandled HashFunction");
 263     }
 264   }
 265
 266   // Initialize neighbor hash.
 267   for (size_t I = 0; I < BasicBlocks.size(); I++) {
 268     const BinaryBasicBlock *BB = BasicBlocks[I];
 269     // Append hashes of successors.
 270     uint64_t Hash = 0;
 271     for (BinaryBasicBlock *SuccBB : BB->successors()) {
 272       uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
 273       Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
 274     }
 275     if (HashFunction == HashFunction::StdHash) {
 276       // Compatibility with old behavior.
 277       BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);
 278     } else {
 279       BlendedHashes[I].SuccHash = (uint8_t)Hash;
 280     }
 281
 282     // Append hashes of predecessors.
 283     Hash = 0;
 284     for (BinaryBasicBlock *PredBB : BB->predecessors()) {
 285       uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
 286       Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
 287     }
 288     if (HashFunction == HashFunction::StdHash) {
 289       // Compatibility with old behavior.
 290       BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
 291     } else {
 292       BlendedHashes[I].PredHash = (uint8_t)Hash;
 293     }
 294   }
 295
 296   //  Assign hashes.
 297   for (size_t I = 0; I < BasicBlocks.size(); I++) {
 298     const BinaryBasicBlock *BB = BasicBlocks[I];
 299     BB->setHash(BlendedHashes[I].combine());
 300   }
 301 }
 302
 303 /// Create a wrapper flow function to use with the profile inference algorithm,
 304 /// and initialize its jumps and metadata.
 305 FlowFunction
 306 createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
 307   FlowFunction Func;
 308
 309   // Add a special "dummy" source so that there is always a unique entry point.
 310   // Because of the extra source, for all other blocks in FlowFunction it holds
 311   // that Block.Index == BB->getIndex() + 1
 312   FlowBlock EntryBlock;
 313   EntryBlock.Index = 0;
 314   Func.Blocks.push_back(EntryBlock);
 315
 316   // Create FlowBlock for every basic block in the binary function
 317   for (const BinaryBasicBlock *BB : BlockOrder) {
 318     Func.Blocks.emplace_back();
 319     FlowBlock &Block = Func.Blocks.back();
 320     Block.Index = Func.Blocks.size() - 1;
 321     (void)BB;
 322     assert(Block.Index == BB->getIndex() + 1 &&
 323            "incorrectly assigned basic block index");
 324   }
 325
 326   // Create FlowJump for each jump between basic blocks in the binary function
 327   std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
 328   for (const BinaryBasicBlock *SrcBB : BlockOrder) {
 329     std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
 330     // Collect regular jumps
 331     for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
 332       // Ignoring parallel edges
 333       if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
 334         continue;
 335
 336       Func.Jumps.emplace_back();
 337       FlowJump &Jump = Func.Jumps.back();
 338       Jump.Source = SrcBB->getIndex() + 1;
 339       Jump.Target = DstBB->getIndex() + 1;
 340       InDegree[Jump.Target]++;
 341       UniqueSuccs.insert(DstBB);
 342     }
 343     // Collect jumps to landing pads
 344     for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
 345       // Ignoring parallel edges
 346       if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
 347         continue;
 348
 349       Func.Jumps.emplace_back();
 350       FlowJump &Jump = Func.Jumps.back();
 351       Jump.Source = SrcBB->getIndex() + 1;
 352       Jump.Target = DstBB->getIndex() + 1;
 353       InDegree[Jump.Target]++;
 354       UniqueSuccs.insert(DstBB);
 355     }
 356   }
 357
 358   // Add dummy edges to the extra sources. If there are multiple entry blocks,
 359   // add an unlikely edge from 0 to the subsequent ones
 360   assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
 361   for (uint64_t I = 1; I < Func.Blocks.size(); I++) {
 362     const BinaryBasicBlock *BB = BlockOrder[I - 1];
 363     if (BB->isEntryPoint() || InDegree[I] == 0) {
 364       Func.Jumps.emplace_back();
 365       FlowJump &Jump = Func.Jumps.back();
 366       Jump.Source = 0;
 367       Jump.Target = I;
 368       if (!BB->isEntryPoint())
 369         Jump.IsUnlikely = true;
 370     }
 371   }
 372
 373   // Create necessary metadata for the flow function
 374   for (FlowJump &Jump : Func.Jumps) {
 375     Func.Blocks.at(Jump.Source).SuccJumps.push_back(&Jump);
 376     Func.Blocks.at(Jump.Target).PredJumps.push_back(&Jump);
 377   }
 378   return Func;
 379 }
 380
 381 /// Assign initial block/jump weights based on the stale profile data. The goal
 382 /// is to extract as much information from the stale profile as possible. Here
 383 /// we assume that each basic block is specified via a hash value computed from
 384 /// its content and the hashes of the unchanged basic blocks stay the same
 385 /// across different revisions of the binary.
 386 /// Whenever there is a count in the profile with the hash corresponding to one
 387 /// of the basic blocks in the binary, the count is "matched" to the block.
 388 /// Similarly, if both the source and the target of a count in the profile are
 389 /// matched to a jump in the binary, the count is recorded in CFG.
 390 void matchWeightsByHashes(BinaryContext &BC,
 391                           const BinaryFunction::BasicBlockOrderType &BlockOrder,
 392                           const yaml::bolt::BinaryFunctionProfile &YamlBF,
 393                           FlowFunction &Func) {
 394   assert(Func.Blocks.size() == BlockOrder.size() + 1);
 395
 396   std::vector<FlowBlock *> Blocks;
 397   std::vector<BlendedBlockHash> BlendedHashes;
 398   for (uint64_t I = 0; I < BlockOrder.size(); I++) {
 399     const BinaryBasicBlock *BB = BlockOrder[I];
 400     assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");
 401     Blocks.push_back(&Func.Blocks[I + 1]);
 402     BlendedBlockHash BlendedHash(BB->getHash());
 403     BlendedHashes.push_back(BlendedHash);
 404     LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
 405                       << Twine::utohexstr(BB->getHash()) << "\n");
 406   }
 407   StaleMatcher Matcher;
 408   Matcher.init(Blocks, BlendedHashes);
 409
 410   // Index in yaml profile => corresponding (matched) block
 411   DenseMap<uint64_t, const FlowBlock *> MatchedBlocks;
 412   // Match blocks from the profile to the blocks in CFG
 413   for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
 414     assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
 415     BlendedBlockHash YamlHash(YamlBB.Hash);
 416     const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash);
 417     // Always match the entry block.
 418     if (MatchedBlock == nullptr && YamlBB.Index == 0)
 419       MatchedBlock = Blocks[0];
 420     if (MatchedBlock != nullptr) {
 421       MatchedBlocks[YamlBB.Index] = MatchedBlock;
 422       BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1];
 423       LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBB.Index << ")"
 424                         << " with hash " << Twine::utohexstr(YamlBB.Hash)
 425                         << " to BB (index = " << MatchedBlock->Index - 1 << ")"
 426                         << " with hash " << Twine::utohexstr(BinHash.combine())
 427                         << "\n");
 428       // Update matching stats accounting for the matched block.
 429       if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) {
 430         ++BC.Stats.NumMatchedBlocks;
 431         BC.Stats.MatchedSampleCount += YamlBB.ExecCount;
 432         LLVM_DEBUG(dbgs() << "  exact match\n");
 433       } else {
 434         LLVM_DEBUG(dbgs() << "  loose match\n");
 435       }
 436     } else {
 437       LLVM_DEBUG(
 438           dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index << ")"
 439                  << " with hash " << Twine::utohexstr(YamlBB.Hash) << "\n");
 440     }
 441
 442     // Update matching stats.
 443     ++BC.Stats.NumStaleBlocks;
 444     BC.Stats.StaleSampleCount += YamlBB.ExecCount;
 445   }
 446
 447   // Match jumps from the profile to the jumps from CFG
 448   std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
 449   std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);
 450   for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
 451     for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
 452       if (YamlSI.Count == 0)
 453         continue;
 454
 455       // Try to find the jump for a given (src, dst) pair from the profile and
 456       // assign the jump weight based on the profile count
 457       const uint64_t SrcIndex = YamlBB.Index;
 458       const uint64_t DstIndex = YamlSI.Index;
 459
 460       const FlowBlock *MatchedSrcBlock = MatchedBlocks.lookup(SrcIndex);
 461       const FlowBlock *MatchedDstBlock = MatchedBlocks.lookup(DstIndex);
 462
 463       if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
 464         // Find a jump between the two blocks
 465         FlowJump *Jump = nullptr;
 466         for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
 467           if (SuccJump->Target == MatchedDstBlock->Index) {
 468             Jump = SuccJump;
 469             break;
 470           }
 471         }
 472         // Assign the weight, if the corresponding jump is found
 473         if (Jump != nullptr) {
 474           Jump->Weight = YamlSI.Count;
 475           Jump->HasUnknownWeight = false;
 476         }
 477       }
 478       // Assign the weight for the src block, if it is found
 479       if (MatchedSrcBlock != nullptr)
 480         OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
 481       // Assign the weight for the dst block, if it is found
 482       if (MatchedDstBlock != nullptr)
 483         InWeight[MatchedDstBlock->Index] += YamlSI.Count;
 484     }
 485   }
 486
 487   // Assign block counts based on in-/out- jumps
 488   for (FlowBlock &Block : Func.Blocks) {
 489     if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
 490       assert(Block.HasUnknownWeight && "unmatched block with a positive count");
 491       continue;
 492     }
 493     Block.HasUnknownWeight = false;
 494     Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
 495   }
 496 }
 497
 498 /// The function finds all blocks that are (i) reachable from the Entry block
 499 /// and (ii) do not have a path to an exit, and marks all such blocks 'cold'
 500 /// so that profi does not send any flow to such blocks.
 501 void preprocessUnreachableBlocks(FlowFunction &Func) {
 502   const uint64_t NumBlocks = Func.Blocks.size();
 503
 504   // Start bfs from the source
 505   std::queue<uint64_t> Queue;
 506   std::vector<bool> VisitedEntry(NumBlocks, false);
 507   for (uint64_t I = 0; I < NumBlocks; I++) {
 508     FlowBlock &Block = Func.Blocks[I];
 509     if (Block.isEntry()) {
 510       Queue.push(I);
 511       VisitedEntry[I] = true;
 512       break;
 513     }
 514   }
 515   while (!Queue.empty()) {
 516     const uint64_t Src = Queue.front();
 517     Queue.pop();
 518     for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
 519       const uint64_t Dst = Jump->Target;
 520       if (!VisitedEntry[Dst]) {
 521         Queue.push(Dst);
 522         VisitedEntry[Dst] = true;
 523       }
 524     }
 525   }
 526
 527   // Start bfs from all sinks
 528   std::vector<bool> VisitedExit(NumBlocks, false);
 529   for (uint64_t I = 0; I < NumBlocks; I++) {
 530     FlowBlock &Block = Func.Blocks[I];
 531     if (Block.isExit() && VisitedEntry[I]) {
 532       Queue.push(I);
 533       VisitedExit[I] = true;
 534     }
 535   }
 536   while (!Queue.empty()) {
 537     const uint64_t Src = Queue.front();
 538     Queue.pop();
 539     for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
 540       const uint64_t Dst = Jump->Source;
 541       if (!VisitedExit[Dst]) {
 542         Queue.push(Dst);
 543         VisitedExit[Dst] = true;
 544       }
 545     }
 546   }
 547
 548   // Make all blocks of zero weight so that flow is not sent
 549   for (uint64_t I = 0; I < NumBlocks; I++) {
 550     FlowBlock &Block = Func.Blocks[I];
 551     if (Block.Weight == 0)
 552       continue;
 553     if (!VisitedEntry[I] || !VisitedExit[I]) {
 554       Block.Weight = 0;
 555       Block.HasUnknownWeight = true;
 556       Block.IsUnlikely = true;
 557       for (FlowJump *Jump : Block.SuccJumps) {
 558         if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
 559           Jump->Weight = 0;
 560           Jump->HasUnknownWeight = true;
 561           Jump->IsUnlikely = true;
 562         }
 563       }
 564     }
 565   }
 566 }
 567
 568 /// Decide if stale profile matching can be applied for a given function.
 569 /// Currently we skip inference for (very) large instances and for instances
 570 /// having "unexpected" control flow (e.g., having no sink basic blocks).
 571 bool canApplyInference(const FlowFunction &Func) {
 572   if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
 573     return false;
 574
 575   bool HasExitBlocks = llvm::any_of(
 576       Func.Blocks, [&](const FlowBlock &Block) { return Block.isExit(); });
 577   if (!HasExitBlocks)
 578     return false;
 579
 580   return true;
 581 }
 582
 583 /// Apply the profile inference algorithm for a given flow function.
 584 void applyInference(FlowFunction &Func) {
 585   ProfiParams Params;
 586   // Set the params from the command-line flags.
 587   Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
 588   Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
 589   Params.JoinIslands = opts::StaleMatchingJoinIslands;
 590
 591   Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
 592   Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
 593   Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
 594   Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
 595   Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;
 596
 597   Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
 598   Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
 599   Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
 600   Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
 601   Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
 602   Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;
 603
 604   applyFlowInference(Params, Func);
 605 }
 606
 607 /// Collect inferred counts from the flow function and update annotations in
 608 /// the binary function.
 609 void assignProfile(BinaryFunction &BF,
 610                    const BinaryFunction::BasicBlockOrderType &BlockOrder,
 611                    FlowFunction &Func) {
 612   BinaryContext &BC = BF.getBinaryContext();
 613
 614   assert(Func.Blocks.size() == BlockOrder.size() + 1);
 615   for (uint64_t I = 0; I < BlockOrder.size(); I++) {
 616     FlowBlock &Block = Func.Blocks[I + 1];
 617     BinaryBasicBlock *BB = BlockOrder[I];
 618
 619     // Update block's count
 620     BB->setExecutionCount(Block.Flow);
 621
 622     // Update jump counts: (i) clean existing counts and then (ii) set new ones
 623     auto BI = BB->branch_info_begin();
 624     for (const BinaryBasicBlock *DstBB : BB->successors()) {
 625       (void)DstBB;
 626       BI->Count = 0;
 627       BI->MispredictedCount = 0;
 628       ++BI;
 629     }
 630     for (FlowJump *Jump : Block.SuccJumps) {
 631       if (Jump->IsUnlikely)
 632         continue;
 633       if (Jump->Flow == 0)
 634         continue;
 635
 636       BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
 637       // Check if the edge corresponds to a regular jump or a landing pad
 638       if (BB->getSuccessor(SuccBB.getLabel())) {
 639         BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
 640         BI.Count += Jump->Flow;
 641       } else {
 642         BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
 643         if (LP && LP->getKnownExecutionCount() < Jump->Flow)
 644           LP->setExecutionCount(Jump->Flow);
 645       }
 646     }
 647
 648     // Update call-site annotations
 649     auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
 650                                      uint64_t Count) {
 651       if (BC.MIB->hasAnnotation(Instr, Name))
 652         BC.MIB->removeAnnotation(Instr, Name);
 653       // Do not add zero-count annotations
 654       if (Count == 0)
 655         return;
 656       BC.MIB->addAnnotation(Instr, Name, Count);
 657     };
 658
 659     for (MCInst &Instr : *BB) {
 660       // Ignore pseudo instructions
 661       if (BC.MIB->isPseudo(Instr))
 662         continue;
 663       // Ignore jump tables
 664       const MCInst *LastInstr = BB->getLastNonPseudoInstr();
 665       if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
 666         continue;
 667
 668       if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
 669         auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
 670             Instr, "CallProfile");
 671         if (!ICSP.empty()) {
 672           // Try to evenly distribute the counts among the call sites
 673           const uint64_t TotalCount = Block.Flow;
 674           const uint64_t NumSites = ICSP.size();
 675           for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
 676             IndirectCallProfile &CSP = ICSP[Idx];
 677             uint64_t CountPerSite = TotalCount / NumSites;
 678             // When counts cannot be exactly distributed, increase by 1 the
 679             // counts of the first (TotalCount % NumSites) call sites
 680             if (Idx < TotalCount % NumSites)
 681               CountPerSite++;
 682             CSP.Count = CountPerSite;
 683           }
 684         } else {
 685           ICSP.emplace_back(nullptr, Block.Flow, 0);
 686         }
 687       } else if (BC.MIB->getConditionalTailCall(Instr)) {
 688         // We don't know exactly the number of times the conditional tail call
 689         // is executed; conservatively, setting it to the count of the block
 690         setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
 691         BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
 692       } else if (BC.MIB->isCall(Instr)) {
 693         setOrUpdateAnnotation(Instr, "Count", Block.Flow);
 694       }
 695     }
 696   }
 697
 698   // Update function's execution count and mark the function inferred.
 699   BF.setExecutionCount(Func.Blocks[0].Flow);
 700   BF.setHasInferredProfile(true);
 701 }
 702
 703 bool YAMLProfileReader::inferStaleProfile(
 704     BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {
 705   LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
 706                     << "\"" << BF.getPrintName() << "\"\n");
 707
 708   // Make sure that block hashes are up to date.
 709   BF.computeBlockHashes(YamlBP.Header.HashFunction);
 710
 711   const BinaryFunction::BasicBlockOrderType BlockOrder(
 712       BF.getLayout().block_begin(), BF.getLayout().block_end());
 713
 714   // Create a wrapper flow function to use with the profile inference algorithm.
 715   FlowFunction Func = createFlowFunction(BlockOrder);
 716
 717   // Match as many block/jump counts from the stale profile as possible
 718   matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func);
 719
 720   // Adjust the flow function by marking unreachable blocks Unlikely so that
 721   // they don't get any counts assigned.
 722   preprocessUnreachableBlocks(Func);
 723
 724   // Check if profile inference can be applied for the instance.
 725   if (!canApplyInference(Func))
 726     return false;
 727
 728   // Apply the profile inference algorithm.
 729   applyInference(Func);
 730
 731   // Collect inferred counts and update function annotations.
 732   assignProfile(BF, BlockOrder, Func);
 733
 734   // As of now, we always mark the binary function having "correct" profile.
 735   // In the future, we may discard the results for instances with poor inference
 736   // metrics and keep such functions un-optimized.
 737   return true;
 738 }
 739
 740 } // end namespace bolt
 741 } // end namespace llvm