1 //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
#include "CSPreInliner.h"
#include "ProfiledBinary.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include "llvm/Transforms/IPO/SampleProfile.h"
#include <cstdint>
#include <queue>
18 #define DEBUG_TYPE "cs-preinliner"
21 using namespace sampleprof
;
23 STATISTIC(PreInlNumCSInlined
,
24 "Number of functions inlined with context sensitive profile");
25 STATISTIC(PreInlNumCSNotInlined
,
26 "Number of functions not inlined with context sensitive profile");
27 STATISTIC(PreInlNumCSInlinedHitMinLimit
,
28 "Number of functions with FDO inline stopped due to min size limit");
29 STATISTIC(PreInlNumCSInlinedHitMaxLimit
,
30 "Number of functions with FDO inline stopped due to max size limit");
32 PreInlNumCSInlinedHitGrowthLimit
,
33 "Number of functions with FDO inline stopped due to growth size limit");
35 // The switches specify inline thresholds used in SampleProfileLoader inlining.
36 // TODO: the actual threshold to be tuned here because the size here is based
37 // on machine code not LLVM IR.
39 cl::opt
<bool> EnableCSPreInliner(
40 "csspgo-preinliner", cl::Hidden
, cl::init(true),
41 cl::desc("Run a global pre-inliner to merge context profile based on "
42 "estimated global top-down inline decisions"));
44 cl::opt
<bool> UseContextCostForPreInliner(
45 "use-context-cost-for-preinliner", cl::Hidden
, cl::init(true),
46 cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
49 static cl::opt
<bool> SamplePreInlineReplay(
50 "csspgo-replay-preinline", cl::Hidden
, cl::init(false),
52 "Replay previous inlining and adjust context profile accordingly"));
54 static cl::opt
<int> CSPreinlMultiplierForPrevInl(
55 "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden
,
58 "Multiplier to bump up callsite threshold for previous inlining."));
60 CSPreInliner::CSPreInliner(SampleContextTracker
&Tracker
,
61 ProfiledBinary
&Binary
, ProfileSummary
*Summary
)
62 : UseContextCost(UseContextCostForPreInliner
),
63 // TODO: Pass in a guid-to-name map in order for
64 // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
65 // as their profile context.
66 ContextTracker(Tracker
), Binary(Binary
), Summary(Summary
) {
67 // Set default preinliner hot/cold call site threshold tuned with CSSPGO.
68 // for good performance with reasonable profile size.
69 if (!SampleHotCallSiteThreshold
.getNumOccurrences())
70 SampleHotCallSiteThreshold
= 1500;
71 if (!SampleColdCallSiteThreshold
.getNumOccurrences())
72 SampleColdCallSiteThreshold
= 0;
73 if (!ProfileInlineLimitMax
.getNumOccurrences())
74 ProfileInlineLimitMax
= 50000;
77 std::vector
<FunctionId
> CSPreInliner::buildTopDownOrder() {
78 std::vector
<FunctionId
> Order
;
79 // Trim cold edges to get a more stable call graph. This allows for a more
80 // stable top-down order which in turns helps the stablity of the generated
81 // profile from run to run.
82 uint64_t ColdCountThreshold
= ProfileSummaryBuilder::getColdCountThreshold(
83 (Summary
->getDetailedSummary()));
84 ProfiledCallGraph
ProfiledCG(ContextTracker
, ColdCountThreshold
);
86 // Now that we have a profiled call graph, construct top-down order
87 // by building up SCC and reversing SCC order.
88 scc_iterator
<ProfiledCallGraph
*> I
= scc_begin(&ProfiledCG
);
89 while (!I
.isAtEnd()) {
91 if (SortProfiledSCC
) {
92 // Sort nodes in one SCC based on callsite hotness.
93 scc_member_iterator
<ProfiledCallGraph
*> SI(*I
);
96 for (auto *Node
: Range
) {
97 if (Node
!= ProfiledCG
.getEntryNode())
98 Order
.push_back(Node
->Name
);
102 std::reverse(Order
.begin(), Order
.end());
107 bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue
&CQueue
,
108 const FunctionSamples
*CallerSamples
) {
109 assert(CallerSamples
&& "Expect non-null caller samples");
111 // Ideally we want to consider everything a function calls, but as far as
112 // context profile is concerned, only those frames that are children of
113 // current one in the trie is relavent. So we walk the trie instead of call
114 // targets from function profile.
115 ContextTrieNode
*CallerNode
=
116 ContextTracker
.getContextNodeForProfile(CallerSamples
);
118 bool HasNewCandidate
= false;
119 for (auto &Child
: CallerNode
->getAllChildContext()) {
120 ContextTrieNode
*CalleeNode
= &Child
.second
;
121 FunctionSamples
*CalleeSamples
= CalleeNode
->getFunctionSamples();
125 // Call site count is more reliable, so we look up the corresponding call
126 // target profile in caller's context profile to retrieve call site count.
127 uint64_t CalleeEntryCount
= CalleeSamples
->getHeadSamplesEstimate();
128 uint64_t CallsiteCount
= 0;
129 LineLocation Callsite
= CalleeNode
->getCallSiteLoc();
130 if (auto CallTargets
= CallerSamples
->findCallTargetMapAt(Callsite
)) {
131 SampleRecord::CallTargetMap
&TargetCounts
= CallTargets
.get();
132 auto It
= TargetCounts
.find(CalleeSamples
->getFunction());
133 if (It
!= TargetCounts
.end())
134 CallsiteCount
= It
->second
;
137 // TODO: call site and callee entry count should be mostly consistent, add
139 HasNewCandidate
= true;
140 uint32_t CalleeSize
= getFuncSize(CalleeNode
);
141 CQueue
.emplace(CalleeSamples
, std::max(CallsiteCount
, CalleeEntryCount
),
145 return HasNewCandidate
;
148 uint32_t CSPreInliner::getFuncSize(const ContextTrieNode
*ContextNode
) {
150 return Binary
.getFuncSizeForContext(ContextNode
);
152 return ContextNode
->getFunctionSamples()->getBodySamples().size();
155 bool CSPreInliner::shouldInline(ProfiledInlineCandidate
&Candidate
) {
157 Candidate
.CalleeSamples
->getContext().hasAttribute(ContextWasInlined
);
158 // If replay inline is requested, simply follow the inline decision of the
160 if (SamplePreInlineReplay
)
163 unsigned int SampleThreshold
= SampleColdCallSiteThreshold
;
164 uint64_t ColdCountThreshold
= ProfileSummaryBuilder::getColdCountThreshold(
165 (Summary
->getDetailedSummary()));
167 if (Candidate
.CallsiteCount
<= ColdCountThreshold
)
168 SampleThreshold
= SampleColdCallSiteThreshold
;
170 // Linearly adjust threshold based on normalized hotness, i.e, a value in
171 // [0,1]. Use 10% cutoff instead of the max count as the normalization
172 // upperbound for stability.
173 double NormalizationUpperBound
=
174 ProfileSummaryBuilder::getEntryForPercentile(
175 Summary
->getDetailedSummary(), 100000 /* 10% */)
177 double NormalizationLowerBound
= ColdCountThreshold
;
178 double NormalizedHotness
=
179 (Candidate
.CallsiteCount
- NormalizationLowerBound
) /
180 (NormalizationUpperBound
- NormalizationLowerBound
);
181 if (NormalizedHotness
> 1.0)
182 NormalizedHotness
= 1.0;
183 // Add 1 to ensure hot callsites get a non-zero threshold, which could
184 // happen when SampleColdCallSiteThreshold is 0. This is when we do not
185 // want any inlining for cold callsites.
186 SampleThreshold
= SampleHotCallSiteThreshold
* NormalizedHotness
* 100 +
187 SampleColdCallSiteThreshold
+ 1;
188 // Bump up the threshold to favor previous compiler inline decision. The
189 // compiler has more insight and knowledge about functions based on their IR
190 // and attribures and should be able to make a more reasonable inline
193 SampleThreshold
*= CSPreinlMultiplierForPrevInl
;
196 return (Candidate
.SizeCost
< SampleThreshold
);
199 void CSPreInliner::processFunction(const FunctionId Name
) {
200 FunctionSamples
*FSamples
= ContextTracker
.getBaseSamplesFor(Name
);
205 getFuncSize(ContextTracker
.getContextNodeForProfile(FSamples
));
206 unsigned FuncFinalSize
= FuncSize
;
207 unsigned SizeLimit
= FuncSize
* ProfileInlineGrowthLimit
;
208 SizeLimit
= std::min(SizeLimit
, (unsigned)ProfileInlineLimitMax
);
209 SizeLimit
= std::max(SizeLimit
, (unsigned)ProfileInlineLimitMin
);
211 LLVM_DEBUG(dbgs() << "Process " << Name
212 << " for context-sensitive pre-inlining (pre-inline size: "
213 << FuncSize
<< ", size limit: " << SizeLimit
<< ")\n");
215 ProfiledCandidateQueue CQueue
;
216 getInlineCandidates(CQueue
, FSamples
);
218 while (!CQueue
.empty() && FuncFinalSize
< SizeLimit
) {
219 ProfiledInlineCandidate Candidate
= CQueue
.top();
221 bool ShouldInline
= false;
222 if ((ShouldInline
= shouldInline(Candidate
))) {
223 // We mark context as inlined as the corresponding context profile
224 // won't be merged into that function's base profile.
225 ++PreInlNumCSInlined
;
226 ContextTracker
.markContextSamplesInlined(Candidate
.CalleeSamples
);
227 Candidate
.CalleeSamples
->getContext().setAttribute(
228 ContextShouldBeInlined
);
229 FuncFinalSize
+= Candidate
.SizeCost
;
230 getInlineCandidates(CQueue
, Candidate
.CalleeSamples
);
232 ++PreInlNumCSNotInlined
;
235 dbgs() << (ShouldInline
? " Inlined" : " Outlined")
236 << " context profile for: "
237 << ContextTracker
.getContextString(*Candidate
.CalleeSamples
)
238 << " (callee size: " << Candidate
.SizeCost
239 << ", call count:" << Candidate
.CallsiteCount
<< ")\n");
242 if (!CQueue
.empty()) {
243 if (SizeLimit
== (unsigned)ProfileInlineLimitMax
)
244 ++PreInlNumCSInlinedHitMaxLimit
;
245 else if (SizeLimit
== (unsigned)ProfileInlineLimitMin
)
246 ++PreInlNumCSInlinedHitMinLimit
;
248 ++PreInlNumCSInlinedHitGrowthLimit
;
253 dbgs() << " Inline candidates ignored due to size limit (inliner "
255 << FuncSize
<< ", inliner final size: " << FuncFinalSize
256 << ", size limit: " << SizeLimit
<< ")\n";
258 while (!CQueue
.empty()) {
259 ProfiledInlineCandidate Candidate
= CQueue
.top();
262 Candidate
.CalleeSamples
->getContext().hasAttribute(ContextWasInlined
);
264 << ContextTracker
.getContextString(*Candidate
.CalleeSamples
)
265 << " (candidate size:" << Candidate
.SizeCost
266 << ", call count: " << Candidate
.CallsiteCount
<< ", previously "
267 << (WasInlined
? "inlined)\n" : "not inlined)\n");
272 void CSPreInliner::run() {
274 auto printProfileNames
= [](SampleContextTracker
&ContextTracker
,
277 for (auto *Node
: ContextTracker
) {
278 FunctionSamples
*FSamples
= Node
->getFunctionSamples();
281 dbgs() << " [" << ContextTracker
.getContextString(Node
) << "] "
282 << FSamples
->getTotalSamples() << ":"
283 << FSamples
->getHeadSamples() << "\n";
286 dbgs() << (IsInput
? "Input" : "Output") << " context-sensitive profiles ("
287 << Size
<< " total):\n";
291 LLVM_DEBUG(printProfileNames(ContextTracker
, true));
293 // Execute global pre-inliner to estimate a global top-down inline
294 // decision and merge profiles accordingly. This helps with profile
295 // merge for ThinLTO otherwise we won't be able to merge profiles back
296 // to base profile across module/thin-backend boundaries.
297 // It also helps better compress context profile to control profile
298 // size, as we now only need context profile for functions going to
300 for (FunctionId FuncName
: buildTopDownOrder()) {
301 processFunction(FuncName
);
304 // Not inlined context profiles are merged into its base, so we can
305 // trim out such profiles from the output.
306 for (auto *Node
: ContextTracker
) {
307 FunctionSamples
*FProfile
= Node
->getFunctionSamples();
309 (Node
->getParentContext() != &ContextTracker
.getRootContext() &&
310 !FProfile
->getContext().hasState(InlinedContext
))) {
311 Node
->setFunctionSamples(nullptr);
314 FunctionSamples::ProfileIsPreInlined
= true;
316 LLVM_DEBUG(printProfileNames(ContextTracker
, false));