[llvm-shlib] Fix the version naming style of libLLVM for Windows (#85710)
[llvm-project.git] / llvm / tools / llvm-profgen / CSPreInliner.cpp
blob87df6996aa435adf16acde338beb4f123ad49aeb
1 //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 #include "CSPreInliner.h"
10 #include "ProfiledBinary.h"
11 #include "llvm/ADT/SCCIterator.h"
12 #include "llvm/ADT/Statistic.h"
13 #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
14 #include "llvm/Transforms/IPO/SampleProfile.h"
15 #include <cstdint>
16 #include <queue>
18 #define DEBUG_TYPE "cs-preinliner"
20 using namespace llvm;
21 using namespace sampleprof;
23 STATISTIC(PreInlNumCSInlined,
24 "Number of functions inlined with context sensitive profile");
25 STATISTIC(PreInlNumCSNotInlined,
26 "Number of functions not inlined with context sensitive profile");
27 STATISTIC(PreInlNumCSInlinedHitMinLimit,
28 "Number of functions with FDO inline stopped due to min size limit");
29 STATISTIC(PreInlNumCSInlinedHitMaxLimit,
30 "Number of functions with FDO inline stopped due to max size limit");
31 STATISTIC(
32 PreInlNumCSInlinedHitGrowthLimit,
33 "Number of functions with FDO inline stopped due to growth size limit");
35 // The switches specify inline thresholds used in SampleProfileLoader inlining.
36 // TODO: the actual threshold to be tuned here because the size here is based
37 // on machine code not LLVM IR.
38 namespace llvm {
39 cl::opt<bool> EnableCSPreInliner(
40 "csspgo-preinliner", cl::Hidden, cl::init(true),
41 cl::desc("Run a global pre-inliner to merge context profile based on "
42 "estimated global top-down inline decisions"));
44 cl::opt<bool> UseContextCostForPreInliner(
45 "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
46 cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
47 } // namespace llvm
49 static cl::opt<bool> SamplePreInlineReplay(
50 "csspgo-replay-preinline", cl::Hidden, cl::init(false),
51 cl::desc(
52 "Replay previous inlining and adjust context profile accordingly"));
54 static cl::opt<int> CSPreinlMultiplierForPrevInl(
55 "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden,
56 cl::init(100),
57 cl::desc(
58 "Multiplier to bump up callsite threshold for previous inlining."));
60 CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
61 ProfiledBinary &Binary, ProfileSummary *Summary)
62 : UseContextCost(UseContextCostForPreInliner),
63 // TODO: Pass in a guid-to-name map in order for
64 // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
65 // as their profile context.
66 ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
67 // Set default preinliner hot/cold call site threshold tuned with CSSPGO.
68 // for good performance with reasonable profile size.
69 if (!SampleHotCallSiteThreshold.getNumOccurrences())
70 SampleHotCallSiteThreshold = 1500;
71 if (!SampleColdCallSiteThreshold.getNumOccurrences())
72 SampleColdCallSiteThreshold = 0;
73 if (!ProfileInlineLimitMax.getNumOccurrences())
74 ProfileInlineLimitMax = 50000;
77 std::vector<FunctionId> CSPreInliner::buildTopDownOrder() {
78 std::vector<FunctionId> Order;
79 // Trim cold edges to get a more stable call graph. This allows for a more
80 // stable top-down order which in turns helps the stablity of the generated
81 // profile from run to run.
82 uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
83 (Summary->getDetailedSummary()));
84 ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold);
86 // Now that we have a profiled call graph, construct top-down order
87 // by building up SCC and reversing SCC order.
88 scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
89 while (!I.isAtEnd()) {
90 auto Range = *I;
91 if (SortProfiledSCC) {
92 // Sort nodes in one SCC based on callsite hotness.
93 scc_member_iterator<ProfiledCallGraph *> SI(*I);
94 Range = *SI;
96 for (auto *Node : Range) {
97 if (Node != ProfiledCG.getEntryNode())
98 Order.push_back(Node->Name);
100 ++I;
102 std::reverse(Order.begin(), Order.end());
104 return Order;
107 bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
108 const FunctionSamples *CallerSamples) {
109 assert(CallerSamples && "Expect non-null caller samples");
111 // Ideally we want to consider everything a function calls, but as far as
112 // context profile is concerned, only those frames that are children of
113 // current one in the trie is relavent. So we walk the trie instead of call
114 // targets from function profile.
115 ContextTrieNode *CallerNode =
116 ContextTracker.getContextNodeForProfile(CallerSamples);
118 bool HasNewCandidate = false;
119 for (auto &Child : CallerNode->getAllChildContext()) {
120 ContextTrieNode *CalleeNode = &Child.second;
121 FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
122 if (!CalleeSamples)
123 continue;
125 // Call site count is more reliable, so we look up the corresponding call
126 // target profile in caller's context profile to retrieve call site count.
127 uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
128 uint64_t CallsiteCount = 0;
129 LineLocation Callsite = CalleeNode->getCallSiteLoc();
130 if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
131 auto It = CallTargets->find(CalleeSamples->getFunction());
132 if (It != CallTargets->end())
133 CallsiteCount = It->second;
136 // TODO: call site and callee entry count should be mostly consistent, add
137 // check for that.
138 HasNewCandidate = true;
139 uint32_t CalleeSize = getFuncSize(CalleeNode);
140 CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
141 CalleeSize);
144 return HasNewCandidate;
147 uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
148 if (UseContextCost)
149 return Binary.getFuncSizeForContext(ContextNode);
151 return ContextNode->getFunctionSamples()->getBodySamples().size();
154 bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
155 bool WasInlined =
156 Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
157 // If replay inline is requested, simply follow the inline decision of the
158 // profiled binary.
159 if (SamplePreInlineReplay)
160 return WasInlined;
162 unsigned int SampleThreshold = SampleColdCallSiteThreshold;
163 uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
164 (Summary->getDetailedSummary()));
166 if (Candidate.CallsiteCount <= ColdCountThreshold)
167 SampleThreshold = SampleColdCallSiteThreshold;
168 else {
169 // Linearly adjust threshold based on normalized hotness, i.e, a value in
170 // [0,1]. Use 10% cutoff instead of the max count as the normalization
171 // upperbound for stability.
172 double NormalizationUpperBound =
173 ProfileSummaryBuilder::getEntryForPercentile(
174 Summary->getDetailedSummary(), 100000 /* 10% */)
175 .MinCount;
176 double NormalizationLowerBound = ColdCountThreshold;
177 double NormalizedHotness =
178 (Candidate.CallsiteCount - NormalizationLowerBound) /
179 (NormalizationUpperBound - NormalizationLowerBound);
180 if (NormalizedHotness > 1.0)
181 NormalizedHotness = 1.0;
182 // Add 1 to ensure hot callsites get a non-zero threshold, which could
183 // happen when SampleColdCallSiteThreshold is 0. This is when we do not
184 // want any inlining for cold callsites.
185 SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
186 SampleColdCallSiteThreshold + 1;
187 // Bump up the threshold to favor previous compiler inline decision. The
188 // compiler has more insight and knowledge about functions based on their IR
189 // and attribures and should be able to make a more reasonable inline
190 // decision.
191 if (WasInlined)
192 SampleThreshold *= CSPreinlMultiplierForPrevInl;
195 return (Candidate.SizeCost < SampleThreshold);
198 void CSPreInliner::processFunction(const FunctionId Name) {
199 FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
200 if (!FSamples)
201 return;
203 unsigned FuncSize =
204 getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
205 unsigned FuncFinalSize = FuncSize;
206 unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
207 SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
208 SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
210 LLVM_DEBUG(dbgs() << "Process " << Name
211 << " for context-sensitive pre-inlining (pre-inline size: "
212 << FuncSize << ", size limit: " << SizeLimit << ")\n");
214 ProfiledCandidateQueue CQueue;
215 getInlineCandidates(CQueue, FSamples);
217 while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
218 ProfiledInlineCandidate Candidate = CQueue.top();
219 CQueue.pop();
220 bool ShouldInline = false;
221 if ((ShouldInline = shouldInline(Candidate))) {
222 // We mark context as inlined as the corresponding context profile
223 // won't be merged into that function's base profile.
224 ++PreInlNumCSInlined;
225 ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
226 Candidate.CalleeSamples->getContext().setAttribute(
227 ContextShouldBeInlined);
228 FuncFinalSize += Candidate.SizeCost;
229 getInlineCandidates(CQueue, Candidate.CalleeSamples);
230 } else {
231 ++PreInlNumCSNotInlined;
233 LLVM_DEBUG(
234 dbgs() << (ShouldInline ? " Inlined" : " Outlined")
235 << " context profile for: "
236 << ContextTracker.getContextString(*Candidate.CalleeSamples)
237 << " (callee size: " << Candidate.SizeCost
238 << ", call count:" << Candidate.CallsiteCount << ")\n");
241 if (!CQueue.empty()) {
242 if (SizeLimit == (unsigned)ProfileInlineLimitMax)
243 ++PreInlNumCSInlinedHitMaxLimit;
244 else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
245 ++PreInlNumCSInlinedHitMinLimit;
246 else
247 ++PreInlNumCSInlinedHitGrowthLimit;
250 LLVM_DEBUG({
251 if (!CQueue.empty())
252 dbgs() << " Inline candidates ignored due to size limit (inliner "
253 "original size: "
254 << FuncSize << ", inliner final size: " << FuncFinalSize
255 << ", size limit: " << SizeLimit << ")\n";
257 while (!CQueue.empty()) {
258 ProfiledInlineCandidate Candidate = CQueue.top();
259 CQueue.pop();
260 bool WasInlined =
261 Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
262 dbgs() << " "
263 << ContextTracker.getContextString(*Candidate.CalleeSamples)
264 << " (candidate size:" << Candidate.SizeCost
265 << ", call count: " << Candidate.CallsiteCount << ", previously "
266 << (WasInlined ? "inlined)\n" : "not inlined)\n");
271 void CSPreInliner::run() {
272 #ifndef NDEBUG
273 auto printProfileNames = [](SampleContextTracker &ContextTracker,
274 bool IsInput) {
275 uint32_t Size = 0;
276 for (auto *Node : ContextTracker) {
277 FunctionSamples *FSamples = Node->getFunctionSamples();
278 if (FSamples) {
279 Size++;
280 dbgs() << " [" << ContextTracker.getContextString(Node) << "] "
281 << FSamples->getTotalSamples() << ":"
282 << FSamples->getHeadSamples() << "\n";
285 dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
286 << Size << " total):\n";
288 #endif
290 LLVM_DEBUG(printProfileNames(ContextTracker, true));
292 // Execute global pre-inliner to estimate a global top-down inline
293 // decision and merge profiles accordingly. This helps with profile
294 // merge for ThinLTO otherwise we won't be able to merge profiles back
295 // to base profile across module/thin-backend boundaries.
296 // It also helps better compress context profile to control profile
297 // size, as we now only need context profile for functions going to
298 // be inlined.
299 for (FunctionId FuncName : buildTopDownOrder()) {
300 processFunction(FuncName);
303 // Not inlined context profiles are merged into its base, so we can
304 // trim out such profiles from the output.
305 for (auto *Node : ContextTracker) {
306 FunctionSamples *FProfile = Node->getFunctionSamples();
307 if (FProfile &&
308 (Node->getParentContext() != &ContextTracker.getRootContext() &&
309 !FProfile->getContext().hasState(InlinedContext))) {
310 Node->setFunctionSamples(nullptr);
313 FunctionSamples::ProfileIsPreInlined = true;
315 LLVM_DEBUG(printProfileNames(ContextTracker, false));