1 //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
#include "CSPreInliner.h"
#include "ProfiledBinary.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include "llvm/Transforms/IPO/SampleProfile.h"
#include <algorithm>
#include <cstdint>
#include <queue>
#include <vector>
18 #define DEBUG_TYPE "cs-preinliner"
21 using namespace sampleprof
;
23 STATISTIC(PreInlNumCSInlined
,
24 "Number of functions inlined with context sensitive profile");
25 STATISTIC(PreInlNumCSNotInlined
,
26 "Number of functions not inlined with context sensitive profile");
27 STATISTIC(PreInlNumCSInlinedHitMinLimit
,
28 "Number of functions with FDO inline stopped due to min size limit");
29 STATISTIC(PreInlNumCSInlinedHitMaxLimit
,
30 "Number of functions with FDO inline stopped due to max size limit");
32 PreInlNumCSInlinedHitGrowthLimit
,
33 "Number of functions with FDO inline stopped due to growth size limit");
35 // The switches specify inline thresholds used in SampleProfileLoader inlining.
// TODO: the actual thresholds need to be tuned here, because the size used
// here is based on machine code, not LLVM IR.
39 cl::opt
<bool> EnableCSPreInliner(
40 "csspgo-preinliner", cl::Hidden
, cl::init(true),
41 cl::desc("Run a global pre-inliner to merge context profile based on "
42 "estimated global top-down inline decisions"));
44 cl::opt
<bool> UseContextCostForPreInliner(
45 "use-context-cost-for-preinliner", cl::Hidden
, cl::init(true),
46 cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
49 static cl::opt
<bool> SamplePreInlineReplay(
50 "csspgo-replay-preinline", cl::Hidden
, cl::init(false),
52 "Replay previous inlining and adjust context profile accordingly"));
54 static cl::opt
<int> CSPreinlMultiplierForPrevInl(
55 "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden
,
58 "Multiplier to bump up callsite threshold for previous inlining."));
60 CSPreInliner::CSPreInliner(SampleContextTracker
&Tracker
,
61 ProfiledBinary
&Binary
, ProfileSummary
*Summary
)
62 : UseContextCost(UseContextCostForPreInliner
),
63 // TODO: Pass in a guid-to-name map in order for
64 // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
65 // as their profile context.
66 ContextTracker(Tracker
), Binary(Binary
), Summary(Summary
) {
67 // Set default preinliner hot/cold call site threshold tuned with CSSPGO.
68 // for good performance with reasonable profile size.
69 if (!SampleHotCallSiteThreshold
.getNumOccurrences())
70 SampleHotCallSiteThreshold
= 1500;
71 if (!SampleColdCallSiteThreshold
.getNumOccurrences())
72 SampleColdCallSiteThreshold
= 0;
73 if (!ProfileInlineLimitMax
.getNumOccurrences())
74 ProfileInlineLimitMax
= 50000;
77 std::vector
<FunctionId
> CSPreInliner::buildTopDownOrder() {
78 std::vector
<FunctionId
> Order
;
79 // Trim cold edges to get a more stable call graph. This allows for a more
80 // stable top-down order which in turns helps the stablity of the generated
81 // profile from run to run.
82 uint64_t ColdCountThreshold
= ProfileSummaryBuilder::getColdCountThreshold(
83 (Summary
->getDetailedSummary()));
84 ProfiledCallGraph
ProfiledCG(ContextTracker
, ColdCountThreshold
);
86 // Now that we have a profiled call graph, construct top-down order
87 // by building up SCC and reversing SCC order.
88 scc_iterator
<ProfiledCallGraph
*> I
= scc_begin(&ProfiledCG
);
89 while (!I
.isAtEnd()) {
91 if (SortProfiledSCC
) {
92 // Sort nodes in one SCC based on callsite hotness.
93 scc_member_iterator
<ProfiledCallGraph
*> SI(*I
);
96 for (auto *Node
: Range
) {
97 if (Node
!= ProfiledCG
.getEntryNode())
98 Order
.push_back(Node
->Name
);
102 std::reverse(Order
.begin(), Order
.end());
107 bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue
&CQueue
,
108 const FunctionSamples
*CallerSamples
) {
109 assert(CallerSamples
&& "Expect non-null caller samples");
111 // Ideally we want to consider everything a function calls, but as far as
112 // context profile is concerned, only those frames that are children of
113 // current one in the trie is relavent. So we walk the trie instead of call
114 // targets from function profile.
115 ContextTrieNode
*CallerNode
=
116 ContextTracker
.getContextNodeForProfile(CallerSamples
);
118 bool HasNewCandidate
= false;
119 for (auto &Child
: CallerNode
->getAllChildContext()) {
120 ContextTrieNode
*CalleeNode
= &Child
.second
;
121 FunctionSamples
*CalleeSamples
= CalleeNode
->getFunctionSamples();
125 // Call site count is more reliable, so we look up the corresponding call
126 // target profile in caller's context profile to retrieve call site count.
127 uint64_t CalleeEntryCount
= CalleeSamples
->getHeadSamplesEstimate();
128 uint64_t CallsiteCount
= 0;
129 LineLocation Callsite
= CalleeNode
->getCallSiteLoc();
130 if (auto CallTargets
= CallerSamples
->findCallTargetMapAt(Callsite
)) {
131 auto It
= CallTargets
->find(CalleeSamples
->getFunction());
132 if (It
!= CallTargets
->end())
133 CallsiteCount
= It
->second
;
136 // TODO: call site and callee entry count should be mostly consistent, add
138 HasNewCandidate
= true;
139 uint32_t CalleeSize
= getFuncSize(CalleeNode
);
140 CQueue
.emplace(CalleeSamples
, std::max(CallsiteCount
, CalleeEntryCount
),
144 return HasNewCandidate
;
147 uint32_t CSPreInliner::getFuncSize(const ContextTrieNode
*ContextNode
) {
149 return Binary
.getFuncSizeForContext(ContextNode
);
151 return ContextNode
->getFunctionSamples()->getBodySamples().size();
154 bool CSPreInliner::shouldInline(ProfiledInlineCandidate
&Candidate
) {
156 Candidate
.CalleeSamples
->getContext().hasAttribute(ContextWasInlined
);
157 // If replay inline is requested, simply follow the inline decision of the
159 if (SamplePreInlineReplay
)
162 unsigned int SampleThreshold
= SampleColdCallSiteThreshold
;
163 uint64_t ColdCountThreshold
= ProfileSummaryBuilder::getColdCountThreshold(
164 (Summary
->getDetailedSummary()));
166 if (Candidate
.CallsiteCount
<= ColdCountThreshold
)
167 SampleThreshold
= SampleColdCallSiteThreshold
;
169 // Linearly adjust threshold based on normalized hotness, i.e, a value in
170 // [0,1]. Use 10% cutoff instead of the max count as the normalization
171 // upperbound for stability.
172 double NormalizationUpperBound
=
173 ProfileSummaryBuilder::getEntryForPercentile(
174 Summary
->getDetailedSummary(), 100000 /* 10% */)
176 double NormalizationLowerBound
= ColdCountThreshold
;
177 double NormalizedHotness
=
178 (Candidate
.CallsiteCount
- NormalizationLowerBound
) /
179 (NormalizationUpperBound
- NormalizationLowerBound
);
180 if (NormalizedHotness
> 1.0)
181 NormalizedHotness
= 1.0;
182 // Add 1 to ensure hot callsites get a non-zero threshold, which could
183 // happen when SampleColdCallSiteThreshold is 0. This is when we do not
184 // want any inlining for cold callsites.
185 SampleThreshold
= SampleHotCallSiteThreshold
* NormalizedHotness
* 100 +
186 SampleColdCallSiteThreshold
+ 1;
187 // Bump up the threshold to favor previous compiler inline decision. The
188 // compiler has more insight and knowledge about functions based on their IR
189 // and attribures and should be able to make a more reasonable inline
192 SampleThreshold
*= CSPreinlMultiplierForPrevInl
;
195 return (Candidate
.SizeCost
< SampleThreshold
);
198 void CSPreInliner::processFunction(const FunctionId Name
) {
199 FunctionSamples
*FSamples
= ContextTracker
.getBaseSamplesFor(Name
);
204 getFuncSize(ContextTracker
.getContextNodeForProfile(FSamples
));
205 unsigned FuncFinalSize
= FuncSize
;
206 unsigned SizeLimit
= FuncSize
* ProfileInlineGrowthLimit
;
207 SizeLimit
= std::min(SizeLimit
, (unsigned)ProfileInlineLimitMax
);
208 SizeLimit
= std::max(SizeLimit
, (unsigned)ProfileInlineLimitMin
);
210 LLVM_DEBUG(dbgs() << "Process " << Name
211 << " for context-sensitive pre-inlining (pre-inline size: "
212 << FuncSize
<< ", size limit: " << SizeLimit
<< ")\n");
214 ProfiledCandidateQueue CQueue
;
215 getInlineCandidates(CQueue
, FSamples
);
217 while (!CQueue
.empty() && FuncFinalSize
< SizeLimit
) {
218 ProfiledInlineCandidate Candidate
= CQueue
.top();
220 bool ShouldInline
= false;
221 if ((ShouldInline
= shouldInline(Candidate
))) {
222 // We mark context as inlined as the corresponding context profile
223 // won't be merged into that function's base profile.
224 ++PreInlNumCSInlined
;
225 ContextTracker
.markContextSamplesInlined(Candidate
.CalleeSamples
);
226 Candidate
.CalleeSamples
->getContext().setAttribute(
227 ContextShouldBeInlined
);
228 FuncFinalSize
+= Candidate
.SizeCost
;
229 getInlineCandidates(CQueue
, Candidate
.CalleeSamples
);
231 ++PreInlNumCSNotInlined
;
234 dbgs() << (ShouldInline
? " Inlined" : " Outlined")
235 << " context profile for: "
236 << ContextTracker
.getContextString(*Candidate
.CalleeSamples
)
237 << " (callee size: " << Candidate
.SizeCost
238 << ", call count:" << Candidate
.CallsiteCount
<< ")\n");
241 if (!CQueue
.empty()) {
242 if (SizeLimit
== (unsigned)ProfileInlineLimitMax
)
243 ++PreInlNumCSInlinedHitMaxLimit
;
244 else if (SizeLimit
== (unsigned)ProfileInlineLimitMin
)
245 ++PreInlNumCSInlinedHitMinLimit
;
247 ++PreInlNumCSInlinedHitGrowthLimit
;
252 dbgs() << " Inline candidates ignored due to size limit (inliner "
254 << FuncSize
<< ", inliner final size: " << FuncFinalSize
255 << ", size limit: " << SizeLimit
<< ")\n";
257 while (!CQueue
.empty()) {
258 ProfiledInlineCandidate Candidate
= CQueue
.top();
261 Candidate
.CalleeSamples
->getContext().hasAttribute(ContextWasInlined
);
263 << ContextTracker
.getContextString(*Candidate
.CalleeSamples
)
264 << " (candidate size:" << Candidate
.SizeCost
265 << ", call count: " << Candidate
.CallsiteCount
<< ", previously "
266 << (WasInlined
? "inlined)\n" : "not inlined)\n");
271 void CSPreInliner::run() {
273 auto printProfileNames
= [](SampleContextTracker
&ContextTracker
,
276 for (auto *Node
: ContextTracker
) {
277 FunctionSamples
*FSamples
= Node
->getFunctionSamples();
280 dbgs() << " [" << ContextTracker
.getContextString(Node
) << "] "
281 << FSamples
->getTotalSamples() << ":"
282 << FSamples
->getHeadSamples() << "\n";
285 dbgs() << (IsInput
? "Input" : "Output") << " context-sensitive profiles ("
286 << Size
<< " total):\n";
290 LLVM_DEBUG(printProfileNames(ContextTracker
, true));
292 // Execute global pre-inliner to estimate a global top-down inline
293 // decision and merge profiles accordingly. This helps with profile
294 // merge for ThinLTO otherwise we won't be able to merge profiles back
295 // to base profile across module/thin-backend boundaries.
296 // It also helps better compress context profile to control profile
297 // size, as we now only need context profile for functions going to
299 for (FunctionId FuncName
: buildTopDownOrder()) {
300 processFunction(FuncName
);
303 // Not inlined context profiles are merged into its base, so we can
304 // trim out such profiles from the output.
305 for (auto *Node
: ContextTracker
) {
306 FunctionSamples
*FProfile
= Node
->getFunctionSamples();
308 (Node
->getParentContext() != &ContextTracker
.getRootContext() &&
309 !FProfile
->getContext().hasState(InlinedContext
))) {
310 Node
->setFunctionSamples(nullptr);
313 FunctionSamples::ProfileIsPreInlined
= true;
315 LLVM_DEBUG(printProfileNames(ContextTracker
, false));