//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CSPreInliner.h"
#include "ProfiledBinary.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include <cstdint>
#include <queue>

#define DEBUG_TYPE "cs-preinliner"

using namespace llvm;
using namespace sampleprof;

STATISTIC(PreInlNumCSInlined,
          "Number of functions inlined with context sensitive profile");
STATISTIC(PreInlNumCSNotInlined,
          "Number of functions not inlined with context sensitive profile");
STATISTIC(PreInlNumCSInlinedHitMinLimit,
          "Number of functions with FDO inline stopped due to min size limit");
STATISTIC(PreInlNumCSInlinedHitMaxLimit,
          "Number of functions with FDO inline stopped due to max size limit");
STATISTIC(
    PreInlNumCSInlinedHitGrowthLimit,
    "Number of functions with FDO inline stopped due to growth size limit");

// The switches specify inline thresholds used in SampleProfileLoader inlining.
// TODO: the actual thresholds need to be tuned here because the size here is
// based on machine code, not LLVM IR.

extern cl::opt<int> SampleHotCallSiteThreshold;
extern cl::opt<int> SampleColdCallSiteThreshold;
extern cl::opt<int> ProfileInlineGrowthLimit;
extern cl::opt<int> ProfileInlineLimitMin;
extern cl::opt<int> ProfileInlineLimitMax;
extern cl::opt<bool> SortProfiledSCC;

cl::opt<bool> EnableCSPreInliner(
    "csspgo-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Run a global pre-inliner to merge context profile based on "
             "estimated global top-down inline decisions"));

cl::opt<bool> UseContextCostForPreInliner(
    "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Use context-sensitive byte size cost for preinliner decisions"));

static cl::opt<bool> SamplePreInlineReplay(
    "csspgo-replay-preinline", cl::Hidden, cl::init(false),
    cl::desc(
        "Replay previous inlining and adjust context profile accordingly"));

static cl::opt<int> CSPreinlMultiplierForPrevInl(
    "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden,
    cl::init(100),
    cl::desc(
        "Multiplier to bump up callsite threshold for previous inlining."));

CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
                           ProfiledBinary &Binary, ProfileSummary *Summary)
    : UseContextCost(UseContextCostForPreInliner),
      // TODO: Pass in a guid-to-name map in order for
      // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5
      // codes as their profile context.
      ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
  // Set default preinliner hot/cold call site thresholds tuned with CSSPGO
  // for good performance with reasonable profile size.
  if (!SampleHotCallSiteThreshold.getNumOccurrences())
    SampleHotCallSiteThreshold = 1500;
  if (!SampleColdCallSiteThreshold.getNumOccurrences())
    SampleColdCallSiteThreshold = 0;
  if (!ProfileInlineLimitMax.getNumOccurrences())
    ProfileInlineLimitMax = 50000;
}
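
// Build a top-down (callers before callees) processing order over the
// profiled call graph: cold edges are trimmed first, SCCs are visited
// bottom-up, and the collected list is reversed at the end.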
std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
  std::vector<StringRef> Order;
  // Trim cold edges to get a more stable call graph. This allows for a more
  // stable top-down order which in turn helps the stability of the generated
  // profile from run to run.
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));
  ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold);

  // Now that we have a profiled call graph, construct top-down order
  // by building up SCC and reversing SCC order.
  scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
  while (!I.isAtEnd()) {
    auto Range = *I;
    if (SortProfiledSCC) {
      // Sort nodes in one SCC based on callsite hotness.
      scc_member_iterator<ProfiledCallGraph *> SI(*I);
      Range = *SI;
    }
    for (auto *Node : Range) {
      if (Node != ProfiledCG.getEntryNode())
        Order.push_back(Node->Name);
    }
    ++I;
  }
  std::reverse(Order.begin(), Order.end());

  return Order;
}
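
// Walk the context trie children of CallerSamples and push each callee
// context onto CQueue, recording the larger of its callsite count and entry
// count as the candidate's count. Returns true if any candidate was added.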
bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
                                       const FunctionSamples *CallerSamples) {
  assert(CallerSamples && "Expect non-null caller samples");

  // Ideally we want to consider everything a function calls, but as far as
  // context profile is concerned, only those frames that are children of the
  // current one in the trie are relevant. So we walk the trie instead of the
  // call targets from the function profile.
  ContextTrieNode *CallerNode =
      ContextTracker.getContextNodeForProfile(CallerSamples);

  bool HasNewCandidate = false;
  for (auto &Child : CallerNode->getAllChildContext()) {
    ContextTrieNode *CalleeNode = &Child.second;
    FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
    if (!CalleeSamples)
      continue;

    // Call site count is more reliable, so we look up the corresponding call
    // target profile in caller's context profile to retrieve call site count.
    uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
    uint64_t CallsiteCount = 0;
    LineLocation Callsite = CalleeNode->getCallSiteLoc();
    if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
      SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
      auto It = TargetCounts.find(CalleeSamples->getName());
      if (It != TargetCounts.end())
        CallsiteCount = It->second;
    }

    // TODO: call site and callee entry count should be mostly consistent, add
    // a check for that.
    HasNewCandidate = true;
    uint32_t CalleeSize = getFuncSize(CalleeNode);
    CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
                   CalleeSize);
  }

  return HasNewCandidate;
}
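
// Size cost used for inline decisions: the context-sensitive byte size from
// the binary when context cost is enabled, otherwise the number of sampled
// body locations as a rough proxy for function size.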
uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
  if (UseContextCost)
    return Binary.getFuncSizeForContext(ContextNode);

  return ContextNode->getFunctionSamples()->getBodySamples().size();
}
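
// Decide whether to pre-inline Candidate. In replay mode, follow the inline
// decision recorded in the profiled binary; otherwise compare the candidate's
// size cost against a threshold interpolated between the cold and hot
// callsite thresholds based on normalized callsite hotness.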
bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
  bool WasInlined =
      Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
  // If replay inline is requested, simply follow the inline decision of the
  // profiled binary.
  if (SamplePreInlineReplay)
    return WasInlined;

  unsigned int SampleThreshold = SampleColdCallSiteThreshold;
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));

  if (Candidate.CallsiteCount <= ColdCountThreshold)
    SampleThreshold = SampleColdCallSiteThreshold;
  else {
    // Linearly adjust threshold based on normalized hotness, i.e., a value in
    // [0,1]. Use the 10% cutoff instead of the max count as the normalization
    // upper bound for stability.
    double NormalizationUpperBound =
        ProfileSummaryBuilder::getEntryForPercentile(
            Summary->getDetailedSummary(), 100000 /* 10% */)
            .MinCount;
    double NormalizationLowerBound = ColdCountThreshold;
    double NormalizedHotness =
        (Candidate.CallsiteCount - NormalizationLowerBound) /
        (NormalizationUpperBound - NormalizationLowerBound);
    if (NormalizedHotness > 1.0)
      NormalizedHotness = 1.0;
    // Add 1 to ensure hot callsites get a non-zero threshold, which could
    // otherwise happen when SampleColdCallSiteThreshold is 0, i.e., when we
    // do not want any inlining for cold callsites.
    SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
                      SampleColdCallSiteThreshold + 1;
    // Bump up the threshold to favor the previous compiler inline decision.
    // The compiler has more insight and knowledge about functions based on
    // their IR and attributes and should be able to make a more reasonable
    // inline decision.
    if (WasInlined)
      SampleThreshold *= CSPreinlMultiplierForPrevInl;
  }

  return (Candidate.SizeCost < SampleThreshold);
}
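
// Make pre-inline decisions for one function: seed a priority queue with its
// inline candidates, then keep taking candidates until the queue is empty or
// the size budget (growth limit clamped to the min/max limits) is exhausted.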
void CSPreInliner::processFunction(const StringRef Name) {
  FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
  if (!FSamples)
    return;

  unsigned FuncSize =
      getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
  unsigned FuncFinalSize = FuncSize;
  unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
  SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
  SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);

  LLVM_DEBUG(dbgs() << "Process " << Name
                    << " for context-sensitive pre-inlining (pre-inline size: "
                    << FuncSize << ", size limit: " << SizeLimit << ")\n");

  ProfiledCandidateQueue CQueue;
  getInlineCandidates(CQueue, FSamples);

  while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
    ProfiledInlineCandidate Candidate = CQueue.top();
    CQueue.pop();
    bool ShouldInline = false;
    if ((ShouldInline = shouldInline(Candidate))) {
      // We mark the context as inlined since the corresponding context
      // profile won't be merged into that function's base profile.
      ++PreInlNumCSInlined;
      ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
      Candidate.CalleeSamples->getContext().setAttribute(
          ContextShouldBeInlined);
      FuncFinalSize += Candidate.SizeCost;
      getInlineCandidates(CQueue, Candidate.CalleeSamples);
    } else {
      ++PreInlNumCSNotInlined;
    }
    LLVM_DEBUG(
        dbgs() << (ShouldInline ? "  Inlined" : "  Outlined")
               << " context profile for: "
               << ContextTracker.getContextString(*Candidate.CalleeSamples)
               << " (callee size: " << Candidate.SizeCost
               << ", call count:" << Candidate.CallsiteCount << ")\n");
  }

  if (!CQueue.empty()) {
    if (SizeLimit == (unsigned)ProfileInlineLimitMax)
      ++PreInlNumCSInlinedHitMaxLimit;
    else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
      ++PreInlNumCSInlinedHitMinLimit;
    else
      ++PreInlNumCSInlinedHitGrowthLimit;
  }
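
  // Report (debug output only) the candidates left in the queue that were
  // skipped because the size limit was hit.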
  LLVM_DEBUG({
    if (!CQueue.empty())
      dbgs() << "  Inline candidates ignored due to size limit (inliner "
                "original size: "
             << FuncSize << ", inliner final size: " << FuncFinalSize
             << ", size limit: " << SizeLimit << ")\n";

    while (!CQueue.empty()) {
      ProfiledInlineCandidate Candidate = CQueue.top();
      CQueue.pop();
      bool WasInlined =
          Candidate.CalleeSamples->getContext().hasAttribute(
              ContextWasInlined);
      dbgs() << "    "
             << ContextTracker.getContextString(*Candidate.CalleeSamples)
             << " (candidate size:" << Candidate.SizeCost
             << ", call count: " << Candidate.CallsiteCount << ", previously "
             << (WasInlined ? "inlined)\n" : "not inlined)\n");
    }
  });
}
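
// Top-level driver: process functions in top-down order, then trim context
// profiles that were neither top-level nor marked inlined, since those have
// been merged into their base profiles.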
void CSPreInliner::run() {
#ifndef NDEBUG
  auto printProfileNames = [](SampleContextTracker &ContextTracker,
                              bool IsInput) {
    uint32_t Size = 0;
    for (auto *Node : ContextTracker) {
      FunctionSamples *FSamples = Node->getFunctionSamples();
      if (FSamples) {
        Size++;
        dbgs() << "  [" << ContextTracker.getContextString(Node) << "] "
               << FSamples->getTotalSamples() << ":"
               << FSamples->getHeadSamples() << "\n";
      }
    }
    dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
           << Size << " total):\n";
  };
#endif

  LLVM_DEBUG(printProfileNames(ContextTracker, true));

  // Execute the global pre-inliner to estimate a global top-down inline
  // decision and merge profiles accordingly. This helps with profile merge
  // for ThinLTO, where otherwise we would not be able to merge profiles back
  // into the base profile across module/thin-backend boundaries.
  // It also helps better compress the context profile to control profile
  // size, as we now only need context profiles for functions that are going
  // to be inlined.
  for (StringRef FuncName : buildTopDownOrder()) {
    processFunction(FuncName);
  }

  // Context profiles that were not inlined have been merged into their base
  // profiles, so we can trim such profiles from the output.
  for (auto *Node : ContextTracker) {
    FunctionSamples *FProfile = Node->getFunctionSamples();
    if (FProfile &&
        (Node->getParentContext() != &ContextTracker.getRootContext() &&
         !FProfile->getContext().hasState(InlinedContext))) {
      Node->setFunctionSamples(nullptr);
    }
  }
  FunctionSamples::ProfileIsPreInlined = true;

  LLVM_DEBUG(printProfileNames(ContextTracker, false));
}