1 //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
#include "CSPreInliner.h"
#include "ProfiledBinary.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include "llvm/Transforms/IPO/SampleProfile.h"
#include <cstdint>
#include <queue>
18 #define DEBUG_TYPE "cs-preinliner"
21 using namespace sampleprof
;
23 STATISTIC(PreInlNumCSInlined
,
24 "Number of functions inlined with context sensitive profile");
25 STATISTIC(PreInlNumCSNotInlined
,
26 "Number of functions not inlined with context sensitive profile");
27 STATISTIC(PreInlNumCSInlinedHitMinLimit
,
28 "Number of functions with FDO inline stopped due to min size limit");
29 STATISTIC(PreInlNumCSInlinedHitMaxLimit
,
30 "Number of functions with FDO inline stopped due to max size limit");
32 PreInlNumCSInlinedHitGrowthLimit
,
33 "Number of functions with FDO inline stopped due to growth size limit");
35 // The switches specify inline thresholds used in SampleProfileLoader inlining.
36 // TODO: the actual threshold to be tuned here because the size here is based
37 // on machine code not LLVM IR.
39 cl::opt
<bool> EnableCSPreInliner(
40 "csspgo-preinliner", cl::Hidden
, cl::init(true),
41 cl::desc("Run a global pre-inliner to merge context profile based on "
42 "estimated global top-down inline decisions"));
44 cl::opt
<bool> UseContextCostForPreInliner(
45 "use-context-cost-for-preinliner", cl::Hidden
, cl::init(true),
46 cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
49 static cl::opt
<bool> SamplePreInlineReplay(
50 "csspgo-replay-preinline", cl::Hidden
, cl::init(false),
52 "Replay previous inlining and adjust context profile accordingly"));
54 static cl::opt
<int> CSPreinlMultiplierForPrevInl(
55 "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden
,
58 "Multiplier to bump up callsite threshold for previous inlining."));
60 CSPreInliner::CSPreInliner(SampleContextTracker
&Tracker
,
61 ProfiledBinary
&Binary
, ProfileSummary
*Summary
)
62 : UseContextCost(UseContextCostForPreInliner
),
63 // TODO: Pass in a guid-to-name map in order for
64 // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
65 // as their profile context.
66 ContextTracker(Tracker
), Binary(Binary
), Summary(Summary
) {
67 // Set default preinliner hot/cold call site threshold tuned with CSSPGO.
68 // for good performance with reasonable profile size.
69 if (!SampleHotCallSiteThreshold
.getNumOccurrences())
70 SampleHotCallSiteThreshold
= 1500;
71 if (!SampleColdCallSiteThreshold
.getNumOccurrences())
72 SampleColdCallSiteThreshold
= 0;
73 if (!ProfileInlineLimitMax
.getNumOccurrences())
74 ProfileInlineLimitMax
= 50000;
77 std::vector
<FunctionId
> CSPreInliner::buildTopDownOrder() {
78 std::vector
<FunctionId
> Order
;
79 // Trim cold edges to get a more stable call graph. This allows for a more
80 // stable top-down order which in turns helps the stablity of the generated
81 // profile from run to run.
82 uint64_t ColdCountThreshold
= ProfileSummaryBuilder::getColdCountThreshold(
83 (Summary
->getDetailedSummary()));
84 ProfiledCallGraph
ProfiledCG(ContextTracker
, ColdCountThreshold
);
86 // Now that we have a profiled call graph, construct top-down order
87 // by building up SCC and reversing SCC order.
88 scc_iterator
<ProfiledCallGraph
*> I
= scc_begin(&ProfiledCG
);
89 while (!I
.isAtEnd()) {
91 if (SortProfiledSCC
) {
92 // Sort nodes in one SCC based on callsite hotness.
93 scc_member_iterator
<ProfiledCallGraph
*> SI(*I
);
96 for (auto *Node
: Range
) {
97 if (Node
!= ProfiledCG
.getEntryNode())
98 Order
.push_back(Node
->Name
);
102 std::reverse(Order
.begin(), Order
.end());
107 bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue
&CQueue
,
108 const FunctionSamples
*CallerSamples
) {
109 assert(CallerSamples
&& "Expect non-null caller samples");
111 // Ideally we want to consider everything a function calls, but as far as
112 // context profile is concerned, only those frames that are children of
113 // current one in the trie is relavent. So we walk the trie instead of call
114 // targets from function profile.
115 ContextTrieNode
*CallerNode
=
116 ContextTracker
.getContextNodeForProfile(CallerSamples
);
118 bool HasNewCandidate
= false;
119 for (auto &Child
: CallerNode
->getAllChildContext()) {
120 ContextTrieNode
*CalleeNode
= &Child
.second
;
121 FunctionSamples
*CalleeSamples
= CalleeNode
->getFunctionSamples();
125 // Call site count is more reliable, so we look up the corresponding call
126 // target profile in caller's context profile to retrieve call site count.
127 uint64_t CalleeEntryCount
= CalleeSamples
->getHeadSamplesEstimate();
128 uint64_t CallsiteCount
= 0;
129 LineLocation Callsite
= CalleeNode
->getCallSiteLoc();
130 if (auto CallTargets
= CallerSamples
->findCallTargetMapAt(Callsite
)) {
131 SampleRecord::CallTargetMap
&TargetCounts
= CallTargets
.get();
132 auto It
= TargetCounts
.find(CalleeSamples
->getFunction());
133 if (It
!= TargetCounts
.end())
134 CallsiteCount
= It
->second
;
137 // TODO: call site and callee entry count should be mostly consistent, add
139 HasNewCandidate
= true;
140 uint32_t CalleeSize
= getFuncSize(CalleeNode
);
141 CQueue
.emplace(CalleeSamples
, std::max(CallsiteCount
, CalleeEntryCount
),
145 return HasNewCandidate
;
148 uint32_t CSPreInliner::getFuncSize(const ContextTrieNode
*ContextNode
) {
150 return Binary
.getFuncSizeForContext(ContextNode
);
152 return ContextNode
->getFunctionSamples()->getBodySamples().size();
155 bool CSPreInliner::shouldInline(ProfiledInlineCandidate
&Candidate
) {
157 Candidate
.CalleeSamples
->getContext().hasAttribute(ContextWasInlined
);
158 // If replay inline is requested, simply follow the inline decision of the
160 if (SamplePreInlineReplay
)
163 unsigned int SampleThreshold
= SampleColdCallSiteThreshold
;
164 uint64_t ColdCountThreshold
= ProfileSummaryBuilder::getColdCountThreshold(
165 (Summary
->getDetailedSummary()));
167 if (Candidate
.CallsiteCount
<= ColdCountThreshold
)
168 SampleThreshold
= SampleColdCallSiteThreshold
;
170 // Linearly adjust threshold based on normalized hotness, i.e, a value in
171 // [0,1]. Use 10% cutoff instead of the max count as the normalization
172 // upperbound for stability.
173 double NormalizationUpperBound
=
174 ProfileSummaryBuilder::getEntryForPercentile(
175 Summary
->getDetailedSummary(), 100000 /* 10% */)
177 double NormalizationLowerBound
= ColdCountThreshold
;
178 double NormalizedHotness
=
179 (Candidate
.CallsiteCount
- NormalizationLowerBound
) /
180 (NormalizationUpperBound
- NormalizationLowerBound
);
181 if (NormalizedHotness
> 1.0)
182 NormalizedHotness
= 1.0;
183 // Add 1 to ensure hot callsites get a non-zero threshold, which could
184 // happen when SampleColdCallSiteThreshold is 0. This is when we do not
185 // want any inlining for cold callsites.
186 SampleThreshold
= SampleHotCallSiteThreshold
* NormalizedHotness
* 100 +
187 SampleColdCallSiteThreshold
+ 1;
188 // Bump up the threshold to favor previous compiler inline decision. The
189 // compiler has more insight and knowledge about functions based on their IR
190 // and attribures and should be able to make a more reasonable inline
193 SampleThreshold
*= CSPreinlMultiplierForPrevInl
;
196 return (Candidate
.SizeCost
< SampleThreshold
);
199 void CSPreInliner::processFunction(const FunctionId Name
) {
200 FunctionSamples
*FSamples
= ContextTracker
.getBaseSamplesFor(Name
);
205 getFuncSize(ContextTracker
.getContextNodeForProfile(FSamples
));
206 unsigned FuncFinalSize
= FuncSize
;
207 unsigned SizeLimit
= FuncSize
* ProfileInlineGrowthLimit
;
208 SizeLimit
= std::min(SizeLimit
, (unsigned)ProfileInlineLimitMax
);
209 SizeLimit
= std::max(SizeLimit
, (unsigned)ProfileInlineLimitMin
);
211 LLVM_DEBUG(dbgs() << "Process " << Name
212 << " for context-sensitive pre-inlining (pre-inline size: "
213 << FuncSize
<< ", size limit: " << SizeLimit
<< ")\n");
215 ProfiledCandidateQueue CQueue
;
216 getInlineCandidates(CQueue
, FSamples
);
218 while (!CQueue
.empty() && FuncFinalSize
< SizeLimit
) {
219 ProfiledInlineCandidate Candidate
= CQueue
.top();
221 bool ShouldInline
= false;
222 if ((ShouldInline
= shouldInline(Candidate
))) {
223 // We mark context as inlined as the corresponding context profile
224 // won't be merged into that function's base profile.
225 ++PreInlNumCSInlined
;
226 ContextTracker
.markContextSamplesInlined(Candidate
.CalleeSamples
);
227 Candidate
.CalleeSamples
->getContext().setAttribute(
228 ContextShouldBeInlined
);
229 FuncFinalSize
+= Candidate
.SizeCost
;
230 getInlineCandidates(CQueue
, Candidate
.CalleeSamples
);
232 ++PreInlNumCSNotInlined
;
235 dbgs() << (ShouldInline
? " Inlined" : " Outlined")
236 << " context profile for: "
237 << ContextTracker
.getContextString(*Candidate
.CalleeSamples
)
238 << " (callee size: " << Candidate
.SizeCost
239 << ", call count:" << Candidate
.CallsiteCount
<< ")\n");
242 if (!CQueue
.empty()) {
243 if (SizeLimit
== (unsigned)ProfileInlineLimitMax
)
244 ++PreInlNumCSInlinedHitMaxLimit
;
245 else if (SizeLimit
== (unsigned)ProfileInlineLimitMin
)
246 ++PreInlNumCSInlinedHitMinLimit
;
248 ++PreInlNumCSInlinedHitGrowthLimit
;
253 dbgs() << " Inline candidates ignored due to size limit (inliner "
255 << FuncSize
<< ", inliner final size: " << FuncFinalSize
256 << ", size limit: " << SizeLimit
<< ")\n";
258 while (!CQueue
.empty()) {
259 ProfiledInlineCandidate Candidate
= CQueue
.top();
262 Candidate
.CalleeSamples
->getContext().hasAttribute(ContextWasInlined
);
264 << ContextTracker
.getContextString(*Candidate
.CalleeSamples
)
265 << " (candidate size:" << Candidate
.SizeCost
266 << ", call count: " << Candidate
.CallsiteCount
<< ", previously "
267 << (WasInlined
? "inlined)\n" : "not inlined)\n");
272 void CSPreInliner::run() {
274 auto printProfileNames
= [](SampleContextTracker
&ContextTracker
,
277 for (auto *Node
: ContextTracker
) {
278 FunctionSamples
*FSamples
= Node
->getFunctionSamples();
281 dbgs() << " [" << ContextTracker
.getContextString(Node
) << "] "
282 << FSamples
->getTotalSamples() << ":"
283 << FSamples
->getHeadSamples() << "\n";
286 dbgs() << (IsInput
? "Input" : "Output") << " context-sensitive profiles ("
287 << Size
<< " total):\n";
291 LLVM_DEBUG(printProfileNames(ContextTracker
, true));
293 // Execute global pre-inliner to estimate a global top-down inline
294 // decision and merge profiles accordingly. This helps with profile
295 // merge for ThinLTO otherwise we won't be able to merge profiles back
296 // to base profile across module/thin-backend boundaries.
297 // It also helps better compress context profile to control profile
298 // size, as we now only need context profile for functions going to
300 for (FunctionId FuncName
: buildTopDownOrder()) {
301 processFunction(FuncName
);
304 // Not inlined context profiles are merged into its base, so we can
305 // trim out such profiles from the output.
306 for (auto *Node
: ContextTracker
) {
307 FunctionSamples
*FProfile
= Node
->getFunctionSamples();
309 (Node
->getParentContext() != &ContextTracker
.getRootContext() &&
310 !FProfile
->getContext().hasState(InlinedContext
))) {
311 Node
->setFunctionSamples(nullptr);
314 FunctionSamples::ProfileIsPreInlined
= true;
316 LLVM_DEBUG(printProfileNames(ContextTracker
, false));