1 //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "CSPreInliner.h"
10 #include "ProfiledBinary.h"
11 #include "llvm/ADT/SCCIterator.h"
12 #include "llvm/ADT/Statistic.h"
16 #define DEBUG_TYPE "cs-preinliner"
19 using namespace sampleprof
;
21 STATISTIC(PreInlNumCSInlined
,
22 "Number of functions inlined with context sensitive profile");
23 STATISTIC(PreInlNumCSNotInlined
,
24 "Number of functions not inlined with context sensitive profile");
25 STATISTIC(PreInlNumCSInlinedHitMinLimit
,
26 "Number of functions with FDO inline stopped due to min size limit");
27 STATISTIC(PreInlNumCSInlinedHitMaxLimit
,
28 "Number of functions with FDO inline stopped due to max size limit");
30 PreInlNumCSInlinedHitGrowthLimit
,
31 "Number of functions with FDO inline stopped due to growth size limit");
33 // The switches specify inline thresholds used in SampleProfileLoader inlining.
34 // TODO: the actual threshold to be tuned here because the size here is based
35 // on machine code not LLVM IR.
36 extern cl::opt
<int> SampleHotCallSiteThreshold
;
37 extern cl::opt
<int> SampleColdCallSiteThreshold
;
38 extern cl::opt
<int> ProfileInlineGrowthLimit
;
39 extern cl::opt
<int> ProfileInlineLimitMin
;
40 extern cl::opt
<int> ProfileInlineLimitMax
;
41 extern cl::opt
<bool> SortProfiledSCC
;
43 cl::opt
<bool> EnableCSPreInliner(
44 "csspgo-preinliner", cl::Hidden
, cl::init(true),
45 cl::desc("Run a global pre-inliner to merge context profile based on "
46 "estimated global top-down inline decisions"));
48 cl::opt
<bool> UseContextCostForPreInliner(
49 "use-context-cost-for-preinliner", cl::Hidden
, cl::init(true),
50 cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
52 static cl::opt
<bool> SamplePreInlineReplay(
53 "csspgo-replay-preinline", cl::Hidden
, cl::init(false),
55 "Replay previous inlining and adjust context profile accordingly"));
57 CSPreInliner::CSPreInliner(SampleProfileMap
&Profiles
, ProfiledBinary
&Binary
,
58 uint64_t HotThreshold
, uint64_t ColdThreshold
)
59 : UseContextCost(UseContextCostForPreInliner
),
60 // TODO: Pass in a guid-to-name map in order for
61 // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
62 // as their profile context.
63 ContextTracker(Profiles
, nullptr), ProfileMap(Profiles
), Binary(Binary
),
64 HotCountThreshold(HotThreshold
), ColdCountThreshold(ColdThreshold
) {
65 // Set default preinliner hot/cold call site threshold tuned with CSSPGO.
66 // for good performance with reasonable profile size.
67 if (!SampleHotCallSiteThreshold
.getNumOccurrences())
68 SampleHotCallSiteThreshold
= 1500;
69 if (!SampleColdCallSiteThreshold
.getNumOccurrences())
70 SampleColdCallSiteThreshold
= 0;
73 std::vector
<StringRef
> CSPreInliner::buildTopDownOrder() {
74 std::vector
<StringRef
> Order
;
75 ProfiledCallGraph
ProfiledCG(ContextTracker
);
77 // Now that we have a profiled call graph, construct top-down order
78 // by building up SCC and reversing SCC order.
79 scc_iterator
<ProfiledCallGraph
*> I
= scc_begin(&ProfiledCG
);
80 while (!I
.isAtEnd()) {
82 if (SortProfiledSCC
) {
83 // Sort nodes in one SCC based on callsite hotness.
84 scc_member_iterator
<ProfiledCallGraph
*> SI(*I
);
87 for (auto *Node
: Range
) {
88 if (Node
!= ProfiledCG
.getEntryNode())
89 Order
.push_back(Node
->Name
);
93 std::reverse(Order
.begin(), Order
.end());
98 bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue
&CQueue
,
99 const FunctionSamples
*CallerSamples
) {
100 assert(CallerSamples
&& "Expect non-null caller samples");
102 // Ideally we want to consider everything a function calls, but as far as
103 // context profile is concerned, only those frames that are children of
104 // current one in the trie is relavent. So we walk the trie instead of call
105 // targets from function profile.
106 ContextTrieNode
*CallerNode
=
107 ContextTracker
.getContextFor(CallerSamples
->getContext());
109 bool HasNewCandidate
= false;
110 for (auto &Child
: CallerNode
->getAllChildContext()) {
111 ContextTrieNode
*CalleeNode
= &Child
.second
;
112 FunctionSamples
*CalleeSamples
= CalleeNode
->getFunctionSamples();
116 // Call site count is more reliable, so we look up the corresponding call
117 // target profile in caller's context profile to retrieve call site count.
118 uint64_t CalleeEntryCount
= CalleeSamples
->getEntrySamples();
119 uint64_t CallsiteCount
= 0;
120 LineLocation Callsite
= CalleeNode
->getCallSiteLoc();
121 if (auto CallTargets
= CallerSamples
->findCallTargetMapAt(Callsite
)) {
122 SampleRecord::CallTargetMap
&TargetCounts
= CallTargets
.get();
123 auto It
= TargetCounts
.find(CalleeSamples
->getName());
124 if (It
!= TargetCounts
.end())
125 CallsiteCount
= It
->second
;
128 // TODO: call site and callee entry count should be mostly consistent, add
130 HasNewCandidate
= true;
131 uint32_t CalleeSize
= getFuncSize(*CalleeSamples
);
132 CQueue
.emplace(CalleeSamples
, std::max(CallsiteCount
, CalleeEntryCount
),
136 return HasNewCandidate
;
139 uint32_t CSPreInliner::getFuncSize(const FunctionSamples
&FSamples
) {
140 if (UseContextCost
) {
141 return Binary
.getFuncSizeForContext(FSamples
.getContext());
144 return FSamples
.getBodySamples().size();
147 bool CSPreInliner::shouldInline(ProfiledInlineCandidate
&Candidate
) {
148 // If replay inline is requested, simply follow the inline decision of the
150 if (SamplePreInlineReplay
)
151 return Candidate
.CalleeSamples
->getContext().hasAttribute(
154 // Adjust threshold based on call site hotness, only do this for callsite
155 // prioritized inliner because otherwise cost-benefit check is done earlier.
156 unsigned int SampleThreshold
= SampleColdCallSiteThreshold
;
157 if (Candidate
.CallsiteCount
> HotCountThreshold
)
158 SampleThreshold
= SampleHotCallSiteThreshold
;
160 // TODO: for small cold functions, we may inlined them and we need to keep
161 // context profile accordingly.
162 if (Candidate
.CallsiteCount
< ColdCountThreshold
)
163 SampleThreshold
= SampleColdCallSiteThreshold
;
165 return (Candidate
.SizeCost
< SampleThreshold
);
168 void CSPreInliner::processFunction(const StringRef Name
) {
169 FunctionSamples
*FSamples
= ContextTracker
.getBaseSamplesFor(Name
);
173 unsigned FuncSize
= getFuncSize(*FSamples
);
174 unsigned FuncFinalSize
= FuncSize
;
175 unsigned SizeLimit
= FuncSize
* ProfileInlineGrowthLimit
;
176 SizeLimit
= std::min(SizeLimit
, (unsigned)ProfileInlineLimitMax
);
177 SizeLimit
= std::max(SizeLimit
, (unsigned)ProfileInlineLimitMin
);
179 LLVM_DEBUG(dbgs() << "Process " << Name
180 << " for context-sensitive pre-inlining (pre-inline size: "
181 << FuncSize
<< ", size limit: " << SizeLimit
<< ")\n");
183 ProfiledCandidateQueue CQueue
;
184 getInlineCandidates(CQueue
, FSamples
);
186 while (!CQueue
.empty() && FuncFinalSize
< SizeLimit
) {
187 ProfiledInlineCandidate Candidate
= CQueue
.top();
189 bool ShouldInline
= false;
190 if ((ShouldInline
= shouldInline(Candidate
))) {
191 // We mark context as inlined as the corresponding context profile
192 // won't be merged into that function's base profile.
193 ++PreInlNumCSInlined
;
194 ContextTracker
.markContextSamplesInlined(Candidate
.CalleeSamples
);
195 Candidate
.CalleeSamples
->getContext().setAttribute(
196 ContextShouldBeInlined
);
197 FuncFinalSize
+= Candidate
.SizeCost
;
198 getInlineCandidates(CQueue
, Candidate
.CalleeSamples
);
200 ++PreInlNumCSNotInlined
;
202 LLVM_DEBUG(dbgs() << (ShouldInline
? " Inlined" : " Outlined")
203 << " context profile for: "
204 << Candidate
.CalleeSamples
->getContext().toString()
205 << " (callee size: " << Candidate
.SizeCost
206 << ", call count:" << Candidate
.CallsiteCount
<< ")\n");
209 if (!CQueue
.empty()) {
210 if (SizeLimit
== (unsigned)ProfileInlineLimitMax
)
211 ++PreInlNumCSInlinedHitMaxLimit
;
212 else if (SizeLimit
== (unsigned)ProfileInlineLimitMin
)
213 ++PreInlNumCSInlinedHitMinLimit
;
215 ++PreInlNumCSInlinedHitGrowthLimit
;
220 dbgs() << " Inline candidates ignored due to size limit (inliner "
222 << FuncSize
<< ", inliner final size: " << FuncFinalSize
223 << ", size limit: " << SizeLimit
<< ")\n";
225 while (!CQueue
.empty()) {
226 ProfiledInlineCandidate Candidate
= CQueue
.top();
229 Candidate
.CalleeSamples
->getContext().hasAttribute(ContextWasInlined
);
230 dbgs() << " " << Candidate
.CalleeSamples
->getContext().toString()
231 << " (candidate size:" << Candidate
.SizeCost
232 << ", call count: " << Candidate
.CallsiteCount
<< ", previously "
233 << (WasInlined
? "inlined)\n" : "not inlined)\n");
238 void CSPreInliner::run() {
240 auto printProfileNames
= [](SampleProfileMap
&Profiles
, bool IsInput
) {
241 dbgs() << (IsInput
? "Input" : "Output") << " context-sensitive profiles ("
242 << Profiles
.size() << " total):\n";
243 for (auto &It
: Profiles
) {
244 const FunctionSamples
&Samples
= It
.second
;
245 dbgs() << " [" << Samples
.getContext().toString() << "] "
246 << Samples
.getTotalSamples() << ":" << Samples
.getHeadSamples()
252 LLVM_DEBUG(printProfileNames(ProfileMap
, true));
254 // Execute global pre-inliner to estimate a global top-down inline
255 // decision and merge profiles accordingly. This helps with profile
256 // merge for ThinLTO otherwise we won't be able to merge profiles back
257 // to base profile across module/thin-backend boundaries.
258 // It also helps better compress context profile to control profile
259 // size, as we now only need context profile for functions going to
261 for (StringRef FuncName
: buildTopDownOrder()) {
262 processFunction(FuncName
);
265 // Not inlined context profiles are merged into its base, so we can
266 // trim out such profiles from the output.
267 std::vector
<SampleContext
> ProfilesToBeRemoved
;
268 for (auto &It
: ProfileMap
) {
269 SampleContext
&Context
= It
.second
.getContext();
270 if (!Context
.isBaseContext() && !Context
.hasState(InlinedContext
)) {
271 assert(Context
.hasState(MergedContext
) &&
272 "Not inlined context profile should be merged already");
273 ProfilesToBeRemoved
.push_back(It
.first
);
277 for (auto &ContextName
: ProfilesToBeRemoved
) {
278 ProfileMap
.erase(ContextName
);
281 // Make sure ProfileMap's key is consistent with FunctionSamples' name.
282 SampleContextTrimmer(ProfileMap
).canonicalizeContextProfiles();
284 LLVM_DEBUG(printProfileNames(ProfileMap
, false));