llvm/tools/llvm-profgen/CSPreInliner.cpp (blob 025d3ca5a6da5adbcd57633ca5f5d14055176b88)

//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CSPreInliner.h"
#include "ProfiledBinary.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include "llvm/Transforms/IPO/SampleProfile.h"
#include <cstdint>
#include <queue>

#define DEBUG_TYPE "cs-preinliner"

using namespace llvm;
using namespace sampleprof;

STATISTIC(PreInlNumCSInlined,
          "Number of functions inlined with context sensitive profile");
STATISTIC(PreInlNumCSNotInlined,
          "Number of functions not inlined with context sensitive profile");
STATISTIC(PreInlNumCSInlinedHitMinLimit,
          "Number of functions with FDO inline stopped due to min size limit");
STATISTIC(PreInlNumCSInlinedHitMaxLimit,
          "Number of functions with FDO inline stopped due to max size limit");
STATISTIC(
    PreInlNumCSInlinedHitGrowthLimit,
    "Number of functions with FDO inline stopped due to growth size limit");

// The switches specify inline thresholds used in SampleProfileLoader inlining.
// TODO: the actual thresholds need to be tuned here, because the sizes used
// here are based on machine code, not LLVM IR.
namespace llvm {
cl::opt<bool> EnableCSPreInliner(
    "csspgo-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Run a global pre-inliner to merge context profile based on "
             "estimated global top-down inline decisions"));

cl::opt<bool> UseContextCostForPreInliner(
    "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
} // namespace llvm

static cl::opt<bool> SamplePreInlineReplay(
    "csspgo-replay-preinline", cl::Hidden, cl::init(false),
    cl::desc(
        "Replay previous inlining and adjust context profile accordingly"));

static cl::opt<int> CSPreinlMultiplierForPrevInl(
    "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden,
    cl::init(100),
    cl::desc(
        "Multiplier to bump up callsite threshold for previous inlining."));
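
// Note: with the default multiplier of 100, a call site that the profiled
// binary had already inlined gets its size threshold scaled by 100x in
// shouldInline() below, strongly favoring the previous inline decision.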

CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
                           ProfiledBinary &Binary, ProfileSummary *Summary)
    : UseContextCost(UseContextCostForPreInliner),
      // TODO: Pass in a guid-to-name map in order for
      // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5
      // codes as their profile context.
      ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
  // Set default preinliner hot/cold call site thresholds, tuned with CSSPGO
  // for good performance with reasonable profile size.
  if (!SampleHotCallSiteThreshold.getNumOccurrences())
    SampleHotCallSiteThreshold = 1500;
  if (!SampleColdCallSiteThreshold.getNumOccurrences())
    SampleColdCallSiteThreshold = 0;
  if (!ProfileInlineLimitMax.getNumOccurrences())
    ProfileInlineLimitMax = 50000;
}

std::vector<FunctionId> CSPreInliner::buildTopDownOrder() {
  std::vector<FunctionId> Order;
  // Trim cold edges to get a more stable call graph. This allows for a more
  // stable top-down order, which in turn helps the stability of the generated
  // profile from run to run.
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));
  ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold);

  // Now that we have a profiled call graph, construct top-down order
  // by building up SCC and reversing SCC order.
  scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
  while (!I.isAtEnd()) {
    auto Range = *I;
    if (SortProfiledSCC) {
      // Sort nodes in one SCC based on callsite hotness.
      scc_member_iterator<ProfiledCallGraph *> SI(*I);
      Range = *SI;
    }
    for (auto *Node : Range) {
      if (Node != ProfiledCG.getEntryNode())
        Order.push_back(Node->Name);
    }
    ++I;
  }
  std::reverse(Order.begin(), Order.end());

  return Order;
}
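
// Illustrative example (hypothetical call graph): if main calls foo and foo
// calls bar, scc_begin visits SCCs bottom-up (bar, foo, main), so the reversed
// order processed by the pre-inliner is main, foo, bar. Mutually recursive
// functions fall into a single SCC, whose members SortProfiledSCC orders by
// callsite hotness.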

bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
                                       const FunctionSamples *CallerSamples) {
  assert(CallerSamples && "Expect non-null caller samples");

  // Ideally we want to consider everything a function calls, but as far as
  // context profile is concerned, only those frames that are children of the
  // current one in the trie are relevant. So we walk the trie instead of the
  // call targets from the function profile.
  ContextTrieNode *CallerNode =
      ContextTracker.getContextNodeForProfile(CallerSamples);

  bool HasNewCandidate = false;
  for (auto &Child : CallerNode->getAllChildContext()) {
    ContextTrieNode *CalleeNode = &Child.second;
    FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
    if (!CalleeSamples)
      continue;

    // Call site count is more reliable, so we look up the corresponding call
    // target profile in the caller's context profile to retrieve the call
    // site count.
    uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
    uint64_t CallsiteCount = 0;
    LineLocation Callsite = CalleeNode->getCallSiteLoc();
    if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
      SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
      auto It = TargetCounts.find(CalleeSamples->getFunction());
      if (It != TargetCounts.end())
        CallsiteCount = It->second;
    }

    // TODO: call site and callee entry count should be mostly consistent; add
    // a check for that.
    HasNewCandidate = true;
    uint32_t CalleeSize = getFuncSize(CalleeNode);
    CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
                   CalleeSize);
  }

  return HasNewCandidate;
}
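
// Illustrative example (hypothetical contexts): for a caller context such as
// [main:3 @ foo], each child of foo's trie node, e.g. bar called at line 5 of
// foo, is pushed as a candidate with count max(call site count, callee entry
// count) and the context-sensitive size returned by getFuncSize().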

uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
  if (UseContextCost)
    return Binary.getFuncSizeForContext(ContextNode);

  return ContextNode->getFunctionSamples()->getBodySamples().size();
}

bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
  bool WasInlined =
      Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
  // If replay inline is requested, simply follow the inline decision of the
  // profiled binary.
  if (SamplePreInlineReplay)
    return WasInlined;

  unsigned int SampleThreshold = SampleColdCallSiteThreshold;
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));

  if (Candidate.CallsiteCount <= ColdCountThreshold)
    SampleThreshold = SampleColdCallSiteThreshold;
  else {
    // Linearly adjust the threshold based on normalized hotness, i.e., a value
    // in [0,1]. Use the 10% cutoff instead of the max count as the
    // normalization upper bound for stability.
    double NormalizationUpperBound =
        ProfileSummaryBuilder::getEntryForPercentile(
            Summary->getDetailedSummary(), 100000 /* 10% */)
            .MinCount;
    double NormalizationLowerBound = ColdCountThreshold;
    double NormalizedHotness =
        (Candidate.CallsiteCount - NormalizationLowerBound) /
        (NormalizationUpperBound - NormalizationLowerBound);
    if (NormalizedHotness > 1.0)
      NormalizedHotness = 1.0;
    // Add 1 to ensure hot callsites get a non-zero threshold, which could
    // otherwise be zero when SampleColdCallSiteThreshold is 0, the setting
    // used when we do not want any inlining for cold callsites.
    SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
                      SampleColdCallSiteThreshold + 1;
    // Bump up the threshold to favor the previous compiler inline decision.
    // The compiler has more insight and knowledge about functions based on
    // their IR and attributes, and should be able to make a more reasonable
    // inline decision.
    if (WasInlined)
      SampleThreshold *= CSPreinlMultiplierForPrevInl;
  }

  return (Candidate.SizeCost < SampleThreshold);
}
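
// Worked example (hypothetical counts): with the constructor defaults of
// SampleHotCallSiteThreshold = 1500 and SampleColdCallSiteThreshold = 0, a
// call site at or above the normalization upper bound (NormalizedHotness =
// 1.0) gets SampleThreshold = 1500 * 1.0 * 100 + 0 + 1 = 150001, while one
// just above the cold count threshold gets a threshold of roughly 1. If the
// profiled binary had already inlined the call site, the default 100x
// multiplier raises the threshold further.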

void CSPreInliner::processFunction(const FunctionId Name) {
  FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
  if (!FSamples)
    return;

  unsigned FuncSize =
      getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
  unsigned FuncFinalSize = FuncSize;
  unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
  SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
  SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
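  // Worked example (hypothetical growth limit): assuming
  // ProfileInlineGrowthLimit is 12 and ProfileInlineLimitMax keeps the 50000
  // default set in the constructor, a function with a pre-inline size of 100
  // gets SizeLimit = min(100 * 12, 50000) = 1200, then clamped up to at least
  // ProfileInlineLimitMin.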

  LLVM_DEBUG(dbgs() << "Process " << Name
                    << " for context-sensitive pre-inlining (pre-inline size: "
                    << FuncSize << ", size limit: " << SizeLimit << ")\n");

  ProfiledCandidateQueue CQueue;
  getInlineCandidates(CQueue, FSamples);

  while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
    ProfiledInlineCandidate Candidate = CQueue.top();
    CQueue.pop();
    bool ShouldInline = false;
    if ((ShouldInline = shouldInline(Candidate))) {
      // We mark the context as inlined, as the corresponding context profile
      // won't be merged into that function's base profile.
      ++PreInlNumCSInlined;
      ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
      Candidate.CalleeSamples->getContext().setAttribute(
          ContextShouldBeInlined);
      FuncFinalSize += Candidate.SizeCost;
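      // Inlining this candidate exposes its own callees (children in the
      // context trie) as new candidates, so inline decisions are made
      // transitively down the context.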
      getInlineCandidates(CQueue, Candidate.CalleeSamples);
    } else {
      ++PreInlNumCSNotInlined;
    }
    LLVM_DEBUG(
        dbgs() << (ShouldInline ? "  Inlined" : "  Outlined")
               << " context profile for: "
               << ContextTracker.getContextString(*Candidate.CalleeSamples)
               << " (callee size: " << Candidate.SizeCost
               << ", call count:" << Candidate.CallsiteCount << ")\n");
  }

  if (!CQueue.empty()) {
    if (SizeLimit == (unsigned)ProfileInlineLimitMax)
      ++PreInlNumCSInlinedHitMaxLimit;
    else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
      ++PreInlNumCSInlinedHitMinLimit;
    else
      ++PreInlNumCSInlinedHitGrowthLimit;
  }

  LLVM_DEBUG({
    if (!CQueue.empty())
      dbgs() << "  Inline candidates ignored due to size limit (inliner "
                "original size: "
             << FuncSize << ", inliner final size: " << FuncFinalSize
             << ", size limit: " << SizeLimit << ")\n";

    while (!CQueue.empty()) {
      ProfiledInlineCandidate Candidate = CQueue.top();
      CQueue.pop();
      bool WasInlined =
          Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
      dbgs() << "    "
             << ContextTracker.getContextString(*Candidate.CalleeSamples)
             << " (candidate size:" << Candidate.SizeCost
             << ", call count: " << Candidate.CallsiteCount << ", previously "
             << (WasInlined ? "inlined)\n" : "not inlined)\n");
    }
  });
}

void CSPreInliner::run() {
#ifndef NDEBUG
  auto printProfileNames = [](SampleContextTracker &ContextTracker,
                              bool IsInput) {
    uint32_t Size = 0;
    for (auto *Node : ContextTracker) {
      FunctionSamples *FSamples = Node->getFunctionSamples();
      if (FSamples) {
        Size++;
        dbgs() << "  [" << ContextTracker.getContextString(Node) << "] "
               << FSamples->getTotalSamples() << ":"
               << FSamples->getHeadSamples() << "\n";
      }
    }
    dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
           << Size << " total):\n";
  };
#endif

  LLVM_DEBUG(printProfileNames(ContextTracker, true));

  // Execute the global pre-inliner to estimate a global top-down inline
  // decision and merge profiles accordingly. This helps with profile merging
  // for ThinLTO; otherwise we won't be able to merge profiles back into the
  // base profile across module/thin-backend boundaries.
  // It also helps compress the context profile to control profile size, as we
  // now only need context profiles for functions that are going to be inlined.
  for (FunctionId FuncName : buildTopDownOrder()) {
    processFunction(FuncName);
  }

  // Context profiles that were not inlined are merged into their base profile,
  // so we can trim such profiles from the output.
  for (auto *Node : ContextTracker) {
    FunctionSamples *FProfile = Node->getFunctionSamples();
    if (FProfile &&
        (Node->getParentContext() != &ContextTracker.getRootContext() &&
         !FProfile->getContext().hasState(InlinedContext))) {
      Node->setFunctionSamples(nullptr);
    }
  }

  FunctionSamples::ProfileIsPreInlined = true;

  LLVM_DEBUG(printProfileNames(ContextTracker, false));
}
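
// Usage sketch (illustrative): a driver that owns a SampleContextTracker, a
// ProfiledBinary, and a ProfileSummary would typically run
//   CSPreInliner(ContextTracker, Binary, Summary).run();
// which updates the tracked context profiles in place, marking contexts to be
// inlined and trimming the rest.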