//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
9 #include "CSPreInliner.h"
10 #include "ProfiledBinary.h"
11 #include "llvm/ADT/SCCIterator.h"
12 #include "llvm/ADT/Statistic.h"
13 #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
14 #include <cstdint>
15 #include <queue>
17 #define DEBUG_TYPE "cs-preinliner"
19 using namespace llvm;
20 using namespace sampleprof;
STATISTIC(PreInlNumCSInlined,
          "Number of functions inlined with context sensitive profile");
STATISTIC(PreInlNumCSNotInlined,
          "Number of functions not inlined with context sensitive profile");
STATISTIC(PreInlNumCSInlinedHitMinLimit,
          "Number of functions with FDO inline stopped due to min size limit");
STATISTIC(PreInlNumCSInlinedHitMaxLimit,
          "Number of functions with FDO inline stopped due to max size limit");
STATISTIC(
    PreInlNumCSInlinedHitGrowthLimit,
    "Number of functions with FDO inline stopped due to growth size limit");
// The switches specify inline thresholds used in SampleProfileLoader inlining.
// TODO: The actual thresholds still need to be tuned here because the size
// used here is based on machine code, not LLVM IR.
namespace llvm {
extern cl::opt<int> SampleHotCallSiteThreshold;
extern cl::opt<int> SampleColdCallSiteThreshold;
extern cl::opt<int> ProfileInlineGrowthLimit;
extern cl::opt<int> ProfileInlineLimitMin;
extern cl::opt<int> ProfileInlineLimitMax;
extern cl::opt<bool> SortProfiledSCC;

cl::opt<bool> EnableCSPreInliner(
    "csspgo-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Run a global pre-inliner to merge context profile based on "
             "estimated global top-down inline decisions"));

cl::opt<bool> UseContextCostForPreInliner(
    "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
} // namespace llvm
55 static cl::opt<bool> SamplePreInlineReplay(
56 "csspgo-replay-preinline", cl::Hidden, cl::init(false),
57 cl::desc(
58 "Replay previous inlining and adjust context profile accordingly"));
60 static cl::opt<int> CSPreinlMultiplierForPrevInl(
61 "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden,
62 cl::init(100),
63 cl::desc(
64 "Multiplier to bump up callsite threshold for previous inlining."));
CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
                           ProfiledBinary &Binary, ProfileSummary *Summary)
    : UseContextCost(UseContextCostForPreInliner),
      // TODO: Pass in a guid-to-name map in order for
      // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5
      // codes as their profile context.
      ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
  // Set default preinliner hot/cold call site thresholds tuned with CSSPGO
  // for good performance with reasonable profile size.
  if (!SampleHotCallSiteThreshold.getNumOccurrences())
    SampleHotCallSiteThreshold = 1500;
  if (!SampleColdCallSiteThreshold.getNumOccurrences())
    SampleColdCallSiteThreshold = 0;
  if (!ProfileInlineLimitMax.getNumOccurrences())
    ProfileInlineLimitMax = 50000;
}
83 std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
84 std::vector<StringRef> Order;
85 // Trim cold edges to get a more stable call graph. This allows for a more
86 // stable top-down order which in turns helps the stablity of the generated
87 // profile from run to run.
88 uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
89 (Summary->getDetailedSummary()));
90 ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold);
92 // Now that we have a profiled call graph, construct top-down order
93 // by building up SCC and reversing SCC order.
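  // Note: scc_iterator visits SCCs in bottom-up (reverse topological) order,
  // i.e., callee SCCs before their callers, so reversing the collected list
  // below yields a callers-first, top-down order.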
94 scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
95 while (!I.isAtEnd()) {
96 auto Range = *I;
97 if (SortProfiledSCC) {
98 // Sort nodes in one SCC based on callsite hotness.
99 scc_member_iterator<ProfiledCallGraph *> SI(*I);
100 Range = *SI;
102 for (auto *Node : Range) {
103 if (Node != ProfiledCG.getEntryNode())
104 Order.push_back(Node->Name);
106 ++I;
108 std::reverse(Order.begin(), Order.end());
110 return Order;
bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
                                       const FunctionSamples *CallerSamples) {
  assert(CallerSamples && "Expect non-null caller samples");

  // Ideally we want to consider everything a function calls, but as far as
  // the context profile is concerned, only those frames that are children of
  // the current one in the trie are relevant. So we walk the trie instead of
  // the call targets from the function profile.
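  // For example, for the context [main @ foo], foo's trie node has one child
  // per callee that was sampled when called from main through foo; those
  // children are exactly the inline candidates considered here.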
  ContextTrieNode *CallerNode =
      ContextTracker.getContextNodeForProfile(CallerSamples);

  bool HasNewCandidate = false;
  for (auto &Child : CallerNode->getAllChildContext()) {
    ContextTrieNode *CalleeNode = &Child.second;
    FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
    if (!CalleeSamples)
      continue;

    // The call site count is more reliable, so we look up the corresponding
    // call target profile in the caller's context profile to retrieve the
    // call site count.
    uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
    uint64_t CallsiteCount = 0;
    LineLocation Callsite = CalleeNode->getCallSiteLoc();
    if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
      SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
      auto It = TargetCounts.find(CalleeSamples->getName());
      if (It != TargetCounts.end())
        CallsiteCount = It->second;
    }

    // TODO: Call site and callee entry counts should be mostly consistent;
    // add a check for that.
    HasNewCandidate = true;
    uint32_t CalleeSize = getFuncSize(CalleeNode);
    CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
                   CalleeSize);
  }

  return HasNewCandidate;
}
uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
  if (UseContextCost)
    return Binary.getFuncSizeForContext(ContextNode);
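  // Without context-sensitive byte sizes from the binary, fall back to the
  // number of profiled body samples as a rough proxy for function size.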
  return ContextNode->getFunctionSamples()->getBodySamples().size();
}
bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
  bool WasInlined =
      Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
  // If replay inline is requested, simply follow the inline decisions of the
  // profiled binary.
  if (SamplePreInlineReplay)
    return WasInlined;

  unsigned int SampleThreshold = SampleColdCallSiteThreshold;
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));

  if (Candidate.CallsiteCount <= ColdCountThreshold)
    SampleThreshold = SampleColdCallSiteThreshold;
  else {
    // Linearly adjust the threshold based on normalized hotness, i.e., a
    // value in [0,1]. Use the 10% cutoff instead of the max count as the
    // normalization upper bound for stability.
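    // Percentile cutoffs are expressed on ProfileSummary's scale of 1000000,
    // so a cutoff of 100000 selects the minimum count covering the hottest
    // 10% of total samples.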
    double NormalizationUpperBound =
        ProfileSummaryBuilder::getEntryForPercentile(
            Summary->getDetailedSummary(), 100000 /* 10% */)
            .MinCount;
    double NormalizationLowerBound = ColdCountThreshold;
    double NormalizedHotness =
        (Candidate.CallsiteCount - NormalizationLowerBound) /
        (NormalizationUpperBound - NormalizationLowerBound);
    if (NormalizedHotness > 1.0)
      NormalizedHotness = 1.0;
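    // E.g., with ColdCountThreshold = 100 and a 10% cutoff count of 10100
    // (illustrative numbers only), a callsite count of 5100 normalizes to
    // (5100 - 100) / (10100 - 100) = 0.5.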
    // Add 1 to ensure hot callsites get a non-zero threshold, which could
    // otherwise happen when SampleColdCallSiteThreshold is 0, i.e., when we
    // do not want any inlining for cold callsites.
    SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
                      SampleColdCallSiteThreshold + 1;
    // Bump up the threshold to favor the previous compiler inline decision.
    // The compiler has more insight and knowledge about functions based on
    // their IR and attributes and should be able to make a more reasonable
    // inline decision.
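    // With the default multiplier of 100, a callsite the compiler already
    // chose to inline is very unlikely to be rejected here on size grounds.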
    if (WasInlined)
      SampleThreshold *= CSPreinlMultiplierForPrevInl;
  }

  return (Candidate.SizeCost < SampleThreshold);
}
void CSPreInliner::processFunction(const StringRef Name) {
  FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
  if (!FSamples)
    return;

  unsigned FuncSize =
      getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
  unsigned FuncFinalSize = FuncSize;
  unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
  SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
  SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
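  // E.g., with a growth limit of 12 (illustrative value), a pre-inline size
  // of 100 gives a raw budget of 1200, which is then clamped into
  // [ProfileInlineLimitMin, ProfileInlineLimitMax].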
217 LLVM_DEBUG(dbgs() << "Process " << Name
218 << " for context-sensitive pre-inlining (pre-inline size: "
219 << FuncSize << ", size limit: " << SizeLimit << ")\n");
221 ProfiledCandidateQueue CQueue;
222 getInlineCandidates(CQueue, FSamples);
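  // Work the candidate queue greedily: each candidate accepted by
  // shouldInline charges its size against the budget and exposes its own
  // callees as new candidates; rejected candidates keep their context
  // profiles outlined.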
  while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
    ProfiledInlineCandidate Candidate = CQueue.top();
    CQueue.pop();
    bool ShouldInline = false;
    if ((ShouldInline = shouldInline(Candidate))) {
      // We mark the context as inlined as the corresponding context profile
      // won't be merged into that function's base profile.
      ++PreInlNumCSInlined;
      ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
      Candidate.CalleeSamples->getContext().setAttribute(
          ContextShouldBeInlined);
      FuncFinalSize += Candidate.SizeCost;
      getInlineCandidates(CQueue, Candidate.CalleeSamples);
    } else {
      ++PreInlNumCSNotInlined;
    }
    LLVM_DEBUG(
        dbgs() << (ShouldInline ? " Inlined" : " Outlined")
               << " context profile for: "
               << ContextTracker.getContextString(*Candidate.CalleeSamples)
               << " (callee size: " << Candidate.SizeCost
               << ", call count:" << Candidate.CallsiteCount << ")\n");
  }

  if (!CQueue.empty()) {
    if (SizeLimit == (unsigned)ProfileInlineLimitMax)
      ++PreInlNumCSInlinedHitMaxLimit;
    else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
      ++PreInlNumCSInlinedHitMinLimit;
    else
      ++PreInlNumCSInlinedHitGrowthLimit;
  }

  LLVM_DEBUG({
    if (!CQueue.empty())
      dbgs() << " Inline candidates ignored due to size limit (inliner "
                "original size: "
             << FuncSize << ", inliner final size: " << FuncFinalSize
             << ", size limit: " << SizeLimit << ")\n";

    while (!CQueue.empty()) {
      ProfiledInlineCandidate Candidate = CQueue.top();
      CQueue.pop();
      bool WasInlined =
          Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
      dbgs() << " "
             << ContextTracker.getContextString(*Candidate.CalleeSamples)
             << " (candidate size:" << Candidate.SizeCost
             << ", call count: " << Candidate.CallsiteCount << ", previously "
             << (WasInlined ? "inlined)\n" : "not inlined)\n");
    }
  });
}
void CSPreInliner::run() {
#ifndef NDEBUG
  auto printProfileNames = [](SampleContextTracker &ContextTracker,
                              bool IsInput) {
    uint32_t Size = 0;
    for (auto *Node : ContextTracker) {
      FunctionSamples *FSamples = Node->getFunctionSamples();
      if (FSamples) {
        Size++;
        dbgs() << " [" << ContextTracker.getContextString(Node) << "] "
               << FSamples->getTotalSamples() << ":"
               << FSamples->getHeadSamples() << "\n";
      }
    }
    dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
           << Size << " total):\n";
  };
#endif
  LLVM_DEBUG(printProfileNames(ContextTracker, true));

  // Execute the global pre-inliner to estimate a global top-down inline
  // decision and merge profiles accordingly. This helps with profile merging
  // for ThinLTO, since otherwise we would not be able to merge profiles back
  // into base profiles across module/thin-backend boundaries.
  // It also helps better compress context profiles to control profile size,
  // as we now only need context profiles for functions that are going to be
  // inlined.
  for (StringRef FuncName : buildTopDownOrder()) {
    processFunction(FuncName);
  }
  // Context profiles that were not inlined are merged into their base
  // profiles, so we can trim such profiles from the output.
  for (auto *Node : ContextTracker) {
    FunctionSamples *FProfile = Node->getFunctionSamples();
    if (FProfile &&
        (Node->getParentContext() != &ContextTracker.getRootContext() &&
         !FProfile->getContext().hasState(InlinedContext))) {
      Node->setFunctionSamples(nullptr);
    }
  }
  FunctionSamples::ProfileIsPreInlined = true;

  LLVM_DEBUG(printProfileNames(ContextTracker, false));
}