Revert "[lldb][test] Remove compiler version check and use regex" (#124101)
[llvm-project.git] / llvm / tools / llvm-profgen / ProfileGenerator.cpp
blobb47c77c5f2ff3ff2720951d9f9eaf8bc5aff7432
1 //===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "ProfileGenerator.h"
9 #include "ErrorHandling.h"
10 #include "MissingFrameInferrer.h"
11 #include "PerfReader.h"
12 #include "ProfiledBinary.h"
13 #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
14 #include "llvm/ProfileData/ProfileCommon.h"
15 #include <algorithm>
16 #include <float.h>
17 #include <unordered_set>
18 #include <utility>
20 cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
21 cl::Required,
22 cl::desc("Output profile file"));
23 static cl::alias OutputA("o", cl::desc("Alias for --output"),
24 cl::aliasopt(OutputFilename));
26 static cl::opt<SampleProfileFormat> OutputFormat(
27 "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary),
28 cl::values(
29 clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"),
30 clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"),
31 clEnumValN(SPF_Text, "text", "Text encoding"),
32 clEnumValN(SPF_GCC, "gcc",
33 "GCC encoding (only meaningful for -sample)")));
35 static cl::opt<bool> UseMD5(
36 "use-md5", cl::Hidden,
37 cl::desc("Use md5 to represent function names in the output profile (only "
38 "meaningful for -extbinary)"));
40 static cl::opt<bool> PopulateProfileSymbolList(
41 "populate-profile-symbol-list", cl::init(false), cl::Hidden,
42 cl::desc("Populate profile symbol list (only meaningful for -extbinary)"));
44 static cl::opt<bool> FillZeroForAllFuncs(
45 "fill-zero-for-all-funcs", cl::init(false), cl::Hidden,
46 cl::desc("Attribute all functions' range with zero count "
47 "even it's not hit by any samples."));
49 static cl::opt<int32_t, true> RecursionCompression(
50 "compress-recursion",
51 cl::desc("Compressing recursion by deduplicating adjacent frame "
52 "sequences up to the specified size. -1 means no size limit."),
53 cl::Hidden,
54 cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize));
56 static cl::opt<bool>
57 TrimColdProfile("trim-cold-profile",
58 cl::desc("If the total count of the profile is smaller "
59 "than threshold, it will be trimmed."));
61 static cl::opt<bool> CSProfMergeColdContext(
62 "csprof-merge-cold-context", cl::init(true),
63 cl::desc("If the total count of context profile is smaller than "
64 "the threshold, it will be merged into context-less base "
65 "profile."));
67 static cl::opt<uint32_t> CSProfMaxColdContextDepth(
68 "csprof-max-cold-context-depth", cl::init(1),
69 cl::desc("Keep the last K contexts while merging cold profile. 1 means the "
70 "context-less base profile"));
72 static cl::opt<int, true> CSProfMaxContextDepth(
73 "csprof-max-context-depth",
74 cl::desc("Keep the last K contexts while merging profile. -1 means no "
75 "depth limit."),
76 cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
78 static cl::opt<double> ProfileDensityThreshold(
79 "profile-density-threshold", llvm::cl::init(50),
80 llvm::cl::desc("If the profile density is below the given threshold, it "
81 "will be suggested to increase the sampling rate."),
82 llvm::cl::Optional);
83 static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
84 llvm::cl::desc("show profile density details"),
85 llvm::cl::Optional);
86 static cl::opt<int> ProfileDensityCutOffHot(
87 "profile-density-cutoff-hot", llvm::cl::init(990000),
88 llvm::cl::desc("Total samples cutoff for functions used to calculate "
89 "profile density."));
91 static cl::opt<bool> UpdateTotalSamples(
92 "update-total-samples", llvm::cl::init(false),
93 llvm::cl::desc(
94 "Update total samples by accumulating all its body samples."),
95 llvm::cl::Optional);
97 static cl::opt<bool> GenCSNestedProfile(
98 "gen-cs-nested-profile", cl::Hidden, cl::init(true),
99 cl::desc("Generate nested function profiles for CSSPGO"));
101 cl::opt<bool> InferMissingFrames(
102 "infer-missing-frames", llvm::cl::init(true),
103 llvm::cl::desc(
104 "Infer missing call frames due to compiler tail call elimination."),
105 llvm::cl::Optional);
107 using namespace llvm;
108 using namespace sampleprof;
110 namespace llvm {
111 extern cl::opt<int> ProfileSummaryCutoffHot;
112 extern cl::opt<bool> UseContextLessSummary;
114 namespace sampleprof {
116 // Initialize the MaxCompressionSize to -1 which means no size limit
117 int32_t CSProfileGenerator::MaxCompressionSize = -1;
119 int CSProfileGenerator::MaxContextDepth = -1;
121 bool ProfileGeneratorBase::UseFSDiscriminator = false;
123 std::unique_ptr<ProfileGeneratorBase>
124 ProfileGeneratorBase::create(ProfiledBinary *Binary,
125 const ContextSampleCounterMap *SampleCounters,
126 bool ProfileIsCS) {
127 std::unique_ptr<ProfileGeneratorBase> Generator;
128 if (ProfileIsCS) {
129 Generator.reset(new CSProfileGenerator(Binary, SampleCounters));
130 } else {
131 Generator.reset(new ProfileGenerator(Binary, SampleCounters));
133 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
134 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
136 return Generator;
139 std::unique_ptr<ProfileGeneratorBase>
140 ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles,
141 bool ProfileIsCS) {
142 std::unique_ptr<ProfileGeneratorBase> Generator;
143 if (ProfileIsCS) {
144 Generator.reset(new CSProfileGenerator(Binary, Profiles));
145 } else {
146 Generator.reset(new ProfileGenerator(Binary, std::move(Profiles)));
148 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
149 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
151 return Generator;
154 void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer,
155 SampleProfileMap &ProfileMap) {
156 // Populate profile symbol list if extended binary format is used.
157 ProfileSymbolList SymbolList;
159 if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) {
160 Binary->populateSymbolListFromDWARF(SymbolList);
161 Writer->setProfileSymbolList(&SymbolList);
164 if (std::error_code EC = Writer->write(ProfileMap))
165 exitWithError(std::move(EC));
168 void ProfileGeneratorBase::write() {
169 auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat);
170 if (std::error_code EC = WriterOrErr.getError())
171 exitWithError(EC, OutputFilename);
173 if (UseMD5) {
174 if (OutputFormat != SPF_Ext_Binary)
175 WithColor::warning() << "-use-md5 is ignored. Specify "
176 "--format=extbinary to enable it\n";
177 else
178 WriterOrErr.get()->setUseMD5();
181 write(std::move(WriterOrErr.get()), ProfileMap);
184 void ProfileGeneratorBase::showDensitySuggestion(double Density) {
185 if (Density == 0.0)
186 WithColor::warning() << "The output profile is empty or the "
187 "--profile-density-cutoff-hot option is "
188 "set too low. Please check your command.\n";
189 else if (Density < ProfileDensityThreshold)
190 WithColor::warning()
191 << "Sample PGO is estimated to optimize better with "
192 << format("%.1f", ProfileDensityThreshold / Density)
193 << "x more samples. Please consider increasing sampling rate or "
194 "profiling for longer duration to get more samples.\n";
196 if (ShowDensity)
197 outs() << "Functions with density >= " << format("%.1f", Density)
198 << " account for "
199 << format("%.2f",
200 static_cast<double>(ProfileDensityCutOffHot) / 10000)
201 << "% total sample counts.\n";
204 bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) {
205 for (const auto &Prefix : FuncPrefixsToFilter) {
206 if (FS.getFuncName().starts_with(Prefix))
207 return true;
210 // Filter the function profiles for the inlinees. It's useful for fuzzy
211 // profile matching which flattens the profile and inlinees' samples are
212 // merged into top-level function.
213 for (auto &Callees :
214 const_cast<CallsiteSampleMap &>(FS.getCallsiteSamples())) {
215 auto &CalleesMap = Callees.second;
216 for (auto I = CalleesMap.begin(); I != CalleesMap.end();) {
217 auto FS = I++;
218 if (filterAmbiguousProfile(FS->second))
219 CalleesMap.erase(FS);
222 return false;
225 // For built-in local initialization function such as __cxx_global_var_init,
226 // __tls_init prefix function, there could be multiple versions of the functions
227 // in the final binary. However, in the profile generation, we call
228 // getCanonicalFnName to canonicalize the names which strips the suffixes.
229 // Therefore, samples from different functions queries the same profile and the
230 // samples are merged. As the functions are essentially different, entries of
231 // the merged profile are ambiguous. In sample loader, the IR from one version
232 // would be attributed towards a merged entries, which is inaccurate. Especially
233 // for fuzzy profile matching, it gets multiple callsites(from different
234 // function) but used to match one callsite, which misleads the matching and
235 // causes a lot of false positives report. Hence, we want to filter them out
236 // from the profile map during the profile generation time. The profiles are all
237 // cold functions, it won't have perf impact.
238 void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) {
239 for (auto I = ProfileMap.begin(); I != ProfileMap.end();) {
240 auto FS = I++;
241 if (filterAmbiguousProfile(FS->second))
242 ProfileMap.erase(FS);
246 void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
247 const RangeSample &Ranges) {
250 Regions may overlap with each other. Using the boundary info, find all
251 disjoint ranges and their sample count. BoundaryPoint contains the count
252 multiple samples begin/end at this points.
254 |<--100-->| Sample1
255 |<------200------>| Sample2
256 A B C
258 In the example above,
259 Sample1 begins at A, ends at B, its value is 100.
260 Sample2 beings at A, ends at C, its value is 200.
261 For A, BeginCount is the sum of sample begins at A, which is 300 and no
262 samples ends at A, so EndCount is 0.
263 Then boundary points A, B, and C with begin/end counts are:
264 A: (300, 0)
265 B: (0, 100)
266 C: (0, 200)
268 struct BoundaryPoint {
269 // Sum of sample counts beginning at this point
270 uint64_t BeginCount = UINT64_MAX;
271 // Sum of sample counts ending at this point
272 uint64_t EndCount = UINT64_MAX;
273 // Is the begin point of a zero range.
274 bool IsZeroRangeBegin = false;
275 // Is the end point of a zero range.
276 bool IsZeroRangeEnd = false;
278 void addBeginCount(uint64_t Count) {
279 if (BeginCount == UINT64_MAX)
280 BeginCount = 0;
281 BeginCount += Count;
284 void addEndCount(uint64_t Count) {
285 if (EndCount == UINT64_MAX)
286 EndCount = 0;
287 EndCount += Count;
292 For the above example. With boundary points, follwing logic finds two
293 disjoint region of
295 [A,B]: 300
296 [B+1,C]: 200
298 If there is a boundary point that both begin and end, the point itself
299 becomes a separate disjoint region. For example, if we have original
300 ranges of
302 |<--- 100 --->|
303 |<--- 200 --->|
304 A B C
306 there are three boundary points with their begin/end counts of
308 A: (100, 0)
309 B: (200, 100)
310 C: (0, 200)
312 the disjoint ranges would be
314 [A, B-1]: 100
315 [B, B]: 300
316 [B+1, C]: 200.
318 Example for zero value range:
320 |<--- 100 --->|
321 |<--- 200 --->|
322 |<--------------- 0 ----------------->|
323 A B C D E F
325 [A, B-1] : 0
326 [B, C] : 100
327 [C+1, D-1]: 0
328 [D, E] : 200
329 [E+1, F] : 0
331 std::map<uint64_t, BoundaryPoint> Boundaries;
333 for (const auto &Item : Ranges) {
334 assert(Item.first.first <= Item.first.second &&
335 "Invalid instruction range");
336 auto &BeginPoint = Boundaries[Item.first.first];
337 auto &EndPoint = Boundaries[Item.first.second];
338 uint64_t Count = Item.second;
340 BeginPoint.addBeginCount(Count);
341 EndPoint.addEndCount(Count);
342 if (Count == 0) {
343 BeginPoint.IsZeroRangeBegin = true;
344 EndPoint.IsZeroRangeEnd = true;
348 // Use UINT64_MAX to indicate there is no existing range between BeginAddress
349 // and the next valid address
350 uint64_t BeginAddress = UINT64_MAX;
351 int ZeroRangeDepth = 0;
352 uint64_t Count = 0;
353 for (const auto &Item : Boundaries) {
354 uint64_t Address = Item.first;
355 const BoundaryPoint &Point = Item.second;
356 if (Point.BeginCount != UINT64_MAX) {
357 if (BeginAddress != UINT64_MAX)
358 DisjointRanges[{BeginAddress, Address - 1}] = Count;
359 Count += Point.BeginCount;
360 BeginAddress = Address;
361 ZeroRangeDepth += Point.IsZeroRangeBegin;
363 if (Point.EndCount != UINT64_MAX) {
364 assert((BeginAddress != UINT64_MAX) &&
365 "First boundary point cannot be 'end' point");
366 DisjointRanges[{BeginAddress, Address}] = Count;
367 assert(Count >= Point.EndCount && "Mismatched live ranges");
368 Count -= Point.EndCount;
369 BeginAddress = Address + 1;
370 ZeroRangeDepth -= Point.IsZeroRangeEnd;
371 // If the remaining count is zero and it's no longer in a zero range, this
372 // means we consume all the ranges before, thus mark BeginAddress as
373 // UINT64_MAX. e.g. supposing we have two non-overlapping ranges:
374 // [<---- 10 ---->]
375 // [<---- 20 ---->]
376 // A B C D
377 // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't
378 // have the [B+1, C-1] zero range.
379 if (Count == 0 && ZeroRangeDepth == 0)
380 BeginAddress = UINT64_MAX;
385 void ProfileGeneratorBase::updateBodySamplesforFunctionProfile(
386 FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc,
387 uint64_t Count) {
388 // Use the maximum count of samples with same line location
389 uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator);
391 // Use duplication factor to compensated for loop unroll/vectorization.
392 // Note that this is only needed when we're taking MAX of the counts at
393 // the location instead of SUM.
394 Count *= getDuplicationFactor(LeafLoc.Location.Discriminator);
396 ErrorOr<uint64_t> R =
397 FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator);
399 uint64_t PreviousCount = R ? R.get() : 0;
400 if (PreviousCount <= Count) {
401 FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator,
402 Count - PreviousCount);
406 void ProfileGeneratorBase::updateTotalSamples() {
407 for (auto &Item : ProfileMap) {
408 FunctionSamples &FunctionProfile = Item.second;
409 FunctionProfile.updateTotalSamples();
413 void ProfileGeneratorBase::updateCallsiteSamples() {
414 for (auto &Item : ProfileMap) {
415 FunctionSamples &FunctionProfile = Item.second;
416 FunctionProfile.updateCallsiteSamples();
420 void ProfileGeneratorBase::updateFunctionSamples() {
421 updateCallsiteSamples();
423 if (UpdateTotalSamples)
424 updateTotalSamples();
427 void ProfileGeneratorBase::collectProfiledFunctions() {
428 std::unordered_set<const BinaryFunction *> ProfiledFunctions;
429 if (collectFunctionsFromRawProfile(ProfiledFunctions))
430 Binary->setProfiledFunctions(ProfiledFunctions);
431 else if (collectFunctionsFromLLVMProfile(ProfiledFunctions))
432 Binary->setProfiledFunctions(ProfiledFunctions);
433 else
434 llvm_unreachable("Unsupported input profile");
437 bool ProfileGeneratorBase::collectFunctionsFromRawProfile(
438 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
439 if (!SampleCounters)
440 return false;
441 // Go through all the stacks, ranges and branches in sample counters, use
442 // the start of the range to look up the function it belongs and record the
443 // function.
444 for (const auto &CI : *SampleCounters) {
445 if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(CI.first.getPtr())) {
446 for (auto StackAddr : CtxKey->Context) {
447 if (FuncRange *FRange = Binary->findFuncRange(StackAddr))
448 ProfiledFunctions.insert(FRange->Func);
452 for (auto Item : CI.second.RangeCounter) {
453 uint64_t StartAddress = Item.first.first;
454 if (FuncRange *FRange = Binary->findFuncRange(StartAddress))
455 ProfiledFunctions.insert(FRange->Func);
458 for (auto Item : CI.second.BranchCounter) {
459 uint64_t SourceAddress = Item.first.first;
460 uint64_t TargetAddress = Item.first.second;
461 if (FuncRange *FRange = Binary->findFuncRange(SourceAddress))
462 ProfiledFunctions.insert(FRange->Func);
463 if (FuncRange *FRange = Binary->findFuncRange(TargetAddress))
464 ProfiledFunctions.insert(FRange->Func);
467 return true;
470 bool ProfileGenerator::collectFunctionsFromLLVMProfile(
471 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
472 for (const auto &FS : ProfileMap) {
473 if (auto *Func = Binary->getBinaryFunction(FS.second.getFunction()))
474 ProfiledFunctions.insert(Func);
476 return true;
479 bool CSProfileGenerator::collectFunctionsFromLLVMProfile(
480 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
481 for (auto *Node : ContextTracker) {
482 if (!Node->getFuncName().empty())
483 if (auto *Func = Binary->getBinaryFunction(Node->getFuncName()))
484 ProfiledFunctions.insert(Func);
486 return true;
489 FunctionSamples &
490 ProfileGenerator::getTopLevelFunctionProfile(FunctionId FuncName) {
491 SampleContext Context(FuncName);
492 return ProfileMap.create(Context);
495 void ProfileGenerator::generateProfile() {
496 collectProfiledFunctions();
498 if (Binary->usePseudoProbes())
499 Binary->decodePseudoProbe();
501 if (SampleCounters) {
502 if (Binary->usePseudoProbes()) {
503 generateProbeBasedProfile();
504 } else {
505 generateLineNumBasedProfile();
509 postProcessProfiles();
512 void ProfileGenerator::postProcessProfiles() {
513 computeSummaryAndThreshold(ProfileMap);
514 trimColdProfiles(ProfileMap, ColdCountThreshold);
515 filterAmbiguousProfile(ProfileMap);
516 calculateAndShowDensity(ProfileMap);
519 void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles,
520 uint64_t ColdCntThreshold) {
521 if (!TrimColdProfile)
522 return;
524 // Move cold profiles into a tmp container.
525 std::vector<hash_code> ColdProfileHashes;
526 for (const auto &I : ProfileMap) {
527 if (I.second.getTotalSamples() < ColdCntThreshold)
528 ColdProfileHashes.emplace_back(I.first);
531 // Remove the cold profile from ProfileMap.
532 for (const auto &I : ColdProfileHashes)
533 ProfileMap.erase(I);
536 void ProfileGenerator::generateLineNumBasedProfile() {
537 assert(SampleCounters->size() == 1 &&
538 "Must have one entry for profile generation.");
539 const SampleCounter &SC = SampleCounters->begin()->second;
540 // Fill in function body samples
541 populateBodySamplesForAllFunctions(SC.RangeCounter);
542 // Fill in boundary sample counts as well as call site samples for calls
543 populateBoundarySamplesForAllFunctions(SC.BranchCounter);
545 updateFunctionSamples();
548 void ProfileGenerator::generateProbeBasedProfile() {
549 assert(SampleCounters->size() == 1 &&
550 "Must have one entry for profile generation.");
551 // Enable pseudo probe functionalities in SampleProf
552 FunctionSamples::ProfileIsProbeBased = true;
553 const SampleCounter &SC = SampleCounters->begin()->second;
554 // Fill in function body samples
555 populateBodySamplesWithProbesForAllFunctions(SC.RangeCounter);
556 // Fill in boundary sample counts as well as call site samples for calls
557 populateBoundarySamplesWithProbesForAllFunctions(SC.BranchCounter);
559 updateFunctionSamples();
562 void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions(
563 const RangeSample &RangeCounter) {
564 ProbeCounterMap ProbeCounter;
565 // preprocessRangeCounter returns disjoint ranges, so no longer to redo it
566 // inside extractProbesFromRange.
567 extractProbesFromRange(preprocessRangeCounter(RangeCounter), ProbeCounter,
568 false);
570 for (const auto &PI : ProbeCounter) {
571 const MCDecodedPseudoProbe *Probe = PI.first;
572 uint64_t Count = PI.second;
573 SampleContextFrameVector FrameVec;
574 Binary->getInlineContextForProbe(Probe, FrameVec, true);
575 FunctionSamples &FunctionProfile =
576 getLeafProfileAndAddTotalSamples(FrameVec, Count);
577 FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(),
578 Count);
579 if (Probe->isEntry())
580 FunctionProfile.addHeadSamples(Count);
584 void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions(
585 const BranchSample &BranchCounters) {
586 for (const auto &Entry : BranchCounters) {
587 uint64_t SourceAddress = Entry.first.first;
588 uint64_t TargetAddress = Entry.first.second;
589 uint64_t Count = Entry.second;
590 assert(Count != 0 && "Unexpected zero weight branch");
592 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
593 if (CalleeName.size() == 0)
594 continue;
596 const MCDecodedPseudoProbe *CallProbe =
597 Binary->getCallProbeForAddr(SourceAddress);
598 if (CallProbe == nullptr)
599 continue;
601 // Record called target sample and its count.
602 SampleContextFrameVector FrameVec;
603 Binary->getInlineContextForProbe(CallProbe, FrameVec, true);
605 if (!FrameVec.empty()) {
606 FunctionSamples &FunctionProfile =
607 getLeafProfileAndAddTotalSamples(FrameVec, 0);
608 FunctionProfile.addCalledTargetSamples(
609 FrameVec.back().Location.LineOffset,
610 FrameVec.back().Location.Discriminator,
611 FunctionId(CalleeName), Count);
616 FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples(
617 const SampleContextFrameVector &FrameVec, uint64_t Count) {
618 // Get top level profile
619 FunctionSamples *FunctionProfile =
620 &getTopLevelFunctionProfile(FrameVec[0].Func);
621 FunctionProfile->addTotalSamples(Count);
622 if (Binary->usePseudoProbes()) {
623 const auto *FuncDesc = Binary->getFuncDescForGUID(
624 FunctionProfile->getFunction().getHashCode());
625 FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
628 for (size_t I = 1; I < FrameVec.size(); I++) {
629 LineLocation Callsite(
630 FrameVec[I - 1].Location.LineOffset,
631 getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator));
632 FunctionSamplesMap &SamplesMap =
633 FunctionProfile->functionSamplesAt(Callsite);
634 auto Ret =
635 SamplesMap.emplace(FrameVec[I].Func, FunctionSamples());
636 if (Ret.second) {
637 SampleContext Context(FrameVec[I].Func);
638 Ret.first->second.setContext(Context);
640 FunctionProfile = &Ret.first->second;
641 FunctionProfile->addTotalSamples(Count);
642 if (Binary->usePseudoProbes()) {
643 const auto *FuncDesc = Binary->getFuncDescForGUID(
644 FunctionProfile->getFunction().getHashCode());
645 FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
649 return *FunctionProfile;
652 RangeSample
653 ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) {
654 RangeSample Ranges(RangeCounter.begin(), RangeCounter.end());
655 if (FillZeroForAllFuncs) {
656 for (auto &FuncI : Binary->getAllBinaryFunctions()) {
657 for (auto &R : FuncI.second.Ranges) {
658 Ranges[{R.first, R.second - 1}] += 0;
661 } else {
662 // For each range, we search for all ranges of the function it belongs to
663 // and initialize it with zero count, so it remains zero if doesn't hit any
664 // samples. This is to be consistent with compiler that interpret zero count
665 // as unexecuted(cold).
666 for (const auto &I : RangeCounter) {
667 uint64_t StartAddress = I.first.first;
668 for (const auto &Range : Binary->getRanges(StartAddress))
669 Ranges[{Range.first, Range.second - 1}] += 0;
672 RangeSample DisjointRanges;
673 findDisjointRanges(DisjointRanges, Ranges);
674 return DisjointRanges;
677 void ProfileGenerator::populateBodySamplesForAllFunctions(
678 const RangeSample &RangeCounter) {
679 for (const auto &Range : preprocessRangeCounter(RangeCounter)) {
680 uint64_t RangeBegin = Range.first.first;
681 uint64_t RangeEnd = Range.first.second;
682 uint64_t Count = Range.second;
684 InstructionPointer IP(Binary, RangeBegin, true);
685 // Disjoint ranges may have range in the middle of two instr,
686 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
687 // can be Addr1+1 to Addr2-1. We should ignore such range.
688 if (IP.Address > RangeEnd)
689 continue;
691 do {
692 const SampleContextFrameVector FrameVec =
693 Binary->getFrameLocationStack(IP.Address);
694 if (!FrameVec.empty()) {
695 // FIXME: As accumulating total count per instruction caused some
696 // regression, we changed to accumulate total count per byte as a
697 // workaround. Tuning hotness threshold on the compiler side might be
698 // necessary in the future.
699 FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples(
700 FrameVec, Count * Binary->getInstSize(IP.Address));
701 updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(),
702 Count);
704 } while (IP.advance() && IP.Address <= RangeEnd);
708 StringRef
709 ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) {
710 // Get the function range by branch target if it's a call branch.
711 auto *FRange = Binary->findFuncRangeForStartAddr(TargetAddress);
713 // We won't accumulate sample count for a range whose start is not the real
714 // function entry such as outlined function or inner labels.
715 if (!FRange || !FRange->IsFuncEntry)
716 return StringRef();
718 return FunctionSamples::getCanonicalFnName(FRange->getFuncName());
721 void ProfileGenerator::populateBoundarySamplesForAllFunctions(
722 const BranchSample &BranchCounters) {
723 for (const auto &Entry : BranchCounters) {
724 uint64_t SourceAddress = Entry.first.first;
725 uint64_t TargetAddress = Entry.first.second;
726 uint64_t Count = Entry.second;
727 assert(Count != 0 && "Unexpected zero weight branch");
729 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
730 if (CalleeName.size() == 0)
731 continue;
732 // Record called target sample and its count.
733 const SampleContextFrameVector &FrameVec =
734 Binary->getCachedFrameLocationStack(SourceAddress);
735 if (!FrameVec.empty()) {
736 FunctionSamples &FunctionProfile =
737 getLeafProfileAndAddTotalSamples(FrameVec, 0);
738 FunctionProfile.addCalledTargetSamples(
739 FrameVec.back().Location.LineOffset,
740 getBaseDiscriminator(FrameVec.back().Location.Discriminator),
741 FunctionId(CalleeName), Count);
743 // Add head samples for callee.
744 FunctionSamples &CalleeProfile =
745 getTopLevelFunctionProfile(FunctionId(CalleeName));
746 CalleeProfile.addHeadSamples(Count);
750 void ProfileGeneratorBase::calculateBodySamplesAndSize(
751 const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
752 uint64_t &FuncBodySize) {
753 // Note that ideally the size should be the number of function instruction.
754 // However, for probe-based profile, we don't have the accurate instruction
755 // count for each probe, instead, the probe sample is the samples count for
756 // the block, which is equivelant to
757 // total_instruction_samples/num_of_instruction in one block. Hence, we use
758 // the number of probe as a proxy for the function's size.
759 FuncBodySize += FSamples.getBodySamples().size();
761 // The accumulated body samples re-calculated here could be different from the
762 // TotalSamples(getTotalSamples) field of FunctionSamples for line-number
763 // based profile. The reason is that TotalSamples is the sum of all the
764 // samples of the machine instruction in one source-code line, however, the
765 // entry of Bodysamples is the only max number of them, so the TotalSamples is
766 // usually much bigger than the accumulated body samples as one souce-code
767 // line can emit many machine instructions. We observed a regression when we
768 // switched to use the accumulated body samples(by using
769 // -update-total-samples). Hence, it's safer to re-calculate here to avoid
770 // such discrepancy. There is no problem for probe-based profile, as the
771 // TotalSamples is exactly the same as the accumulated body samples.
772 for (const auto &I : FSamples.getBodySamples())
773 TotalBodySamples += I.second.getSamples();
775 for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
776 for (const auto &Callee : CallsiteSamples.second) {
777 // For binary-level density, the inlinees' samples and size should be
778 // included in the calculation.
779 calculateBodySamplesAndSize(Callee.second, TotalBodySamples,
780 FuncBodySize);
784 // Calculate Profile-density:
785 // Calculate the density for each function and sort them in descending order,
786 // keep accumulating their total samples unitl it exceeds the
787 // percentage_threshold(cut-off) of total profile samples, the profile-density
788 // is the last(minimum) function-density of the processed functions, which means
789 // all the functions hot to perf are on good density if the profile-density is
790 // good. The percentage_threshold(--profile-density-cutoff-hot) is configurable
791 // depending on how much regression the system want to tolerate.
792 double
793 ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
794 double ProfileDensity = 0.0;
796 uint64_t TotalProfileSamples = 0;
797 // A list of the function profile density and its total samples.
798 std::vector<std::pair<double, uint64_t>> FuncDensityList;
799 for (const auto &I : Profiles) {
800 uint64_t TotalBodySamples = 0;
801 uint64_t FuncBodySize = 0;
802 calculateBodySamplesAndSize(I.second, TotalBodySamples, FuncBodySize);
804 if (FuncBodySize == 0)
805 continue;
807 double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
808 TotalProfileSamples += TotalBodySamples;
809 FuncDensityList.emplace_back(FuncDensity, TotalBodySamples);
812 // Sorted by the density in descending order.
813 llvm::stable_sort(FuncDensityList, [&](const std::pair<double, uint64_t> &A,
814 const std::pair<double, uint64_t> &B) {
815 if (A.first != B.first)
816 return A.first > B.first;
817 return A.second < B.second;
820 uint64_t AccumulatedSamples = 0;
821 uint32_t I = 0;
822 assert(ProfileDensityCutOffHot <= 1000000 &&
823 "The cutoff value is greater than 1000000(100%)");
824 while (AccumulatedSamples < TotalProfileSamples *
825 static_cast<float>(ProfileDensityCutOffHot) /
826 1000000 &&
827 I < FuncDensityList.size()) {
828 AccumulatedSamples += FuncDensityList[I].second;
829 ProfileDensity = FuncDensityList[I].first;
830 I++;
833 return ProfileDensity;
836 void ProfileGeneratorBase::calculateAndShowDensity(
837 const SampleProfileMap &Profiles) {
838 double Density = calculateDensity(Profiles);
839 showDensitySuggestion(Density);
842 FunctionSamples *
843 CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode,
844 bool WasLeafInlined) {
845 FunctionSamples *FProfile = ContextNode->getFunctionSamples();
846 if (!FProfile) {
847 FSamplesList.emplace_back();
848 FProfile = &FSamplesList.back();
849 FProfile->setFunction(ContextNode->getFuncName());
850 ContextNode->setFunctionSamples(FProfile);
852 // Update ContextWasInlined attribute for existing contexts.
853 // The current function can be called in two ways:
854 // - when processing a probe of the current frame
855 // - when processing the entry probe of an inlinee's frame, which
856 // is then used to update the callsite count of the current frame.
857 // The two can happen in any order, hence here we are making sure
858 // `ContextWasInlined` is always set as expected.
859 // TODO: Note that the former does not always happen if no probes of the
860 // current frame has samples, and if the latter happens, we could lose the
861 // attribute. This should be fixed.
862 if (WasLeafInlined)
863 FProfile->getContext().setAttribute(ContextWasInlined);
864 return FProfile;
867 ContextTrieNode *
868 CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context,
869 bool WasLeafInlined) {
870 ContextTrieNode *ContextNode =
871 ContextTracker.getOrCreateContextPath(Context, true);
872 getOrCreateFunctionSamples(ContextNode, WasLeafInlined);
873 return ContextNode;
876 void CSProfileGenerator::generateProfile() {
877 FunctionSamples::ProfileIsCS = true;
879 collectProfiledFunctions();
881 if (Binary->usePseudoProbes()) {
882 Binary->decodePseudoProbe();
883 if (InferMissingFrames)
884 initializeMissingFrameInferrer();
887 if (SampleCounters) {
888 if (Binary->usePseudoProbes()) {
889 generateProbeBasedProfile();
890 } else {
891 generateLineNumBasedProfile();
895 if (Binary->getTrackFuncContextSize())
896 computeSizeForProfiledFunctions();
898 postProcessProfiles();
901 void CSProfileGenerator::initializeMissingFrameInferrer() {
902 Binary->getMissingContextInferrer()->initialize(SampleCounters);
905 void CSProfileGenerator::inferMissingFrames(
906 const SmallVectorImpl<uint64_t> &Context,
907 SmallVectorImpl<uint64_t> &NewContext) {
908 Binary->inferMissingFrames(Context, NewContext);
911 void CSProfileGenerator::computeSizeForProfiledFunctions() {
912 for (auto *Func : Binary->getProfiledFunctions())
913 Binary->computeInlinedContextSizeForFunc(Func);
915 // Flush the symbolizer to save memory.
916 Binary->flushSymbolizer();
919 void CSProfileGenerator::updateFunctionSamples() {
920 for (auto *Node : ContextTracker) {
921 FunctionSamples *FSamples = Node->getFunctionSamples();
922 if (FSamples) {
923 if (UpdateTotalSamples)
924 FSamples->updateTotalSamples();
925 FSamples->updateCallsiteSamples();
930 void CSProfileGenerator::generateLineNumBasedProfile() {
931 for (const auto &CI : *SampleCounters) {
932 const auto *CtxKey = cast<StringBasedCtxKey>(CI.first.getPtr());
934 ContextTrieNode *ContextNode = &getRootContext();
935 // Sample context will be empty if the jump is an external-to-internal call
936 // pattern, the head samples should be added for the internal function.
937 if (!CtxKey->Context.empty()) {
938 // Get or create function profile for the range
939 ContextNode =
940 getOrCreateContextNode(CtxKey->Context, CtxKey->WasLeafInlined);
941 // Fill in function body samples
942 populateBodySamplesForFunction(*ContextNode->getFunctionSamples(),
943 CI.second.RangeCounter);
945 // Fill in boundary sample counts as well as call site samples for calls
946 populateBoundarySamplesForFunction(ContextNode, CI.second.BranchCounter);
948 // Fill in call site value sample for inlined calls and also use context to
949 // infer missing samples. Since we don't have call count for inlined
950 // functions, we estimate it from inlinee's profile using the entry of the
951 // body sample.
952 populateInferredFunctionSamples(getRootContext());
954 updateFunctionSamples();
957 void CSProfileGenerator::populateBodySamplesForFunction(
958 FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) {
959 // Compute disjoint ranges first, so we can use MAX
960 // for calculating count for each location.
961 RangeSample Ranges;
962 findDisjointRanges(Ranges, RangeCounter);
963 for (const auto &Range : Ranges) {
964 uint64_t RangeBegin = Range.first.first;
965 uint64_t RangeEnd = Range.first.second;
966 uint64_t Count = Range.second;
967 // Disjoint ranges have introduce zero-filled gap that
968 // doesn't belong to current context, filter them out.
969 if (Count == 0)
970 continue;
972 InstructionPointer IP(Binary, RangeBegin, true);
973 // Disjoint ranges may have range in the middle of two instr,
974 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
975 // can be Addr1+1 to Addr2-1. We should ignore such range.
976 if (IP.Address > RangeEnd)
977 continue;
979 do {
980 auto LeafLoc = Binary->getInlineLeafFrameLoc(IP.Address);
981 if (LeafLoc) {
982 // Recording body sample for this specific context
983 updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count);
984 FunctionProfile.addTotalSamples(Count);
986 } while (IP.advance() && IP.Address <= RangeEnd);
990 void CSProfileGenerator::populateBoundarySamplesForFunction(
991 ContextTrieNode *Node, const BranchSample &BranchCounters) {
993 for (const auto &Entry : BranchCounters) {
994 uint64_t SourceAddress = Entry.first.first;
995 uint64_t TargetAddress = Entry.first.second;
996 uint64_t Count = Entry.second;
997 assert(Count != 0 && "Unexpected zero weight branch");
999 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
1000 if (CalleeName.size() == 0)
1001 continue;
1003 ContextTrieNode *CallerNode = Node;
1004 LineLocation CalleeCallSite(0, 0);
1005 if (CallerNode != &getRootContext()) {
1006 // Record called target sample and its count
1007 auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceAddress);
1008 if (LeafLoc) {
1009 CallerNode->getFunctionSamples()->addCalledTargetSamples(
1010 LeafLoc->Location.LineOffset,
1011 getBaseDiscriminator(LeafLoc->Location.Discriminator),
1012 FunctionId(CalleeName),
1013 Count);
1014 // Record head sample for called target(callee)
1015 CalleeCallSite = LeafLoc->Location;
1019 ContextTrieNode *CalleeNode =
1020 CallerNode->getOrCreateChildContext(CalleeCallSite,
1021 FunctionId(CalleeName));
1022 FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(CalleeNode);
1023 CalleeProfile->addHeadSamples(Count);
1027 void CSProfileGenerator::populateInferredFunctionSamples(
1028 ContextTrieNode &Node) {
1029 // There is no call jmp sample between the inliner and inlinee, we need to use
1030 // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s
1031 // sample depends on child(inlinee)'s sample, so traverse the tree in
1032 // post-order.
1033 for (auto &It : Node.getAllChildContext())
1034 populateInferredFunctionSamples(It.second);
1036 FunctionSamples *CalleeProfile = Node.getFunctionSamples();
1037 if (!CalleeProfile)
1038 return;
1039 // If we already have head sample counts, we must have value profile
1040 // for call sites added already. Skip to avoid double counting.
1041 if (CalleeProfile->getHeadSamples())
1042 return;
1043 ContextTrieNode *CallerNode = Node.getParentContext();
1044 // If we don't have context, nothing to do for caller's call site.
1045 // This could happen for entry point function.
1046 if (CallerNode == &getRootContext())
1047 return;
1049 LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc();
1050 FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(CallerNode);
1051 // Since we don't have call count for inlined functions, we
1052 // estimate it from inlinee's profile using entry body sample.
1053 uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate();
1054 // If we don't have samples with location, use 1 to indicate live.
1055 if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size())
1056 EstimatedCallCount = 1;
1057 CallerProfile.addCalledTargetSamples(CallerLeafFrameLoc.LineOffset,
1058 CallerLeafFrameLoc.Discriminator,
1059 Node.getFuncName(), EstimatedCallCount);
1060 CallerProfile.addBodySamples(CallerLeafFrameLoc.LineOffset,
1061 CallerLeafFrameLoc.Discriminator,
1062 EstimatedCallCount);
1063 CallerProfile.addTotalSamples(EstimatedCallCount);
1066 void CSProfileGenerator::convertToProfileMap(
1067 ContextTrieNode &Node, SampleContextFrameVector &Context) {
1068 FunctionSamples *FProfile = Node.getFunctionSamples();
1069 if (FProfile) {
1070 Context.emplace_back(Node.getFuncName(), LineLocation(0, 0));
1071 // Save the new context for future references.
1072 SampleContextFrames NewContext = *Contexts.insert(Context).first;
1073 auto Ret = ProfileMap.emplace(NewContext, std::move(*FProfile));
1074 FunctionSamples &NewProfile = Ret.first->second;
1075 NewProfile.getContext().setContext(NewContext);
1076 Context.pop_back();
1079 for (auto &It : Node.getAllChildContext()) {
1080 ContextTrieNode &ChildNode = It.second;
1081 Context.emplace_back(Node.getFuncName(), ChildNode.getCallSiteLoc());
1082 convertToProfileMap(ChildNode, Context);
1083 Context.pop_back();
1087 void CSProfileGenerator::convertToProfileMap() {
1088 assert(ProfileMap.empty() &&
1089 "ProfileMap should be empty before converting from the trie");
1090 assert(IsProfileValidOnTrie &&
1091 "Do not convert the trie twice, it's already destroyed");
1093 SampleContextFrameVector Context;
1094 for (auto &It : getRootContext().getAllChildContext())
1095 convertToProfileMap(It.second, Context);
1097 IsProfileValidOnTrie = false;
1100 void CSProfileGenerator::postProcessProfiles() {
1101 // Compute hot/cold threshold based on profile. This will be used for cold
1102 // context profile merging/trimming.
1103 computeSummaryAndThreshold();
1105 // Run global pre-inliner to adjust/merge context profile based on estimated
1106 // inline decisions.
1107 if (EnableCSPreInliner) {
1108 ContextTracker.populateFuncToCtxtMap();
1109 CSPreInliner(ContextTracker, *Binary, Summary.get()).run();
1110 // Turn off the profile merger by default unless it is explicitly enabled.
1111 if (!CSProfMergeColdContext.getNumOccurrences())
1112 CSProfMergeColdContext = false;
1115 convertToProfileMap();
1117 // Trim and merge cold context profile using cold threshold above.
1118 if (TrimColdProfile || CSProfMergeColdContext) {
1119 SampleContextTrimmer(ProfileMap)
1120 .trimAndMergeColdContextProfiles(
1121 HotCountThreshold, TrimColdProfile, CSProfMergeColdContext,
1122 CSProfMaxColdContextDepth, EnableCSPreInliner);
1125 if (GenCSNestedProfile) {
1126 ProfileConverter CSConverter(ProfileMap);
1127 CSConverter.convertCSProfiles();
1128 FunctionSamples::ProfileIsCS = false;
1130 filterAmbiguousProfile(ProfileMap);
1131 ProfileGeneratorBase::calculateAndShowDensity(ProfileMap);
1134 void ProfileGeneratorBase::computeSummaryAndThreshold(
1135 SampleProfileMap &Profiles) {
1136 SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
1137 Summary = Builder.computeSummaryForProfiles(Profiles);
1138 HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold(
1139 (Summary->getDetailedSummary()));
1140 ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
1141 (Summary->getDetailedSummary()));
1144 void CSProfileGenerator::computeSummaryAndThreshold() {
1145 // Always merge and use context-less profile map to compute summary.
1146 SampleProfileMap ContextLessProfiles;
1147 ContextTracker.createContextLessProfileMap(ContextLessProfiles);
1149 // Set the flag below to avoid merging the profile again in
1150 // computeSummaryAndThreshold
1151 FunctionSamples::ProfileIsCS = false;
1152 assert(
1153 (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) &&
1154 "Don't set --profile-summary-contextless to false for profile "
1155 "generation");
1156 ProfileGeneratorBase::computeSummaryAndThreshold(ContextLessProfiles);
1157 // Recover the old value.
1158 FunctionSamples::ProfileIsCS = true;
1161 void ProfileGeneratorBase::extractProbesFromRange(
1162 const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter,
1163 bool FindDisjointRanges) {
1164 const RangeSample *PRanges = &RangeCounter;
1165 RangeSample Ranges;
1166 if (FindDisjointRanges) {
1167 findDisjointRanges(Ranges, RangeCounter);
1168 PRanges = &Ranges;
1171 for (const auto &Range : *PRanges) {
1172 uint64_t RangeBegin = Range.first.first;
1173 uint64_t RangeEnd = Range.first.second;
1174 uint64_t Count = Range.second;
1176 InstructionPointer IP(Binary, RangeBegin, true);
1177 // Disjoint ranges may have range in the middle of two instr,
1178 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
1179 // can be Addr1+1 to Addr2-1. We should ignore such range.
1180 if (IP.Address > RangeEnd)
1181 continue;
1183 do {
1184 const AddressProbesMap &Address2ProbesMap =
1185 Binary->getAddress2ProbesMap();
1186 for (const MCDecodedPseudoProbe &Probe :
1187 Address2ProbesMap.find(IP.Address)) {
1188 ProbeCounter[&Probe] += Count;
1190 } while (IP.advance() && IP.Address <= RangeEnd);
1194 static void extractPrefixContextStack(SampleContextFrameVector &ContextStack,
1195 const SmallVectorImpl<uint64_t> &AddrVec,
1196 ProfiledBinary *Binary) {
1197 SmallVector<const MCDecodedPseudoProbe *, 16> Probes;
1198 for (auto Address : reverse(AddrVec)) {
1199 const MCDecodedPseudoProbe *CallProbe =
1200 Binary->getCallProbeForAddr(Address);
1201 // These could be the cases when a probe is not found at a calliste. Cutting
1202 // off the context from here since the inliner will not know how to consume
1203 // a context with unknown callsites.
1204 // 1. for functions that are not sampled when
1205 // --decode-probe-for-profiled-functions-only is on.
1206 // 2. for a merged callsite. Callsite merging may cause the loss of original
1207 // probe IDs.
1208 // 3. for an external callsite.
1209 if (!CallProbe)
1210 break;
1211 Probes.push_back(CallProbe);
1214 std::reverse(Probes.begin(), Probes.end());
1216 // Extract context stack for reusing, leaf context stack will be added
1217 // compressed while looking up function profile.
1218 for (const auto *P : Probes) {
1219 Binary->getInlineContextForProbe(P, ContextStack, true);
1223 void CSProfileGenerator::generateProbeBasedProfile() {
1224 // Enable pseudo probe functionalities in SampleProf
1225 FunctionSamples::ProfileIsProbeBased = true;
1226 for (const auto &CI : *SampleCounters) {
1227 const AddrBasedCtxKey *CtxKey =
1228 dyn_cast<AddrBasedCtxKey>(CI.first.getPtr());
1229 // Fill in function body samples from probes, also infer caller's samples
1230 // from callee's probe
1231 populateBodySamplesWithProbes(CI.second.RangeCounter, CtxKey);
1232 // Fill in boundary samples for a call probe
1233 populateBoundarySamplesWithProbes(CI.second.BranchCounter, CtxKey);
1237 void CSProfileGenerator::populateBodySamplesWithProbes(
1238 const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) {
1239 ProbeCounterMap ProbeCounter;
1240 // Extract the top frame probes by looking up each address among the range in
1241 // the Address2ProbeMap
1242 extractProbesFromRange(RangeCounter, ProbeCounter);
1243 std::unordered_map<MCDecodedPseudoProbeInlineTree *,
1244 std::unordered_set<FunctionSamples *>>
1245 FrameSamples;
1246 for (const auto &PI : ProbeCounter) {
1247 const MCDecodedPseudoProbe *Probe = PI.first;
1248 uint64_t Count = PI.second;
1249 // Disjoint ranges have introduce zero-filled gap that
1250 // doesn't belong to current context, filter them out.
1251 if (!Probe->isBlock() || Count == 0)
1252 continue;
1254 ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, Probe);
1255 FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples();
1256 // Record the current frame and FunctionProfile whenever samples are
1257 // collected for non-danglie probes. This is for reporting all of the
1258 // zero count probes of the frame later.
1259 FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile);
1260 FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(),
1261 Count);
1262 FunctionProfile.addTotalSamples(Count);
1263 if (Probe->isEntry()) {
1264 FunctionProfile.addHeadSamples(Count);
1265 // Look up for the caller's function profile
1266 const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe);
1267 ContextTrieNode *CallerNode = ContextNode->getParentContext();
1268 if (InlinerDesc != nullptr && CallerNode != &getRootContext()) {
1269 // Since the context id will be compressed, we have to use callee's
1270 // context id to infer caller's context id to ensure they share the
1271 // same context prefix.
1272 uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset;
1273 uint64_t CallerDiscriminator = ContextNode->getCallSiteLoc().Discriminator;
1274 assert(CallerIndex &&
1275 "Inferred caller's location index shouldn't be zero!");
1276 assert(!CallerDiscriminator &&
1277 "Callsite probe should not have a discriminator!");
1278 FunctionSamples &CallerProfile =
1279 *getOrCreateFunctionSamples(CallerNode);
1280 CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
1281 CallerProfile.addBodySamples(CallerIndex, CallerDiscriminator, Count);
1282 CallerProfile.addTotalSamples(Count);
1283 CallerProfile.addCalledTargetSamples(CallerIndex, CallerDiscriminator,
1284 ContextNode->getFuncName(), Count);
1289 // Assign zero count for remaining probes without sample hits to
1290 // differentiate from probes optimized away, of which the counts are unknown
1291 // and will be inferred by the compiler.
1292 for (auto &I : FrameSamples) {
1293 for (auto *FunctionProfile : I.second) {
1294 for (const MCDecodedPseudoProbe &Probe : I.first->getProbes()) {
1295 FunctionProfile->addBodySamples(Probe.getIndex(),
1296 Probe.getDiscriminator(), 0);
1302 void CSProfileGenerator::populateBoundarySamplesWithProbes(
1303 const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) {
1304 for (const auto &BI : BranchCounter) {
1305 uint64_t SourceAddress = BI.first.first;
1306 uint64_t TargetAddress = BI.first.second;
1307 uint64_t Count = BI.second;
1308 const MCDecodedPseudoProbe *CallProbe =
1309 Binary->getCallProbeForAddr(SourceAddress);
1310 if (CallProbe == nullptr)
1311 continue;
1312 FunctionSamples &FunctionProfile =
1313 getFunctionProfileForLeafProbe(CtxKey, CallProbe);
1314 FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count);
1315 FunctionProfile.addTotalSamples(Count);
1316 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
1317 if (CalleeName.size() == 0)
1318 continue;
1319 FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(),
1320 CallProbe->getDiscriminator(),
1321 FunctionId(CalleeName), Count);
1325 ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe(
1326 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1328 const SmallVectorImpl<uint64_t> *PContext = &CtxKey->Context;
1329 SmallVector<uint64_t, 16> NewContext;
1331 if (InferMissingFrames) {
1332 SmallVector<uint64_t, 16> Context = CtxKey->Context;
1333 // Append leaf frame for a complete inference.
1334 Context.push_back(LeafProbe->getAddress());
1335 inferMissingFrames(Context, NewContext);
1336 // Pop out the leaf probe that was pushed in above.
1337 NewContext.pop_back();
1338 PContext = &NewContext;
1341 SampleContextFrameVector ContextStack;
1342 extractPrefixContextStack(ContextStack, *PContext, Binary);
1344 // Explicitly copy the context for appending the leaf context
1345 SampleContextFrameVector NewContextStack(ContextStack.begin(),
1346 ContextStack.end());
1347 Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true);
1348 // For leaf inlined context with the top frame, we should strip off the top
1349 // frame's probe id, like:
1350 // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar"
1351 auto LeafFrame = NewContextStack.back();
1352 LeafFrame.Location = LineLocation(0, 0);
1353 NewContextStack.pop_back();
1354 // Compress the context string except for the leaf frame
1355 CSProfileGenerator::compressRecursionContext(NewContextStack);
1356 CSProfileGenerator::trimContext(NewContextStack);
1357 NewContextStack.push_back(LeafFrame);
1359 const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid());
1360 bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite();
1361 ContextTrieNode *ContextNode =
1362 getOrCreateContextNode(NewContextStack, WasLeafInlined);
1363 ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash);
1364 return ContextNode;
1367 FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe(
1368 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1369 return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples();
1372 } // end namespace sampleprof
1373 } // end namespace llvm