1 //===-- PerfReader.cpp - perfscript reader ---------------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
8 #include "PerfReader.h"
9 #include "ProfileGenerator.h"
10 #include "llvm/Support/FileSystem.h"
11 #include "llvm/Support/Process.h"
13 #define DEBUG_TYPE "perf-reader"
15 cl::opt
<bool> SkipSymbolization("skip-symbolization", cl::init(false),
17 cl::desc("Dump the unsymbolized profile to the "
18 "output file. It will show unwinder "
19 "output for CS profile generation."));
21 static cl::opt
<bool> ShowMmapEvents("show-mmap-events", cl::init(false),
23 cl::desc("Print binary load events."));
26 UseOffset("use-offset", cl::init(true), cl::ZeroOrMore
,
27 cl::desc("Work with `--skip-symbolization` or "
28 "`--unsymbolized-profile` to write/read the "
29 "offset instead of virtual address."));
31 static cl::opt
<bool> UseLoadableSegmentAsBase(
32 "use-first-loadable-segment-as-base", cl::init(false), cl::ZeroOrMore
,
33 cl::desc("Use first loadable segment address as base address "
34 "for offsets in unsymbolized profile. By default "
35 "first executable segment address is used"));
38 IgnoreStackSamples("ignore-stack-samples", cl::init(false), cl::ZeroOrMore
,
39 cl::desc("Ignore call stack samples for hybrid samples "
40 "and produce context-insensitive profile."));
41 cl::opt
<bool> ShowDetailedWarning("show-detailed-warning", cl::init(false),
43 cl::desc("Show detailed warning message."));
45 extern cl::opt
<std::string
> PerfTraceFilename
;
46 extern cl::opt
<bool> ShowDisassemblyOnly
;
47 extern cl::opt
<bool> ShowSourceLocations
;
48 extern cl::opt
<std::string
> OutputFilename
;
51 namespace sampleprof
{
53 void VirtualUnwinder::unwindCall(UnwindState
&State
) {
54 uint64_t Source
= State
.getCurrentLBRSource();
55 // An artificial return should push an external frame and an artificial call
56 // will match it and pop the external frame so that the context before and
57 // after the external call will be the same.
58 if (State
.getCurrentLBR().IsArtificial
) {
60 // A return is matched and pop the external frame.
61 if (State
.getParentFrame()->isExternalFrame()) {
64 // An artificial return is missing, it happens that the sample is just hit
65 // in the middle of the external code. In this case, the leading branch is
66 // a call to external, we just keep unwinding use a context-less stack.
67 if (State
.getParentFrame() != State
.getDummyRootPtr())
68 NumMissingExternalFrame
++;
69 State
.clearCallStack();
70 State
.pushFrame(Source
);
71 State
.InstPtr
.update(Source
);
76 auto *ParentFrame
= State
.getParentFrame();
77 // The 2nd frame after leaf could be missing if stack sample is
78 // taken when IP is within prolog/epilog, as frame chain isn't
79 // setup yet. Fill in the missing frame in that case.
80 // TODO: Currently we just assume all the addr that can't match the
81 // 2nd frame is in prolog/epilog. In the future, we will switch to
82 // pro/epi tracker(Dwarf CFI) for the precise check.
83 if (ParentFrame
== State
.getDummyRootPtr() ||
84 ParentFrame
->Address
!= Source
) {
85 State
.switchToFrame(Source
);
86 if (ParentFrame
!= State
.getDummyRootPtr()) {
87 if (State
.getCurrentLBR().IsArtificial
)
88 NumMismatchedExtCallBranch
++;
90 NumMismatchedProEpiBranch
++;
95 State
.InstPtr
.update(Source
);
98 void VirtualUnwinder::unwindLinear(UnwindState
&State
, uint64_t Repeat
) {
99 InstructionPointer
&IP
= State
.InstPtr
;
100 uint64_t Target
= State
.getCurrentLBRTarget();
101 uint64_t End
= IP
.Address
;
102 if (Binary
->usePseudoProbes()) {
103 // We don't need to top frame probe since it should be extracted
105 // The outcome of the virtual unwinding with pseudo probes is a
106 // map from a context key to the address range being unwound.
107 // This means basically linear unwinding is not needed for pseudo
108 // probes. The range will be simply recorded here and will be
109 // converted to a list of pseudo probes to report in ProfileGenerator.
110 State
.getParentFrame()->recordRangeCount(Target
, End
, Repeat
);
112 // Unwind linear execution part.
113 // Split and record the range by different inline context. For example:
114 // [0x01] ... main:1 # Target
116 // [0x03] ... main:3 @ foo:1
117 // [0x04] ... main:3 @ foo:2
118 // [0x05] ... main:3 @ foo:3
120 // [0x07] ... main:5 # End
121 // It will be recorded:
122 // [main:*] : [0x06, 0x07], [0x01, 0x02]
123 // [main:3 @ foo:*] : [0x03, 0x05]
124 while (IP
.Address
> Target
) {
125 uint64_t PrevIP
= IP
.Address
;
127 // Break into segments for implicit call/return due to inlining
128 bool SameInlinee
= Binary
->inlineContextEqual(PrevIP
, IP
.Address
);
130 State
.switchToFrame(PrevIP
);
131 State
.CurrentLeafFrame
->recordRangeCount(PrevIP
, End
, Repeat
);
135 assert(IP
.Address
== Target
&& "The last one must be the target address.");
136 // Record the remaining range, [0x01, 0x02] in the example
137 State
.switchToFrame(IP
.Address
);
138 State
.CurrentLeafFrame
->recordRangeCount(IP
.Address
, End
, Repeat
);
142 void VirtualUnwinder::unwindReturn(UnwindState
&State
) {
143 // Add extra frame as we unwind through the return
144 const LBREntry
&LBR
= State
.getCurrentLBR();
145 uint64_t CallAddr
= Binary
->getCallAddrFromFrameAddr(LBR
.Target
);
146 State
.switchToFrame(CallAddr
);
147 // Push an external frame for the case of returning to external
148 // address(callback), later if an aitificial call is matched and it will be
149 // popped up. This is to 1)avoid context being interrupted by callback,
150 // context before or after the callback should be the same. 2) the call stack
151 // of function called by callback should be truncated which is done during
152 // recording the context on trie. For example:
153 // main (call)--> foo (call)--> callback (call)--> bar (return)--> callback
154 // (return)--> foo (return)--> main
155 // Context for bar should not include main and foo.
156 // For the code of foo, the context of before and after callback should both
158 if (LBR
.IsArtificial
)
159 State
.pushFrame(ExternalAddr
);
160 State
.pushFrame(LBR
.Source
);
161 State
.InstPtr
.update(LBR
.Source
);
164 void VirtualUnwinder::unwindBranch(UnwindState
&State
) {
165 // TODO: Tolerate tail call for now, as we may see tail call from libraries.
166 // This is only for intra function branches, excluding tail calls.
167 uint64_t Source
= State
.getCurrentLBRSource();
168 State
.switchToFrame(Source
);
169 State
.InstPtr
.update(Source
);
172 std::shared_ptr
<StringBasedCtxKey
> FrameStack::getContextKey() {
173 std::shared_ptr
<StringBasedCtxKey
> KeyStr
=
174 std::make_shared
<StringBasedCtxKey
>();
175 KeyStr
->Context
= Binary
->getExpandedContext(Stack
, KeyStr
->WasLeafInlined
);
176 if (KeyStr
->Context
.empty())
181 std::shared_ptr
<ProbeBasedCtxKey
> ProbeStack::getContextKey() {
182 std::shared_ptr
<ProbeBasedCtxKey
> ProbeBasedKey
=
183 std::make_shared
<ProbeBasedCtxKey
>();
184 for (auto CallProbe
: Stack
) {
185 ProbeBasedKey
->Probes
.emplace_back(CallProbe
);
187 CSProfileGenerator::compressRecursionContext
<const MCDecodedPseudoProbe
*>(
188 ProbeBasedKey
->Probes
);
189 CSProfileGenerator::trimContext
<const MCDecodedPseudoProbe
*>(
190 ProbeBasedKey
->Probes
);
191 return ProbeBasedKey
;
194 template <typename T
>
195 void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame
*Cur
,
197 if (Cur
->RangeSamples
.empty() && Cur
->BranchSamples
.empty())
200 std::shared_ptr
<ContextKey
> Key
= Stack
.getContextKey();
203 auto Ret
= CtxCounterMap
->emplace(Hashable
<ContextKey
>(Key
), SampleCounter());
204 SampleCounter
&SCounter
= Ret
.first
->second
;
205 for (auto &Item
: Cur
->RangeSamples
) {
206 uint64_t StartOffset
= Binary
->virtualAddrToOffset(std::get
<0>(Item
));
207 uint64_t EndOffset
= Binary
->virtualAddrToOffset(std::get
<1>(Item
));
208 SCounter
.recordRangeCount(StartOffset
, EndOffset
, std::get
<2>(Item
));
211 for (auto &Item
: Cur
->BranchSamples
) {
212 uint64_t SourceOffset
= Binary
->virtualAddrToOffset(std::get
<0>(Item
));
213 uint64_t TargetOffset
= Binary
->virtualAddrToOffset(std::get
<1>(Item
));
214 SCounter
.recordBranchCount(SourceOffset
, TargetOffset
, std::get
<2>(Item
));
218 template <typename T
>
219 void VirtualUnwinder::collectSamplesFromFrameTrie(
220 UnwindState::ProfiledFrame
*Cur
, T
&Stack
) {
221 if (!Cur
->isDummyRoot()) {
222 // Truncate the context for external frame since this isn't a real call
223 // context the compiler will see.
224 if (Cur
->isExternalFrame() || !Stack
.pushFrame(Cur
)) {
225 // Process truncated context
226 // Start a new traversal ignoring its bottom context
227 T
EmptyStack(Binary
);
228 collectSamplesFromFrame(Cur
, EmptyStack
);
229 for (const auto &Item
: Cur
->Children
) {
230 collectSamplesFromFrameTrie(Item
.second
.get(), EmptyStack
);
233 // Keep note of untracked call site and deduplicate them
234 // for warning later.
235 if (!Cur
->isLeafFrame())
236 UntrackedCallsites
.insert(Cur
->Address
);
242 collectSamplesFromFrame(Cur
, Stack
);
243 // Process children frame
244 for (const auto &Item
: Cur
->Children
) {
245 collectSamplesFromFrameTrie(Item
.second
.get(), Stack
);
247 // Recover the call stack
251 void VirtualUnwinder::collectSamplesFromFrameTrie(
252 UnwindState::ProfiledFrame
*Cur
) {
253 if (Binary
->usePseudoProbes()) {
254 ProbeStack
Stack(Binary
);
255 collectSamplesFromFrameTrie
<ProbeStack
>(Cur
, Stack
);
257 FrameStack
Stack(Binary
);
258 collectSamplesFromFrameTrie
<FrameStack
>(Cur
, Stack
);
262 void VirtualUnwinder::recordBranchCount(const LBREntry
&Branch
,
263 UnwindState
&State
, uint64_t Repeat
) {
264 if (Branch
.IsArtificial
|| Branch
.Target
== ExternalAddr
)
267 if (Binary
->usePseudoProbes()) {
268 // Same as recordRangeCount, We don't need to top frame probe since we will
269 // extract it from branch's source address
270 State
.getParentFrame()->recordBranchCount(Branch
.Source
, Branch
.Target
,
273 State
.CurrentLeafFrame
->recordBranchCount(Branch
.Source
, Branch
.Target
,
278 bool VirtualUnwinder::unwind(const PerfSample
*Sample
, uint64_t Repeat
) {
279 // Capture initial state as starting point for unwinding.
280 UnwindState
State(Sample
, Binary
);
282 // Sanity check - making sure leaf of LBR aligns with leaf of stack sample
283 // Stack sample sometimes can be unreliable, so filter out bogus ones.
284 if (!State
.validateInitialState())
287 // Now process the LBR samples in parallel with stack sample
288 // Note that we do not reverse the LBR entry order so we can
289 // unwind the sample stack as we walk through LBR entries.
290 while (State
.hasNextLBR()) {
291 State
.checkStateConsistency();
293 // Do not attempt linear unwind for the leaf range as it's incomplete.
294 if (!State
.IsLastLBR()) {
295 // Unwind implicit calls/returns from inlining, along the linear path,
296 // break into smaller sub section each with its own calling context.
297 unwindLinear(State
, Repeat
);
300 // Save the LBR branch before it gets unwound.
301 const LBREntry
&Branch
= State
.getCurrentLBR();
303 if (isCallState(State
)) {
304 // Unwind calls - we know we encountered call if LBR overlaps with
305 // transition between leaf the 2nd frame. Note that for calls that
306 // were not in the original stack sample, we should have added the
307 // extra frame when processing the return paired with this call.
309 } else if (isReturnState(State
)) {
310 // Unwind returns - check whether the IP is indeed at a return instruction
314 // For regular intra function branches, we only need to record branch with
315 // context. For an artificial branch cross function boundaries, we got an
316 // issue with returning to external code. Take the two LBR entries for
317 // example: [foo:8(RETURN), ext:1] [ext:3(CALL), bar:1] After perf reader,
318 // we only get[foo:8(RETURN), bar:1], unwinder will be confused like foo
319 // return to bar. Here we detect and treat this case as BRANCH instead of
320 // RETURN which only update the source address.
324 // Record `branch` with calling context after unwinding.
325 recordBranchCount(Branch
, State
, Repeat
);
327 // As samples are aggregated on trie, record them into counter map
328 collectSamplesFromFrameTrie(State
.getDummyRootPtr());
333 std::unique_ptr
<PerfReaderBase
>
334 PerfReaderBase::create(ProfiledBinary
*Binary
, PerfInputFile
&PerfInput
) {
335 std::unique_ptr
<PerfReaderBase
> PerfReader
;
337 if (PerfInput
.Format
== PerfFormat::UnsymbolizedProfile
) {
339 new UnsymbolizedProfileReader(Binary
, PerfInput
.InputFile
));
343 // For perf data input, we need to convert them into perf script first.
344 if (PerfInput
.Format
== PerfFormat::PerfData
)
345 PerfInput
= PerfScriptReader::convertPerfDataToTrace(Binary
, PerfInput
);
347 assert((PerfInput
.Format
== PerfFormat::PerfScript
) &&
348 "Should be a perfscript!");
351 PerfScriptReader::checkPerfScriptType(PerfInput
.InputFile
);
352 if (PerfInput
.Content
== PerfContent::LBRStack
) {
353 PerfReader
.reset(new HybridPerfReader(Binary
, PerfInput
.InputFile
));
354 } else if (PerfInput
.Content
== PerfContent::LBR
) {
355 PerfReader
.reset(new LBRPerfReader(Binary
, PerfInput
.InputFile
));
357 exitWithError("Unsupported perfscript!");
363 PerfInputFile
PerfScriptReader::convertPerfDataToTrace(ProfiledBinary
*Binary
,
364 PerfInputFile
&File
) {
365 StringRef PerfData
= File
.InputFile
;
366 // Run perf script to retrieve PIDs matching binary we're interested in.
367 auto PerfExecutable
= sys::Process::FindInEnvPath("PATH", "perf");
368 if (!PerfExecutable
) {
369 exitWithError("Perf not found.");
371 std::string PerfPath
= *PerfExecutable
;
372 std::string PerfTraceFile
= PerfData
.str() + ".script.tmp";
373 StringRef ScriptMMapArgs
[] = {PerfPath
, "script", "--show-mmap-events",
374 "-F", "comm,pid", "-i",
376 Optional
<StringRef
> Redirects
[] = {llvm::None
, // Stdin
377 StringRef(PerfTraceFile
), // Stdout
378 StringRef(PerfTraceFile
)}; // Stderr
379 sys::ExecuteAndWait(PerfPath
, ScriptMMapArgs
, llvm::None
, Redirects
);
382 TraceStream
TraceIt(PerfTraceFile
);
384 std::unordered_set
<uint32_t> PIDSet
;
385 while (!TraceIt
.isAtEoF()) {
387 if (isMMap2Event(TraceIt
.getCurrentLine()) &&
388 extractMMap2EventForBinary(Binary
, TraceIt
.getCurrentLine(), MMap
)) {
389 auto It
= PIDSet
.emplace(MMap
.PID
);
394 PIDs
.append(utostr(MMap
.PID
));
401 exitWithError("No relevant mmap event is found in perf data.");
404 // Run perf script again to retrieve events for PIDs collected above
405 StringRef ScriptSampleArgs
[] = {PerfPath
, "script", "--show-mmap-events",
406 "-F", "ip,brstack", "--pid",
407 PIDs
, "-i", PerfData
};
408 sys::ExecuteAndWait(PerfPath
, ScriptSampleArgs
, llvm::None
, Redirects
);
410 return {PerfTraceFile
, PerfFormat::PerfScript
, PerfContent::UnknownContent
};
413 void PerfScriptReader::updateBinaryAddress(const MMapEvent
&Event
) {
414 // Drop the event which doesn't belong to user-provided binary
415 StringRef BinaryName
= llvm::sys::path::filename(Event
.BinaryPath
);
416 if (Binary
->getName() != BinaryName
)
419 // Drop the event if its image is loaded at the same address
420 if (Event
.Address
== Binary
->getBaseAddress()) {
421 Binary
->setIsLoadedByMMap(true);
425 if (Event
.Offset
== Binary
->getTextSegmentOffset()) {
426 // A binary image could be unloaded and then reloaded at different
427 // place, so update binary load address.
428 // Only update for the first executable segment and assume all other
429 // segments are loaded at consecutive memory addresses, which is the case on
431 Binary
->setBaseAddress(Event
.Address
);
432 Binary
->setIsLoadedByMMap(true);
434 // Verify segments are loaded consecutively.
435 const auto &Offsets
= Binary
->getTextSegmentOffsets();
436 auto It
= std::lower_bound(Offsets
.begin(), Offsets
.end(), Event
.Offset
);
437 if (It
!= Offsets
.end() && *It
== Event
.Offset
) {
438 // The event is for loading a separate executable segment.
439 auto I
= std::distance(Offsets
.begin(), It
);
440 const auto &PreferredAddrs
= Binary
->getPreferredTextSegmentAddresses();
441 if (PreferredAddrs
[I
] - Binary
->getPreferredBaseAddress() !=
442 Event
.Address
- Binary
->getBaseAddress())
443 exitWithError("Executable segments not loaded consecutively");
445 if (It
== Offsets
.begin())
446 exitWithError("File offset not found");
448 // Find the segment the event falls in. A large segment could be loaded
449 // via multiple mmap calls with consecutive memory addresses.
451 assert(*It
< Event
.Offset
);
452 if (Event
.Offset
- *It
!= Event
.Address
- Binary
->getBaseAddress())
453 exitWithError("Segment not loaded by consecutive mmaps");
459 static std::string
getContextKeyStr(ContextKey
*K
,
460 const ProfiledBinary
*Binary
) {
461 if (const auto *CtxKey
= dyn_cast
<StringBasedCtxKey
>(K
)) {
462 return SampleContext::getContextString(CtxKey
->Context
);
463 } else if (const auto *CtxKey
= dyn_cast
<ProbeBasedCtxKey
>(K
)) {
464 SampleContextFrameVector ContextStack
;
465 for (const auto *Probe
: CtxKey
->Probes
) {
466 Binary
->getInlineContextForProbe(Probe
, ContextStack
, true);
468 // Probe context key at this point does not have leaf probe, so do not
469 // include the leaf inline location.
470 return SampleContext::getContextString(ContextStack
, true);
472 llvm_unreachable("unexpected key type");
476 void HybridPerfReader::unwindSamples() {
477 if (Binary
->useFSDiscriminator())
478 exitWithError("FS discriminator is not supported in CS profile.");
479 VirtualUnwinder
Unwinder(&SampleCounters
, Binary
);
480 for (const auto &Item
: AggregatedSamples
) {
481 const PerfSample
*Sample
= Item
.first
.getPtr();
482 Unwinder
.unwind(Sample
, Item
.second
);
485 // Warn about untracked frames due to missing probes.
486 if (ShowDetailedWarning
) {
487 for (auto Address
: Unwinder
.getUntrackedCallsites())
488 WithColor::warning() << "Profile context truncated due to missing probe "
489 << "for call instruction at "
490 << format("0x%" PRIx64
, Address
) << "\n";
493 emitWarningSummary(Unwinder
.getUntrackedCallsites().size(),
494 SampleCounters
.size(),
495 "of profiled contexts are truncated due to missing probe "
496 "for call instruction.");
499 Unwinder
.NumMismatchedExtCallBranch
, Unwinder
.NumTotalBranches
,
500 "of branches'source is a call instruction but doesn't match call frame "
501 "stack, likely due to unwinding error of external frame.");
504 Unwinder
.NumMismatchedProEpiBranch
, Unwinder
.NumTotalBranches
,
505 "of branches'source is a call instruction but doesn't match call frame "
506 "stack, likely due to frame in prolog/epilog.");
508 emitWarningSummary(Unwinder
.NumMissingExternalFrame
,
509 Unwinder
.NumExtCallBranch
,
510 "of artificial call branches but doesn't have an external "
514 bool PerfScriptReader::extractLBRStack(TraceStream
&TraceIt
,
515 SmallVectorImpl
<LBREntry
> &LBRStack
) {
516 // The raw format of LBR stack is like:
517 // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
518 // ... 0x4005c8/0x4005dc/P/-/-/0
519 // It's in FIFO order and separated by whitespace.
520 SmallVector
<StringRef
, 32> Records
;
521 TraceIt
.getCurrentLine().split(Records
, " ", -1, false);
522 auto WarnInvalidLBR
= [](TraceStream
&TraceIt
) {
523 WithColor::warning() << "Invalid address in LBR record at line "
524 << TraceIt
.getLineNumber() << ": "
525 << TraceIt
.getCurrentLine() << "\n";
528 // Skip the leading instruction pointer.
530 uint64_t LeadingAddr
;
531 if (!Records
.empty() && !Records
[0].contains('/')) {
532 if (Records
[0].getAsInteger(16, LeadingAddr
)) {
533 WarnInvalidLBR(TraceIt
);
539 // Now extract LBR samples - note that we do not reverse the
540 // LBR entry order so we can unwind the sample stack as we walk
541 // through LBR entries.
542 uint64_t PrevTrDst
= 0;
544 while (Index
< Records
.size()) {
545 auto &Token
= Records
[Index
++];
546 if (Token
.size() == 0)
549 SmallVector
<StringRef
, 8> Addresses
;
550 Token
.split(Addresses
, "/");
554 // Stop at broken LBR records.
555 if (Addresses
.size() < 2 || Addresses
[0].substr(2).getAsInteger(16, Src
) ||
556 Addresses
[1].substr(2).getAsInteger(16, Dst
)) {
557 WarnInvalidLBR(TraceIt
);
561 bool SrcIsInternal
= Binary
->addressIsCode(Src
);
562 bool DstIsInternal
= Binary
->addressIsCode(Dst
);
563 bool IsExternal
= !SrcIsInternal
&& !DstIsInternal
;
564 bool IsIncoming
= !SrcIsInternal
&& DstIsInternal
;
565 bool IsOutgoing
= SrcIsInternal
&& !DstIsInternal
;
566 bool IsArtificial
= false;
568 // Ignore branches outside the current binary.
570 if (!PrevTrDst
&& !LBRStack
.empty()) {
572 << "Invalid transfer to external code in LBR record at line "
573 << TraceIt
.getLineNumber() << ": " << TraceIt
.getCurrentLine()
576 // Do not ignore the entire samples, the remaining LBR can still be
577 // unwound using a context-less stack.
583 // This is a leading outgoing LBR, we should keep processing the LBRs.
584 if (LBRStack
.empty()) {
585 NumLeadingOutgoingLBR
++;
586 // Record this LBR since current source and next LBR' target is still
588 LBRStack
.emplace_back(LBREntry(Src
, ExternalAddr
, false));
591 // This is middle unpaired outgoing jump which is likely due to
592 // interrupt or incomplete LBR trace. Ignore current and subsequent
593 // entries since they are likely in different contexts.
597 // For transition to external code, group the Source with the next
598 // available transition target.
604 // If we have seen an incoming transition from external code to internal
605 // code, but not a following outgoing transition, the incoming
606 // transition is likely due to interrupt which is usually unpaired.
607 // Ignore current and subsequent entries since they are likely in
608 // different contexts.
613 // For transition from external code (such as dynamic libraries) to
614 // the current binary, keep track of the branch target which will be
615 // grouped with the Source of the last transition from the current
622 // TODO: filter out buggy duplicate branches on Skylake
624 LBRStack
.emplace_back(LBREntry(Src
, Dst
, IsArtificial
));
627 return !LBRStack
.empty();
630 bool PerfScriptReader::extractCallstack(TraceStream
&TraceIt
,
631 SmallVectorImpl
<uint64_t> &CallStack
) {
632 // The raw format of call stack is like:
633 // 4005dc # leaf frame
635 // 400684 # root frame
636 // It's in bottom-up order with each frame in one line.
638 // Extract stack frames from sample
639 while (!TraceIt
.isAtEoF() && !TraceIt
.getCurrentLine().startswith(" 0x")) {
640 StringRef FrameStr
= TraceIt
.getCurrentLine().ltrim();
641 uint64_t FrameAddr
= 0;
642 if (FrameStr
.getAsInteger(16, FrameAddr
)) {
643 // We might parse a non-perf sample line like empty line and comments,
649 // Currently intermixed frame from different binaries is not supported.
650 if (!Binary
->addressIsCode(FrameAddr
)) {
651 if (CallStack
.empty())
652 NumLeafExternalFrame
++;
653 // Push a special value(ExternalAddr) for the external frames so that
654 // unwinder can still work on this with artificial Call/Return branch.
655 // After unwinding, the context will be truncated for external frame.
656 // Also deduplicate the consecutive external addresses.
657 if (CallStack
.empty() || CallStack
.back() != ExternalAddr
)
658 CallStack
.emplace_back(ExternalAddr
);
662 // We need to translate return address to call address for non-leaf frames.
663 if (!CallStack
.empty()) {
664 auto CallAddr
= Binary
->getCallAddrFromFrameAddr(FrameAddr
);
666 // Stop at an invalid return address caused by bad unwinding. This could
667 // happen to frame-pointer-based unwinding and the callee functions that
668 // do not have the frame pointer chain set up.
669 InvalidReturnAddresses
.insert(FrameAddr
);
672 FrameAddr
= CallAddr
;
675 CallStack
.emplace_back(FrameAddr
);
678 // Strip out the bottom external addr.
679 if (CallStack
.size() > 1 && CallStack
.back() == ExternalAddr
)
680 CallStack
.pop_back();
682 // Skip other unrelated line, find the next valid LBR line
683 // Note that even for empty call stack, we should skip the address at the
684 // bottom, otherwise the following pass may generate a truncated callstack
685 while (!TraceIt
.isAtEoF() && !TraceIt
.getCurrentLine().startswith(" 0x")) {
688 // Filter out broken stack sample. We may not have complete frame info
689 // if sample end up in prolog/epilog, the result is dangling context not
690 // connected to entry point. This should be relatively rare thus not much
691 // impact on overall profile quality. However we do want to filter them
692 // out to reduce the number of different calling contexts. One instance
693 // of such case - when sample landed in prolog/epilog, somehow stack
694 // walking will be broken in an unexpected way that higher frames will be
696 return !CallStack
.empty() &&
697 !Binary
->addressInPrologEpilog(CallStack
.front());
700 void PerfScriptReader::warnIfMissingMMap() {
701 if (!Binary
->getMissingMMapWarned() && !Binary
->getIsLoadedByMMap()) {
702 WithColor::warning() << "No relevant mmap event is matched for "
704 << ", will use preferred address ("
705 << format("0x%" PRIx64
,
706 Binary
->getPreferredBaseAddress())
707 << ") as the base loading address!\n";
708 // Avoid redundant warning, only warn at the first unmatched sample.
709 Binary
->setMissingMMapWarned(true);
713 void HybridPerfReader::parseSample(TraceStream
&TraceIt
, uint64_t Count
) {
714 // The raw hybrid sample started with call stack in FILO order and followed
715 // intermediately by LBR sample
717 // 4005dc # call stack leaf
719 // 400684 # call stack root
720 // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
721 // ... 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries
723 std::shared_ptr
<PerfSample
> Sample
= std::make_shared
<PerfSample
>();
725 // Parsing call stack and populate into PerfSample.CallStack
726 if (!extractCallstack(TraceIt
, Sample
->CallStack
)) {
727 // Skip the next LBR line matched current call stack
728 if (!TraceIt
.isAtEoF() && TraceIt
.getCurrentLine().startswith(" 0x"))
735 if (!TraceIt
.isAtEoF() && TraceIt
.getCurrentLine().startswith(" 0x")) {
736 // Parsing LBR stack and populate into PerfSample.LBRStack
737 if (extractLBRStack(TraceIt
, Sample
->LBRStack
)) {
738 if (IgnoreStackSamples
) {
739 Sample
->CallStack
.clear();
741 // Canonicalize stack leaf to avoid 'random' IP from leaf frame skew LBR
743 Sample
->CallStack
.front() = Sample
->LBRStack
[0].Target
;
745 // Record samples by aggregation
746 AggregatedSamples
[Hashable
<PerfSample
>(Sample
)] += Count
;
749 // LBR sample is encoded in single line after stack sample
750 exitWithError("'Hybrid perf sample is corrupted, No LBR sample line");
754 void PerfScriptReader::writeUnsymbolizedProfile(StringRef Filename
) {
756 raw_fd_ostream
OS(Filename
, EC
, llvm::sys::fs::OF_TextWithCRLF
);
758 exitWithError(EC
, Filename
);
759 writeUnsymbolizedProfile(OS
);
762 // Use ordered map to make the output deterministic
763 using OrderedCounterForPrint
= std::map
<std::string
, SampleCounter
*>;
765 void PerfScriptReader::writeUnsymbolizedProfile(raw_fd_ostream
&OS
) {
766 OrderedCounterForPrint OrderedCounters
;
767 for (auto &CI
: SampleCounters
) {
768 OrderedCounters
[getContextKeyStr(CI
.first
.getPtr(), Binary
)] = &CI
.second
;
771 auto SCounterPrinter
= [&](RangeSample
&Counter
, StringRef Separator
,
774 OS
<< Counter
.size() << "\n";
775 for (auto &I
: Counter
) {
776 uint64_t Start
= I
.first
.first
;
777 uint64_t End
= I
.first
.second
;
779 if (!UseOffset
|| (UseOffset
&& UseLoadableSegmentAsBase
)) {
780 Start
= Binary
->offsetToVirtualAddr(Start
);
781 End
= Binary
->offsetToVirtualAddr(End
);
784 if (UseOffset
&& UseLoadableSegmentAsBase
) {
785 Start
-= Binary
->getFirstLoadableAddress();
786 End
-= Binary
->getFirstLoadableAddress();
790 OS
<< Twine::utohexstr(Start
) << Separator
<< Twine::utohexstr(End
) << ":"
795 for (auto &CI
: OrderedCounters
) {
797 if (ProfileIsCSFlat
) {
798 // Context string key
799 OS
<< "[" << CI
.first
<< "]\n";
803 SampleCounter
&Counter
= *CI
.second
;
804 SCounterPrinter(Counter
.RangeCounter
, "-", Indent
);
805 SCounterPrinter(Counter
.BranchCounter
, "->", Indent
);
810 // number of entries in RangeCounter
811 // from_1-to_1:count_1
812 // from_2-to_2:count_2
814 // from_n-to_n:count_n
815 // number of entries in BranchCounter
816 // src_1->dst_1:count_1
817 // src_2->dst_2:count_2
819 // src_n->dst_n:count_n
820 void UnsymbolizedProfileReader::readSampleCounters(TraceStream
&TraceIt
,
821 SampleCounter
&SCounters
) {
822 auto exitWithErrorForTraceLine
= [](TraceStream
&TraceIt
) {
823 std::string Msg
= TraceIt
.isAtEoF()
824 ? "Invalid raw profile!"
825 : "Invalid raw profile at line " +
826 Twine(TraceIt
.getLineNumber()).str() + ": " +
827 TraceIt
.getCurrentLine().str();
830 auto ReadNumber
= [&](uint64_t &Num
) {
831 if (TraceIt
.isAtEoF())
832 exitWithErrorForTraceLine(TraceIt
);
833 if (TraceIt
.getCurrentLine().ltrim().getAsInteger(10, Num
))
834 exitWithErrorForTraceLine(TraceIt
);
838 auto ReadCounter
= [&](RangeSample
&Counter
, StringRef Separator
) {
842 if (TraceIt
.isAtEoF())
843 exitWithErrorForTraceLine(TraceIt
);
844 StringRef Line
= TraceIt
.getCurrentLine().ltrim();
847 auto LineSplit
= Line
.split(":");
848 if (LineSplit
.second
.empty() || LineSplit
.second
.getAsInteger(10, Count
))
849 exitWithErrorForTraceLine(TraceIt
);
853 auto Range
= LineSplit
.first
.split(Separator
);
854 if (Range
.second
.empty() || Range
.first
.getAsInteger(16, Source
) ||
855 Range
.second
.getAsInteger(16, Target
))
856 exitWithErrorForTraceLine(TraceIt
);
858 if (!UseOffset
|| (UseOffset
&& UseLoadableSegmentAsBase
)) {
859 uint64_t BaseAddr
= 0;
860 if (UseOffset
&& UseLoadableSegmentAsBase
)
861 BaseAddr
= Binary
->getFirstLoadableAddress();
863 Source
= Binary
->virtualAddrToOffset(Source
+ BaseAddr
);
864 Target
= Binary
->virtualAddrToOffset(Target
+ BaseAddr
);
867 Counter
[{Source
, Target
}] += Count
;
872 ReadCounter(SCounters
.RangeCounter
, "-");
873 ReadCounter(SCounters
.BranchCounter
, "->");
876 void UnsymbolizedProfileReader::readUnsymbolizedProfile(StringRef FileName
) {
877 TraceStream
TraceIt(FileName
);
878 while (!TraceIt
.isAtEoF()) {
879 std::shared_ptr
<StringBasedCtxKey
> Key
=
880 std::make_shared
<StringBasedCtxKey
>();
881 StringRef Line
= TraceIt
.getCurrentLine();
882 // Read context stack for CS profile.
883 if (Line
.startswith("[")) {
884 ProfileIsCSFlat
= true;
885 auto I
= ContextStrSet
.insert(Line
.str());
886 SampleContext::createCtxVectorFromStr(*I
.first
, Key
->Context
);
890 SampleCounters
.emplace(Hashable
<ContextKey
>(Key
), SampleCounter());
891 readSampleCounters(TraceIt
, Ret
.first
->second
);
895 void UnsymbolizedProfileReader::parsePerfTraces() {
896 readUnsymbolizedProfile(PerfTraceFile
);
// Accumulates branch and range counts for one sample's LBR stack into the
// first entry of SampleCounters. For each LBR entry the source and target
// addresses are converted to offsets via Binary->virtualAddrToOffset() (a
// target equal to ExternalAddr is kept as ExternalAddr). A branch count is
// recorded for non-artificial entries whose target is not external, and a
// range count is recorded from the current target offset to the previous
// entry's source offset; every count is weighted by Repeat.
// NOTE(review): this extract has dropped source lines here — the second
// parameter (presumably `Repeat`, which the body uses), the line directly
// before recordRangeCount (the comment suggests a "not the first LBR" guard),
// and the closing braces are not visible. Confirm against the upstream file.
899 void PerfScriptReader::computeCounterFromLBR(const PerfSample
*Sample
,
// All counts go to the first (begin()) entry of SampleCounters.
901 SampleCounter
&Counter
= SampleCounters
.begin()->second
;
// Source offset of the previously visited LBR entry; starts at 0.
902 uint64_t EndOffeset
= 0;
903 for (const LBREntry
&LBR
: Sample
->LBRStack
) {
904 assert(LBR
.Source
!= ExternalAddr
&&
905 "Branch' source should not be an external address, it should be "
906 "converted to aritificial branch.");
907 uint64_t SourceOffset
= Binary
->virtualAddrToOffset(LBR
.Source
);
// Preserve the ExternalAddr marker instead of translating it to an offset.
908 uint64_t TargetOffset
= LBR
.Target
== static_cast<uint64_t>(ExternalAddr
)
909 ? static_cast<uint64_t>(ExternalAddr
)
910 : Binary
->virtualAddrToOffset(LBR
.Target
);
912 if (!LBR
.IsArtificial
&& TargetOffset
!= ExternalAddr
) {
913 Counter
.recordBranchCount(SourceOffset
, TargetOffset
, Repeat
);
916 // If this is not the first LBR, update the range count between TO of current
917 // LBR and FROM of next LBR.
918 uint64_t StartOffset
= TargetOffset
;
920 Counter
.recordRangeCount(StartOffset
, EndOffeset
, Repeat
);
921 EndOffeset
= SourceOffset
;
// Parses one LBR-only sample from TraceIt. A new PerfSample is allocated and
// extractLBRStack() fills its LBRStack; when that succeeds, the sample's entry
// in AggregatedSamples is incremented by Count.
// NOTE(review): this extract has dropped source lines here — one line inside
// the `if` body (before the aggregation) and the closing braces are not
// visible. Confirm against the upstream file.
925 void LBRPerfReader::parseSample(TraceStream
&TraceIt
, uint64_t Count
) {
926 std::shared_ptr
<PerfSample
> Sample
= std::make_shared
<PerfSample
>();
927 // Parsing LBR stack and populate into PerfSample.LBRStack
928 if (extractLBRStack(TraceIt
, Sample
->LBRStack
)) {
930 // Record LBR only samples by aggregation
931 AggregatedSamples
[Hashable
<PerfSample
>(Sample
)] += Count
;
935 void PerfScriptReader::generateUnsymbolizedProfile() {
936 // There is no context for LBR only sample, so initialize one entry with
937 // fake "empty" context key.
938 assert(SampleCounters
.empty() &&
939 "Sample counter map should be empty before raw profile generation");
940 std::shared_ptr
<StringBasedCtxKey
> Key
=
941 std::make_shared
<StringBasedCtxKey
>();
942 SampleCounters
.emplace(Hashable
<ContextKey
>(Key
), SampleCounter());
943 for (const auto &Item
: AggregatedSamples
) {
944 const PerfSample
*Sample
= Item
.first
.getPtr();
945 computeCounterFromLBR(Sample
, Item
.second
);
// Tries to read an optional aggregated sample count from the current trace
// line by parsing it as a base-10 integer into Count. The negated
// getAsInteger() result selects the "parsed successfully" path
// (StringRef::getAsInteger returns true on failure).
// NOTE(review): this extract has dropped source lines here — the rest of the
// comment below, the declaration and default of `Count`, the body of the
// `if`, and the return are not visible. Confirm against the upstream file.
949 uint64_t PerfScriptReader::parseAggregatedCount(TraceStream
&TraceIt
) {
950 // The aggregated count is optional, so do not skip the line and return 1 if
953 if (!TraceIt
.getCurrentLine().getAsInteger(10, Count
))
// Parses one sample at the current trace position: reads the aggregated
// count via parseAggregatedCount(), asserts it is at least 1, and forwards to
// the two-argument parseSample(TraceIt, Count) overload.
// NOTE(review): this extract has dropped the first line of the body and the
// closing brace. Confirm against the upstream file.
958 void PerfScriptReader::parseSample(TraceStream
&TraceIt
) {
960 uint64_t Count
= parseAggregatedCount(TraceIt
);
961 assert(Count
>= 1 && "Aggregated count should be >= 1!");
962 parseSample(TraceIt
, Count
);
// Matches a PERF_RECORD_MMAP2 line against a regular expression, stores the
// captured pid, mapped address, mapped size, page offset and binary path into
// MMap, optionally prints the load event (ShowMmapEvents), and returns
// whether the file-name component of the mapped path equals Binary->getName().
// When the line cannot be parsed, exitWithError() is called with the
// offending line.
// NOTE(review): this extract has dropped source lines here — the remaining
// parameters (the body uses `Line` and `MMap`), the definition of the field
// indices (PID, MMAPPED_ADDRESS, ...), the condition guarding the error path
// (presumably a check of `R`), and closing braces are not visible. Confirm
// against the upstream file.
965 bool PerfScriptReader::extractMMap2EventForBinary(ProfiledBinary
*Binary
,
968 // Parse a line like:
969 // PERF_RECORD_MMAP2 2113428/2113428: [0x7fd4efb57000(0x204000) @ 0
970 // 08:04 19532229 3585508847]: r-xp /usr/lib64/libdl-2.17.so
971 constexpr static const char *const Pattern
=
972 "PERF_RECORD_MMAP2 ([0-9]+)/[0-9]+: "
973 "\\[(0x[a-f0-9]+)\\((0x[a-f0-9]+)\\) @ "
974 "(0x[a-f0-9]+|0) .*\\]: [-a-z]+ (.*)";
// Capture groups of Pattern, in order (field 1 is the number before the '/',
// which is stored into MMap.PID below):
975 // Field 0 - whole line
977 // Field 2 - base address
978 // Field 3 - mmapped size
979 // Field 4 - page offset
980 // Field 5 - binary path
990 Regex
RegMmap2(Pattern
);
991 SmallVector
<StringRef
, 6> Fields
;
992 bool R
= RegMmap2
.match(Line
, &Fields
);
994 std::string ErrorMsg
= "Cannot parse mmap event: " + Line
.str() + " \n";
995 exitWithError(ErrorMsg
);
// The pid is parsed as decimal; the remaining numeric fields use radix 0,
// which lets getAsInteger auto-detect the base from the "0x" prefix.
997 Fields
[PID
].getAsInteger(10, MMap
.PID
);
998 Fields
[MMAPPED_ADDRESS
].getAsInteger(0, MMap
.Address
);
999 Fields
[MMAPPED_SIZE
].getAsInteger(0, MMap
.Size
);
1000 Fields
[PAGE_OFFSET
].getAsInteger(0, MMap
.Offset
);
1001 MMap
.BinaryPath
= Fields
[BINARY_PATH
];
1002 if (ShowMmapEvents
) {
1003 outs() << "Mmap: Binary " << MMap
.BinaryPath
<< " loaded at "
1004 << format("0x%" PRIx64
":", MMap
.Address
) << " \n";
// Only the file name is compared, not the full path of the mapping.
1007 StringRef BinaryName
= llvm::sys::path::filename(MMap
.BinaryPath
);
1008 return Binary
->getName() == BinaryName
;
// Handles an MMAP2 event at the current trace line: when
// extractMMap2EventForBinary() reports that the event belongs to the profiled
// binary, the parsed event is passed to updateBinaryAddress().
// NOTE(review): this extract has dropped source lines here — the declaration
// of `MMap`, anything after the conditional update, and the closing brace
// are not visible. Confirm against the upstream file.
1011 void PerfScriptReader::parseMMap2Event(TraceStream
&TraceIt
) {
1013 if (extractMMap2EventForBinary(Binary
, TraceIt
.getCurrentLine(), MMap
))
1014 updateBinaryAddress(MMap
);
// Dispatches the current trace line: lines recognised by isMMap2Event() go to
// parseMMap2Event(); parseSample() handles samples.
// NOTE(review): this extract has dropped the line between the two calls
// (presumably an `else`) and the closing brace. Confirm against the upstream
// file.
1018 void PerfScriptReader::parseEventOrSample(TraceStream
&TraceIt
) {
1019 if (isMMap2Event(TraceIt
.getCurrentLine()))
1020 parseMMap2Event(TraceIt
);
1022 parseSample(TraceIt
);
1025 void PerfScriptReader::parseAndAggregateTrace() {
1026 // Trace line iterator
1027 TraceStream
TraceIt(PerfTraceFile
);
1028 while (!TraceIt
.isAtEoF())
1029 parseEventOrSample(TraceIt
);
1032 // A LBR sample is like:
1033 // 40062f 0x5c6313f/0x5c63170/P/-/-/0 0x5c630e7/0x5c63130/P/-/-/0 ...
1034 // A heuristic for fast detection by checking whether a
1035 // leading " 0x" and the '/' exist.
// The trimmed line is split on spaces into at most two non-empty pieces; the
// second piece (everything after the first token) must start with "0x" and
// contain a '/'.
// NOTE(review): this extract has dropped the return statements and the
// closing brace. Confirm against the upstream file.
1036 bool PerfScriptReader::isLBRSample(StringRef Line
) {
1037 // Skip the leading instruction pointer
1038 SmallVector
<StringRef
, 32> Records
;
1039 Line
.trim().split(Records
, " ", 2, false);
1040 if (Records
.size() < 2)
1042 if (Records
[1].startswith("0x") && Records
[1].contains('/'))
// Returns whether Line looks like a PERF_RECORD_MMAP2 event. Two cheap
// pre-checks run first (line shorter than 50 characters; first character is
// a digit) before the substring search.
// NOTE(review): this extract has dropped the statements executed by the two
// pre-checks and the closing brace. Confirm against the upstream file.
1047 bool PerfScriptReader::isMMap2Event(StringRef Line
) {
1048 // Shortcut to avoid the string search when possible.
1049 if (Line
.empty() || Line
.size() < 50)
1052 if (std::isdigit(Line
[0]))
1055 // PERF_RECORD_MMAP2 does not appear at the beginning of the line
1056 // for ` perf script --show-mmap-events -i ...`
1057 return Line
.contains("PERF_RECORD_MMAP2");
1060 // The raw hybrid sample is like
1062 // 4005dc # call stack leaf
1064 // 400684 # call stack root
1065 // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
1066 // ... 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries
1067 // Determine the perfscript contains hybrid samples(call stack + LBRs) by
1068 // checking whether there is a non-empty call stack immediately followed by
// Scans FileName with a TraceStream and classifies its content. Visible
// outcomes are PerfContent::LBRStack and PerfContent::LBR, both chosen after
// consulting isLBRSample() on the line that follows a run of hexadecimal
// frame-address lines; if the scan ends without a decision, exitWithError()
// is called and PerfContent::UnknownContent is the fallback return value.
// NOTE(review): this extract has dropped many source lines here (stream
// advances, the call-stack size bookkeeping, the conditions separating the
// two return values, and closing braces). Confirm against the upstream file.
1070 PerfContent
PerfScriptReader::checkPerfScriptType(StringRef FileName
) {
1071 TraceStream
TraceIt(FileName
);
1072 uint64_t FrameAddr
= 0;
1073 while (!TraceIt
.isAtEoF()) {
1074 // Skip the aggregated count
1075 if (!TraceIt
.getCurrentLine().getAsInteger(10, FrameAddr
))
1078 // Detect sample with call stack
// Consume lines for as long as they parse as a hexadecimal address.
1080 while (!TraceIt
.isAtEoF() &&
1081 !TraceIt
.getCurrentLine().ltrim().getAsInteger(16, FrameAddr
)) {
1085 if (!TraceIt
.isAtEoF()) {
1086 if (isLBRSample(TraceIt
.getCurrentLine())) {
1088 return PerfContent::LBRStack
;
1090 return PerfContent::LBR
;
1096 exitWithError("Invalid perf script input!");
1097 return PerfContent::UnknownContent
;
// Generates the unsymbolized profile for hybrid (call stack + LBR) input.
// ProfileIsCSFlat is set to the negation of the IgnoreStackSamples option and
// selects between two paths; the visible one delegates to the base
// PerfScriptReader::generateUnsymbolizedProfile().
// NOTE(review): this extract has dropped the body of the `if`, the line
// before the base-class call (presumably an `else`), and the closing brace.
// Confirm against the upstream file.
1100 void HybridPerfReader::generateUnsymbolizedProfile() {
1101 ProfileIsCSFlat
= !IgnoreStackSamples
;
1102 if (ProfileIsCSFlat
)
1105 PerfScriptReader::generateUnsymbolizedProfile();
// Reports stack samples truncated by an invalid return address. With
// ShowDetailedWarning set, one warning is printed per address in
// InvalidReturnAddresses; the trailing arguments (count of invalid addresses,
// count of aggregated samples, message) belong to a summary call.
// NOTE(review): this extract has dropped the line naming the callee of the
// summary call (presumably emitWarningSummary) and the closing braces.
// Confirm against the upstream file.
1108 void PerfScriptReader::warnTruncatedStack() {
1109 if (ShowDetailedWarning
) {
1110 for (auto Address
: InvalidReturnAddresses
) {
1111 WithColor::warning()
1112 << "Truncated stack sample due to invalid return address at "
1113 << format("0x%" PRIx64
, Address
)
1114 << ", likely caused by frame pointer omission\n";
1118 InvalidReturnAddresses
.size(), AggregatedSamples
.size(),
1119 "of truncated stack samples due to invalid return address, "
1120 "likely caused by frame pointer omission.");
// Validates the fall-through ranges implied by the aggregated LBR samples and
// emits warnings about suspicious ones.
//
// Step 1: for every aggregated sample, walk its LBR stack and add the sample
// count to Ranges[{target offset of this entry, source offset of the previous
// entry}], skipping the pair while the previous source offset is still 0.
// Step 2: if no range was collected, warn "No samples in perf script!".
// Step 3: for every collected range, check that
//   - the start satisfies Binary->offsetIsCode() and the end satisfies
//     Binary->offsetIsTransfer() (otherwise: not on instruction boundary),
//   - Binary->findFuncRangeForOffset(start) is consulted (dangling range),
//   - the end is below FRange->EndOffset (otherwise: crosses function).
// Step 4: emit one summary per category via emitWarningSummary().
// Per-range messages go through the WarnInvalidRange lambda, which checks
// ShowDetailedWarning first.
// NOTE(review): this extract has dropped many source lines here — the
// declarator naming the map (`Ranges`), the statements following several
// `if`s (early return, counter increments, `continue`s, the FRange null
// check), parts of the lambda's output expression, and closing braces are
// not visible. Confirm against the upstream file.
1123 void PerfScriptReader::warnInvalidRange() {
1124 std::unordered_map
<std::pair
<uint64_t, uint64_t>, uint64_t,
1125 pair_hash
<uint64_t, uint64_t>>
1128 for (const auto &Item
: AggregatedSamples
) {
1129 const PerfSample
*Sample
= Item
.first
.getPtr();
1130 uint64_t Count
= Item
.second
;
// Source offset of the previously visited LBR entry; 0 means "none yet".
1131 uint64_t EndOffeset
= 0;
1132 for (const LBREntry
&LBR
: Sample
->LBRStack
) {
1133 uint64_t SourceOffset
= Binary
->virtualAddrToOffset(LBR
.Source
);
1134 uint64_t StartOffset
= Binary
->virtualAddrToOffset(LBR
.Target
);
1135 if (EndOffeset
!= 0)
1136 Ranges
[{StartOffset
, EndOffeset
}] += Count
;
1137 EndOffeset
= SourceOffset
;
1141 if (Ranges
.empty()) {
1142 WithColor::warning() << "No samples in perf script!\n";
// Helper printing one detailed warning for a range; offsets are converted
// back to virtual addresses for display.
1146 auto WarnInvalidRange
=
1147 [&](uint64_t StartOffset
, uint64_t EndOffset
, StringRef Msg
) {
1148 if (!ShowDetailedWarning
)
1150 WithColor::warning()
1152 << format("%8" PRIx64
, Binary
->offsetToVirtualAddr(StartOffset
))
1154 << format("%8" PRIx64
, Binary
->offsetToVirtualAddr(EndOffset
))
1155 << "]: " << Msg
<< "\n";
1158 const char *EndNotBoundaryMsg
= "Range is not on instruction boundary, "
1159 "likely due to profile and binary mismatch.";
1160 const char *DanglingRangeMsg
= "Range does not belong to any functions, "
1161 "likely from PLT, .init or .fini section.";
1162 const char *RangeCrossFuncMsg
=
1163 "Fall through range should not cross function boundaries, likely due to "
1164 "profile and binary mismatch.";
// Per-category tallies reported in the summaries at the end.
1166 uint64_t InstNotBoundary
= 0;
1167 uint64_t UnmatchedRange
= 0;
1168 uint64_t RangeCrossFunc
= 0;
1170 for (auto &I
: Ranges
) {
1171 uint64_t StartOffset
= I
.first
.first
;
1172 uint64_t EndOffset
= I
.first
.second
;
1174 if (!Binary
->offsetIsCode(StartOffset
) ||
1175 !Binary
->offsetIsTransfer(EndOffset
)) {
1177 WarnInvalidRange(StartOffset
, EndOffset
, EndNotBoundaryMsg
);
1180 auto *FRange
= Binary
->findFuncRangeForOffset(StartOffset
);
1183 WarnInvalidRange(StartOffset
, EndOffset
, DanglingRangeMsg
);
1187 if (EndOffset
>= FRange
->EndOffset
) {
1189 WarnInvalidRange(StartOffset
, EndOffset
, RangeCrossFuncMsg
);
// Summaries are expressed relative to the number of distinct ranges.
1193 uint64_t TotalRangeNum
= Ranges
.size();
1194 emitWarningSummary(InstNotBoundary
, TotalRangeNum
,
1195 "of profiled ranges are not on instruction boundary.");
1196 emitWarningSummary(UnmatchedRange
, TotalRangeNum
,
1197 "of profiled ranges do not belong to any functions.");
1198 emitWarningSummary(RangeCrossFunc
, TotalRangeNum
,
1199 "of profiled ranges do cross function boundaries.");
// Top-level driver for perf-script input. In order: parse and aggregate the
// trace, emit summaries for samples with a leaf external frame and with a
// leading external LBR (both relative to NumTotalSample), warn about
// truncated stacks, generate the unsymbolized profile, and — when
// SkipSymbolization is set — write that profile to OutputFilename.
// NOTE(review): this extract has dropped source lines between the visible
// steps (e.g. the line after warnTruncatedStack()) and the closing brace.
// Confirm against the upstream file.
1202 void PerfScriptReader::parsePerfTraces() {
1203 // Parse perf traces and do aggregation.
1204 parseAndAggregateTrace();
1206 emitWarningSummary(NumLeafExternalFrame
, NumTotalSample
,
1207 "of samples have leaf external frame in call stack.");
1208 emitWarningSummary(NumLeadingOutgoingLBR
, NumTotalSample
,
1209 "of samples have leading external LBR.");
1211 // Generate unsymbolized profile.
1212 warnTruncatedStack();
1214 generateUnsymbolizedProfile();
1216 if (SkipSymbolization
)
1217 writeUnsymbolizedProfile(OutputFilename
);
1220 } // end namespace sampleprof
1221 } // end namespace llvm