1 //===- bolt/runtime/instr.cpp ---------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // BOLT runtime instrumentation library for x86 Linux. Currently, BOLT does
10 // not support linking modules with dependencies on one another into the final
11 // binary (TODO?), which means this library has to be self-contained in a single
14 // All extern declarations here need to be defined by BOLT itself. Those will be
15 // undefined symbols that BOLT needs to resolve by emitting these symbols with
16 // MCStreamer. Currently, Passes/Instrumentation.cpp is the pass responsible
17 // for defining the symbols here and these two files have a tight coupling: one
18 // working statically when you run BOLT and another during program runtime when
19 // you run an instrumented binary. The main goal here is to output an fdata file
20 // (BOLT profile) with the instrumentation counters inserted by the static pass.
21 // Counters for indirect calls are an exception, as we can't know them
22 // statically. These counters are created and managed here. To allow this, we
23 // need a minimal framework for allocating memory dynamically. We provide this
24 // with the BumpPtrAllocator class (not LLVM's, but our own version of it).
26 // Since this code is intended to be inserted into any executable, we decided to
27 // make it standalone and do not depend on any external libraries (i.e. language
28 // support libraries, such as glibc or stdc++). To allow this, we provide a few
29 // light implementations of common OS interacting functionalities using direct
30 // syscall wrappers. Our simple allocator doesn't manage deallocations that
31 // fragment the memory space, so it's stack based. This is the minimal framework
32 // provided here to allow processing instrumented counters and writing fdata.
34 // In the C++ idiom used here, we never use or rely on constructors or
35 // destructors for global objects. That's because those need support from the
36 // linker in initialization/finalization code, and we want to keep our linker
37 // very simple. Similarly, we don't create any global objects that are zero
38 // initialized, since those would need to go .bss, which our simple linker also
39 // don't support (TODO?).
41 //===----------------------------------------------------------------------===//
45 // Enables a very verbose logging to stderr useful when debugging
46 //#define ENABLE_DEBUG
56 #pragma GCC visibility push(hidden)
60 #if defined(__APPLE__)
61 extern uint64_t* _bolt_instr_locations_getter();
62 extern uint32_t _bolt_num_counters_getter();
64 extern uint8_t* _bolt_instr_tables_getter();
65 extern uint32_t _bolt_instr_num_funcs_getter();
69 // Main counters inserted by instrumentation, incremented during runtime when
70 // points of interest (locations) in the program are reached. Those are direct
71 // calls and direct and indirect branches (local ones). There are also counters
72 // for basic block execution if they are a spanning tree leaf and need to be
73 // counted in order to infer the execution count of other edges of the CFG.
74 extern uint64_t __bolt_instr_locations
[];
75 extern uint32_t __bolt_num_counters
;
76 // Descriptions are serialized metadata about binary functions written by BOLT,
77 // so we have a minimal understanding about the program structure. For a
78 // reference on the exact format of this metadata, see *Description structs,
79 // Location, IntrumentedNode and EntryNode.
80 // Number of indirect call site descriptions
81 extern uint32_t __bolt_instr_num_ind_calls
;
82 // Number of indirect call target descriptions
83 extern uint32_t __bolt_instr_num_ind_targets
;
84 // Number of function descriptions
85 extern uint32_t __bolt_instr_num_funcs
;
86 // Time to sleep across dumps (when we write the fdata profile to disk)
87 extern uint32_t __bolt_instr_sleep_time
;
88 // Do not clear counters across dumps, rewrite file with the updated values
89 extern bool __bolt_instr_no_counters_clear
;
90 // Wait until all forks of instrumented process will finish
91 extern bool __bolt_instr_wait_forks
;
92 // Filename to dump data to
93 extern char __bolt_instr_filename
[];
94 // Instumented binary file path
95 extern char __bolt_instr_binpath
[];
96 // If true, append current PID to the fdata filename when creating it so
97 // different invocations of the same program can be differentiated.
98 extern bool __bolt_instr_use_pid
;
99 // Functions that will be used to instrument indirect calls. BOLT static pass
100 // will identify indirect calls and modify them to load the address in these
101 // trampolines and call this address instead. BOLT can't use direct calls to
102 // our handlers because our addresses here are not known at analysis time. We
103 // only support resolving dependencies from this file to the output of BOLT,
104 // *not* the other way around.
105 // TODO: We need better linking support to make that happen.
106 extern void (*__bolt_ind_call_counter_func_pointer
)();
107 extern void (*__bolt_ind_tailcall_counter_func_pointer
)();
108 // Function pointers to init/fini trampoline routines in the binary, so we can
109 // resume regular execution of these functions that we hooked
110 extern void __bolt_start_trampoline();
111 extern void __bolt_fini_trampoline();
118 /// A simple allocator that mmaps a fixed size region and manages this space
119 /// in a stack fashion, meaning you always deallocate the last element that
120 /// was allocated. In practice, we don't need to deallocate individual elements.
121 /// We monotonically increase our usage and then deallocate everything once we
122 /// are done processing something.
123 class BumpPtrAllocator
{
124 /// This is written before each allocation and act as a canary to detect when
125 /// a bug caused our program to cross allocation boundaries.
126 struct EntryMetadata
{
132 void *allocate(size_t Size
) {
135 if (StackBase
== nullptr) {
136 StackBase
= reinterpret_cast<uint8_t *>(
137 __mmap(0, MaxSize
, PROT_READ
| PROT_WRITE
,
138 (Shared
? MAP_SHARED
: MAP_PRIVATE
) | MAP_ANONYMOUS
, -1, 0));
139 assert(StackBase
!= MAP_FAILED
,
140 "BumpPtrAllocator: failed to mmap stack!");
144 Size
= alignTo(Size
+ sizeof(EntryMetadata
), 16);
145 uint8_t *AllocAddress
= StackBase
+ StackSize
+ sizeof(EntryMetadata
);
146 auto *M
= reinterpret_cast<EntryMetadata
*>(StackBase
+ StackSize
);
150 assert(StackSize
< MaxSize
, "allocator ran out of memory");
155 /// Element-wise deallocation is only used for debugging to catch memory
156 /// bugs by checking magic bytes. Ordinarily, we reset the allocator once
157 /// we are done with it. Reset is done with clear(). There's no need
158 /// to deallocate each element individually.
159 void deallocate(void *Ptr
) {
161 uint8_t MetadataOffset
= sizeof(EntryMetadata
);
162 auto *M
= reinterpret_cast<EntryMetadata
*>(
163 reinterpret_cast<uint8_t *>(Ptr
) - MetadataOffset
);
164 const uint8_t *StackTop
= StackBase
+ StackSize
+ MetadataOffset
;
166 if (Ptr
!= StackTop
- M
->AllocSize
) {
167 // Failed validation, check if it is a pointer returned by operator new []
169 sizeof(uint64_t); // Space for number of elements alloc'ed
170 M
= reinterpret_cast<EntryMetadata
*>(reinterpret_cast<uint8_t *>(Ptr
) -
172 // Ok, it failed both checks if this assertion fails. Stop the program, we
173 // have a memory bug.
174 assert(Ptr
== StackTop
- M
->AllocSize
,
175 "must deallocate the last element alloc'ed");
177 assert(M
->Magic
== Magic
, "allocator magic is corrupt");
178 StackSize
-= M
->AllocSize
;
181 void deallocate(void *) {}
189 /// Set mmap reservation size (only relevant before first allocation)
190 void setMaxSize(uint64_t Size
) { MaxSize
= Size
; }
192 /// Set mmap reservation privacy (only relevant before first allocation)
193 void setShared(bool S
) { Shared
= S
; }
196 if (StackBase
== nullptr)
198 __munmap(StackBase
, MaxSize
);
201 // Placement operator to construct allocator in possibly shared mmaped memory
202 static void *operator new(size_t, void *Ptr
) { return Ptr
; };
205 static constexpr uint64_t Magic
= 0x1122334455667788ull
;
206 uint64_t MaxSize
= 0xa00000;
207 uint8_t *StackBase
{nullptr};
208 uint64_t StackSize
{0};
213 /// Used for allocating indirect call instrumentation counters. Initialized by
214 /// __bolt_instr_setup, our initialization routine.
215 BumpPtrAllocator
*GlobalAlloc
;
217 // Base address which we substract from recorded PC values when searching for
218 // indirect call description entries. Needed because indCall descriptions are
219 // mapped read-only and contain static addresses. Initialized in
220 // __bolt_instr_setup.
221 uint64_t TextBaseAddress
= 0;
223 // Storage for GlobalAlloc which can be shared if not using
224 // instrumentation-file-append-pid.
225 void *GlobalMetadataStorage
;
227 } // anonymous namespace
229 // User-defined placement new operators. We only use those (as opposed to
230 // overriding the regular operator new) so we can keep our allocator in the
231 // stack instead of in a data section (global).
232 void *operator new(size_t Sz
, BumpPtrAllocator
&A
) { return A
.allocate(Sz
); }
233 void *operator new(size_t Sz
, BumpPtrAllocator
&A
, char C
) {
234 auto *Ptr
= reinterpret_cast<char *>(A
.allocate(Sz
));
238 void *operator new[](size_t Sz
, BumpPtrAllocator
&A
) {
239 return A
.allocate(Sz
);
241 void *operator new[](size_t Sz
, BumpPtrAllocator
&A
, char C
) {
242 auto *Ptr
= reinterpret_cast<char *>(A
.allocate(Sz
));
246 // Only called during exception unwinding (useless). We must manually dealloc.
247 // C++ language weirdness
248 void operator delete(void *Ptr
, BumpPtrAllocator
&A
) { A
.deallocate(Ptr
); }
252 // Disable instrumentation optimizations that sacrifice profile accuracy
253 extern "C" bool __bolt_instr_conservative
;
255 /// Basic key-val atom stored in our hash
256 struct SimpleHashTableEntryBase
{
259 void dump(const char *Msg
= nullptr) {
260 // TODO: make some sort of formatting function
261 // Currently we have to do it the ugly way because
262 // we want every message to be printed atomically via a single call to
263 // __write. If we use reportNumber() and others nultiple times, we'll get
264 // garbage in mulithreaded environment
267 Ptr
= intToStr(Ptr
, __getpid(), 10);
271 Ptr
= strCopy(Ptr
, Msg
, strLen(Msg
));
274 Ptr
= intToStr(Ptr
, (uint64_t)this, 16);
277 Ptr
= strCopy(Ptr
, "MapEntry(0x", sizeof("MapEntry(0x") - 1);
278 Ptr
= intToStr(Ptr
, Key
, 16);
283 Ptr
= intToStr(Ptr
, Val
, 16);
286 assert(Ptr
- Buf
< BufSize
, "Buffer overflow!");
287 // print everything all at once for atomicity
288 __write(2, Buf
, Ptr
- Buf
);
292 /// This hash table implementation starts by allocating a table of size
293 /// InitialSize. When conflicts happen in this main table, it resolves
294 /// them by chaining a new table of size IncSize. It never reallocs as our
295 /// allocator doesn't support it. The key is intended to be function pointers.
296 /// There's no clever hash function (it's just x mod size, size being prime).
297 /// I never tuned the coefficientes in the modular equation (TODO)
298 /// This is used for indirect calls (each call site has one of this, so it
299 /// should have a small footprint) and for tallying call counts globally for
300 /// each target to check if we missed the origin of some calls (this one is a
301 /// large instantiation of this template, since it is global for all call sites)
302 template <typename T
= SimpleHashTableEntryBase
, uint32_t InitialSize
= 7,
303 uint32_t IncSize
= 7>
304 class SimpleHashTable
{
308 /// Increment by 1 the value of \p Key. If it is not in this table, it will be
309 /// added to the table and its value set to 1.
310 void incrementVal(uint64_t Key
, BumpPtrAllocator
&Alloc
) {
311 if (!__bolt_instr_conservative
) {
315 auto &E
= getOrAllocEntry(Key
, Alloc
);
320 auto &E
= getOrAllocEntry(Key
, Alloc
);
324 /// Basic member accessing interface. Here we pass the allocator explicitly to
325 /// avoid storing a pointer to it as part of this table (remember there is one
326 /// hash for each indirect call site, so we want to minimize our footprint).
327 MapEntry
&get(uint64_t Key
, BumpPtrAllocator
&Alloc
) {
328 if (!__bolt_instr_conservative
) {
332 return getOrAllocEntry(Key
, Alloc
);
335 return getOrAllocEntry(Key
, Alloc
);
338 /// Traverses all elements in the table
339 template <typename
... Args
>
340 void forEachElement(void (*Callback
)(MapEntry
&, Args
...), Args
... args
) {
344 return forEachElement(Callback
, InitialSize
, TableRoot
, args
...);
347 void resetCounters();
350 constexpr static uint64_t VacantMarker
= 0;
351 constexpr static uint64_t FollowUpTableMarker
= 0x8000000000000000ull
;
353 MapEntry
*TableRoot
{nullptr};
357 template <typename
... Args
>
358 void forEachElement(void (*Callback
)(MapEntry
&, Args
...),
359 uint32_t NumEntries
, MapEntry
*Entries
, Args
... args
) {
360 for (uint32_t I
= 0; I
< NumEntries
; ++I
) {
361 MapEntry
&Entry
= Entries
[I
];
362 if (Entry
.Key
== VacantMarker
)
364 if (Entry
.Key
& FollowUpTableMarker
) {
366 reinterpret_cast<MapEntry
*>(Entry
.Key
& ~FollowUpTableMarker
);
367 assert(Next
!= Entries
, "Circular reference!");
368 forEachElement(Callback
, IncSize
, Next
, args
...);
371 Callback(Entry
, args
...);
375 MapEntry
&firstAllocation(uint64_t Key
, BumpPtrAllocator
&Alloc
) {
376 TableRoot
= new (Alloc
, 0) MapEntry
[InitialSize
];
377 MapEntry
&Entry
= TableRoot
[Key
% InitialSize
];
379 // DEBUG(Entry.dump("Created root entry: "));
383 MapEntry
&getEntry(MapEntry
*Entries
, uint64_t Key
, uint64_t Selector
,
384 BumpPtrAllocator
&Alloc
, int CurLevel
) {
385 // DEBUG(reportNumber("getEntry called, level ", CurLevel, 10));
386 const uint32_t NumEntries
= CurLevel
== 0 ? InitialSize
: IncSize
;
387 uint64_t Remainder
= Selector
/ NumEntries
;
388 Selector
= Selector
% NumEntries
;
389 MapEntry
&Entry
= Entries
[Selector
];
392 if (Entry
.Key
== Key
) {
393 // DEBUG(Entry.dump("Hit: "));
397 // Vacant - add new entry
398 if (Entry
.Key
== VacantMarker
) {
400 // DEBUG(Entry.dump("Adding new entry: "));
404 // Defer to the next level
405 if (Entry
.Key
& FollowUpTableMarker
) {
407 reinterpret_cast<MapEntry
*>(Entry
.Key
& ~FollowUpTableMarker
),
408 Key
, Remainder
, Alloc
, CurLevel
+ 1);
411 // Conflict - create the next level
412 // DEBUG(Entry.dump("Creating new level: "));
414 MapEntry
*NextLevelTbl
= new (Alloc
, 0) MapEntry
[IncSize
];
416 // reportNumber("Newly allocated level: 0x", uint64_t(NextLevelTbl),
418 uint64_t CurEntrySelector
= Entry
.Key
/ InitialSize
;
419 for (int I
= 0; I
< CurLevel
; ++I
)
420 CurEntrySelector
/= IncSize
;
421 CurEntrySelector
= CurEntrySelector
% IncSize
;
422 NextLevelTbl
[CurEntrySelector
] = Entry
;
423 Entry
.Key
= reinterpret_cast<uint64_t>(NextLevelTbl
) | FollowUpTableMarker
;
424 assert((NextLevelTbl
[CurEntrySelector
].Key
& ~FollowUpTableMarker
) !=
426 "circular reference created!\n");
427 // DEBUG(NextLevelTbl[CurEntrySelector].dump("New level entry: "));
428 // DEBUG(Entry.dump("Updated old entry: "));
429 return getEntry(NextLevelTbl
, Key
, Remainder
, Alloc
, CurLevel
+ 1);
432 MapEntry
&getOrAllocEntry(uint64_t Key
, BumpPtrAllocator
&Alloc
) {
434 MapEntry
&E
= getEntry(TableRoot
, Key
, Key
, Alloc
, 0);
435 assert(!(E
.Key
& FollowUpTableMarker
), "Invalid entry!");
438 return firstAllocation(Key
, Alloc
);
442 template <typename T
> void resetIndCallCounter(T
&Entry
) {
446 template <typename T
, uint32_t X
, uint32_t Y
>
447 void SimpleHashTable
<T
, X
, Y
>::resetCounters() {
448 forEachElement(resetIndCallCounter
);
451 /// Represents a hash table mapping a function target address to its counter.
452 using IndirectCallHashTable
= SimpleHashTable
<>;
454 /// Initialize with number 1 instead of 0 so we don't go into .bss. This is the
455 /// global array of all hash tables storing indirect call destinations happening
456 /// during runtime, one table per call site.
457 IndirectCallHashTable
*GlobalIndCallCounters
{
458 reinterpret_cast<IndirectCallHashTable
*>(1)};
460 /// Don't allow reentrancy in the fdata writing phase - only one thread writes
462 Mutex
*GlobalWriteProfileMutex
{reinterpret_cast<Mutex
*>(1)};
464 /// Store number of calls in additional to target address (Key) and frequency
465 /// as perceived by the basic block counter (Val).
466 struct CallFlowEntryBase
: public SimpleHashTableEntryBase
{
470 using CallFlowHashTableBase
= SimpleHashTable
<CallFlowEntryBase
, 11939, 233>;
472 /// This is a large table indexing all possible call targets (indirect and
473 /// direct ones). The goal is to find mismatches between number of calls (for
474 /// those calls we were able to track) and the entry basic block counter of the
475 /// callee. In most cases, these two should be equal. If not, there are two
476 /// possible scenarios here:
478 /// * Entry BB has higher frequency than all known calls to this function.
479 /// In this case, we have dynamic library code or any uninstrumented code
480 /// calling this function. We will write the profile for these untracked
481 /// calls as having source "0 [unknown] 0" in the fdata file.
483 /// * Number of known calls is higher than the frequency of entry BB
484 /// This only happens when there is no counter for the entry BB / callee
485 /// function is not simple (in BOLT terms). We don't do anything special
486 /// here and just ignore those (we still report all calls to the non-simple
487 /// function, though).
489 class CallFlowHashTable
: public CallFlowHashTableBase
{
491 CallFlowHashTable(BumpPtrAllocator
&Alloc
) : Alloc(Alloc
) {}
493 MapEntry
&get(uint64_t Key
) { return CallFlowHashTableBase::get(Key
, Alloc
); }
496 // Different than the hash table for indirect call targets, we do store the
497 // allocator here since there is only one call flow hash and space overhead
499 BumpPtrAllocator
&Alloc
;
503 /// Description metadata emitted by BOLT to describe the program - refer to
504 /// Passes/Instrumentation.cpp - Instrumentation::emitTablesAsELFNote()
507 uint32_t FunctionName
;
511 struct CallDescription
{
516 uint64_t TargetAddress
;
519 using IndCallDescription
= Location
;
521 struct IndCallTargetDescription
{
526 struct EdgeDescription
{
534 struct InstrumentedNode
{
544 struct FunctionDescription
{
545 uint32_t NumLeafNodes
;
546 const InstrumentedNode
*LeafNodes
;
548 const EdgeDescription
*Edges
;
550 const CallDescription
*Calls
;
551 uint32_t NumEntryNodes
;
552 const EntryNode
*EntryNodes
;
554 /// Constructor will parse the serialized function metadata written by BOLT
555 FunctionDescription(const uint8_t *FuncDesc
);
557 uint64_t getSize() const {
558 return 16 + NumLeafNodes
* sizeof(InstrumentedNode
) +
559 NumEdges
* sizeof(EdgeDescription
) +
560 NumCalls
* sizeof(CallDescription
) +
561 NumEntryNodes
* sizeof(EntryNode
);
565 /// The context is created when the fdata profile needs to be written to disk
566 /// and we need to interpret our runtime counters. It contains pointers to the
567 /// mmaped binary (only the BOLT written metadata section). Deserialization
568 /// should be straightforward as most data is POD or an array of POD elements.
569 /// This metadata is used to reconstruct function CFGs.
570 struct ProfileWriterContext
{
571 IndCallDescription
*IndCallDescriptions
;
572 IndCallTargetDescription
*IndCallTargets
;
573 uint8_t *FuncDescriptions
;
574 char *Strings
; // String table with function names used in this binary
575 int FileDesc
; // File descriptor for the file on disk backing this
576 // information in memory via mmap
577 void *MMapPtr
; // The mmap ptr
578 int MMapSize
; // The mmap size
580 /// Hash table storing all possible call destinations to detect untracked
581 /// calls and correctly report them as [unknown] in output fdata.
582 CallFlowHashTable
*CallFlowTable
;
584 /// Lookup the sorted indirect call target vector to fetch function name and
585 /// offset for an arbitrary function pointer.
586 const IndCallTargetDescription
*lookupIndCallTarget(uint64_t Target
) const;
589 /// Perform a string comparison and returns zero if Str1 matches Str2. Compares
590 /// at most Size characters.
591 int compareStr(const char *Str1
, const char *Str2
, int Size
) {
592 while (*Str1
== *Str2
) {
593 if (*Str1
== '\0' || --Size
== 0)
601 /// Output Location to the fdata file
602 char *serializeLoc(const ProfileWriterContext
&Ctx
, char *OutBuf
,
603 const Location Loc
, uint32_t BufSize
) {
604 // fdata location format: Type Name Offset
605 // Type 1 - regular symbol
606 OutBuf
= strCopy(OutBuf
, "1 ");
607 const char *Str
= Ctx
.Strings
+ Loc
.FunctionName
;
611 if (++Size
>= BufSize
)
614 assert(!*Str
, "buffer overflow, function name too large");
616 OutBuf
= intToStr(OutBuf
, Loc
.Offset
, 16);
621 /// Read and deserialize a function description written by BOLT. \p FuncDesc
622 /// points at the beginning of the function metadata structure in the file.
623 /// See Instrumentation::emitTablesAsELFNote()
624 FunctionDescription::FunctionDescription(const uint8_t *FuncDesc
) {
625 NumLeafNodes
= *reinterpret_cast<const uint32_t *>(FuncDesc
);
626 DEBUG(reportNumber("NumLeafNodes = ", NumLeafNodes
, 10));
627 LeafNodes
= reinterpret_cast<const InstrumentedNode
*>(FuncDesc
+ 4);
629 NumEdges
= *reinterpret_cast<const uint32_t *>(
630 FuncDesc
+ 4 + NumLeafNodes
* sizeof(InstrumentedNode
));
631 DEBUG(reportNumber("NumEdges = ", NumEdges
, 10));
632 Edges
= reinterpret_cast<const EdgeDescription
*>(
633 FuncDesc
+ 8 + NumLeafNodes
* sizeof(InstrumentedNode
));
635 NumCalls
= *reinterpret_cast<const uint32_t *>(
636 FuncDesc
+ 8 + NumLeafNodes
* sizeof(InstrumentedNode
) +
637 NumEdges
* sizeof(EdgeDescription
));
638 DEBUG(reportNumber("NumCalls = ", NumCalls
, 10));
639 Calls
= reinterpret_cast<const CallDescription
*>(
640 FuncDesc
+ 12 + NumLeafNodes
* sizeof(InstrumentedNode
) +
641 NumEdges
* sizeof(EdgeDescription
));
642 NumEntryNodes
= *reinterpret_cast<const uint32_t *>(
643 FuncDesc
+ 12 + NumLeafNodes
* sizeof(InstrumentedNode
) +
644 NumEdges
* sizeof(EdgeDescription
) + NumCalls
* sizeof(CallDescription
));
645 DEBUG(reportNumber("NumEntryNodes = ", NumEntryNodes
, 10));
646 EntryNodes
= reinterpret_cast<const EntryNode
*>(
647 FuncDesc
+ 16 + NumLeafNodes
* sizeof(InstrumentedNode
) +
648 NumEdges
* sizeof(EdgeDescription
) + NumCalls
* sizeof(CallDescription
));
651 /// Read and mmap descriptions written by BOLT from the executable's notes
653 #if defined(HAVE_ELF_H) and !defined(__APPLE__)
655 void *__attribute__((noinline
)) __get_pc() {
656 return __builtin_extract_return_addr(__builtin_return_address(0));
659 /// Get string with address and parse it to hex pair <StartAddress, EndAddress>
660 bool parseAddressRange(const char *Str
, uint64_t &StartAddress
,
661 uint64_t &EndAddress
) {
664 // Parsed string format: <hex1>-<hex2>
665 StartAddress
= hexToLong(Str
, '-');
666 while (*Str
&& *Str
!= '-')
670 ++Str
; // swallow '-'
671 EndAddress
= hexToLong(Str
);
675 /// Get full path to the real binary by getting current virtual address
676 /// and searching for the appropriate link in address range in
677 /// /proc/self/map_files
678 static char *getBinaryPath() {
679 const uint32_t BufSize
= 1024;
680 const uint32_t NameMax
= 4096;
681 const char DirPath
[] = "/proc/self/map_files/";
682 static char TargetPath
[NameMax
] = {};
685 if (__bolt_instr_binpath
[0] != '\0')
686 return __bolt_instr_binpath
;
688 if (TargetPath
[0] != '\0')
691 unsigned long CurAddr
= (unsigned long)__get_pc();
692 uint64_t FDdir
= __open(DirPath
, O_RDONLY
,
694 assert(static_cast<int64_t>(FDdir
) >= 0,
695 "failed to open /proc/self/map_files");
697 while (long Nread
= __getdents64(FDdir
, (struct dirent64
*)Buf
, BufSize
)) {
698 assert(static_cast<int64_t>(Nread
) != -1, "failed to get folder entries");
701 for (long Bpos
= 0; Bpos
< Nread
; Bpos
+= d
->d_reclen
) {
702 d
= (struct dirent64
*)(Buf
+ Bpos
);
704 uint64_t StartAddress
, EndAddress
;
705 if (!parseAddressRange(d
->d_name
, StartAddress
, EndAddress
))
707 if (CurAddr
< StartAddress
|| CurAddr
> EndAddress
)
709 char FindBuf
[NameMax
];
710 char *C
= strCopy(FindBuf
, DirPath
, NameMax
);
711 C
= strCopy(C
, d
->d_name
, NameMax
- (C
- FindBuf
));
713 uint32_t Ret
= __readlink(FindBuf
, TargetPath
, sizeof(TargetPath
));
714 assert(Ret
!= -1 && Ret
!= BufSize
, "readlink error");
715 TargetPath
[Ret
] = '\0';
722 ProfileWriterContext
readDescriptions() {
723 ProfileWriterContext Result
;
724 char *BinPath
= getBinaryPath();
725 assert(BinPath
&& BinPath
[0] != '\0', "failed to find binary path");
727 uint64_t FD
= __open(BinPath
, O_RDONLY
,
729 assert(static_cast<int64_t>(FD
) >= 0, "failed to open binary path");
731 Result
.FileDesc
= FD
;
733 // mmap our binary to memory
734 uint64_t Size
= __lseek(FD
, 0, SEEK_END
);
735 uint8_t *BinContents
= reinterpret_cast<uint8_t *>(
736 __mmap(0, Size
, PROT_READ
, MAP_PRIVATE
, FD
, 0));
737 assert(BinContents
!= MAP_FAILED
, "readDescriptions: Failed to mmap self!");
738 Result
.MMapPtr
= BinContents
;
739 Result
.MMapSize
= Size
;
740 Elf64_Ehdr
*Hdr
= reinterpret_cast<Elf64_Ehdr
*>(BinContents
);
741 Elf64_Shdr
*Shdr
= reinterpret_cast<Elf64_Shdr
*>(BinContents
+ Hdr
->e_shoff
);
742 Elf64_Shdr
*StringTblHeader
= reinterpret_cast<Elf64_Shdr
*>(
743 BinContents
+ Hdr
->e_shoff
+ Hdr
->e_shstrndx
* Hdr
->e_shentsize
);
745 // Find .bolt.instr.tables with the data we need and set pointers to it
746 for (int I
= 0; I
< Hdr
->e_shnum
; ++I
) {
747 char *SecName
= reinterpret_cast<char *>(
748 BinContents
+ StringTblHeader
->sh_offset
+ Shdr
->sh_name
);
749 if (compareStr(SecName
, ".bolt.instr.tables", 64) != 0) {
750 Shdr
= reinterpret_cast<Elf64_Shdr
*>(BinContents
+ Hdr
->e_shoff
+
751 (I
+ 1) * Hdr
->e_shentsize
);
754 // Actual contents of the ELF note start after offset 20 decimal:
755 // Offset 0: Producer name size (4 bytes)
756 // Offset 4: Contents size (4 bytes)
757 // Offset 8: Note type (4 bytes)
758 // Offset 12: Producer name (BOLT\0) (5 bytes + align to 4-byte boundary)
759 // Offset 20: Contents
760 uint32_t IndCallDescSize
=
761 *reinterpret_cast<uint32_t *>(BinContents
+ Shdr
->sh_offset
+ 20);
762 uint32_t IndCallTargetDescSize
= *reinterpret_cast<uint32_t *>(
763 BinContents
+ Shdr
->sh_offset
+ 24 + IndCallDescSize
);
764 uint32_t FuncDescSize
=
765 *reinterpret_cast<uint32_t *>(BinContents
+ Shdr
->sh_offset
+ 28 +
766 IndCallDescSize
+ IndCallTargetDescSize
);
767 Result
.IndCallDescriptions
= reinterpret_cast<IndCallDescription
*>(
768 BinContents
+ Shdr
->sh_offset
+ 24);
769 Result
.IndCallTargets
= reinterpret_cast<IndCallTargetDescription
*>(
770 BinContents
+ Shdr
->sh_offset
+ 28 + IndCallDescSize
);
771 Result
.FuncDescriptions
= BinContents
+ Shdr
->sh_offset
+ 32 +
772 IndCallDescSize
+ IndCallTargetDescSize
;
773 Result
.Strings
= reinterpret_cast<char *>(
774 BinContents
+ Shdr
->sh_offset
+ 32 + IndCallDescSize
+
775 IndCallTargetDescSize
+ FuncDescSize
);
778 const char ErrMsg
[] =
779 "BOLT instrumentation runtime error: could not find section "
780 ".bolt.instr.tables\n";
781 reportError(ErrMsg
, sizeof(ErrMsg
));
787 ProfileWriterContext
readDescriptions() {
788 ProfileWriterContext Result
;
789 uint8_t *Tables
= _bolt_instr_tables_getter();
790 uint32_t IndCallDescSize
= *reinterpret_cast<uint32_t *>(Tables
);
791 uint32_t IndCallTargetDescSize
=
792 *reinterpret_cast<uint32_t *>(Tables
+ 4 + IndCallDescSize
);
793 uint32_t FuncDescSize
= *reinterpret_cast<uint32_t *>(
794 Tables
+ 8 + IndCallDescSize
+ IndCallTargetDescSize
);
795 Result
.IndCallDescriptions
=
796 reinterpret_cast<IndCallDescription
*>(Tables
+ 4);
797 Result
.IndCallTargets
= reinterpret_cast<IndCallTargetDescription
*>(
798 Tables
+ 8 + IndCallDescSize
);
799 Result
.FuncDescriptions
=
800 Tables
+ 12 + IndCallDescSize
+ IndCallTargetDescSize
;
801 Result
.Strings
= reinterpret_cast<char *>(
802 Tables
+ 12 + IndCallDescSize
+ IndCallTargetDescSize
+ FuncDescSize
);
808 #if !defined(__APPLE__)
809 /// Debug by printing overall metadata global numbers to check it is sane
810 void printStats(const ProfileWriterContext
&Ctx
) {
811 char StatMsg
[BufSize
];
812 char *StatPtr
= StatMsg
;
815 "\nBOLT INSTRUMENTATION RUNTIME STATISTICS\n\nIndCallDescSize: ");
816 StatPtr
= intToStr(StatPtr
,
817 Ctx
.FuncDescriptions
-
818 reinterpret_cast<uint8_t *>(Ctx
.IndCallDescriptions
),
820 StatPtr
= strCopy(StatPtr
, "\nFuncDescSize: ");
823 reinterpret_cast<uint8_t *>(Ctx
.Strings
) - Ctx
.FuncDescriptions
, 10);
824 StatPtr
= strCopy(StatPtr
, "\n__bolt_instr_num_ind_calls: ");
825 StatPtr
= intToStr(StatPtr
, __bolt_instr_num_ind_calls
, 10);
826 StatPtr
= strCopy(StatPtr
, "\n__bolt_instr_num_funcs: ");
827 StatPtr
= intToStr(StatPtr
, __bolt_instr_num_funcs
, 10);
828 StatPtr
= strCopy(StatPtr
, "\n");
829 __write(2, StatMsg
, StatPtr
- StatMsg
);
834 /// This is part of a simple CFG representation in memory, where we store
835 /// a dynamically sized array of input and output edges per node, and store
836 /// a dynamically sized array of nodes per graph. We also store the spanning
837 /// tree edges for that CFG in a separate array of nodes in
838 /// \p SpanningTreeNodes, while the regular nodes live in \p CFGNodes.
840 uint32_t Node
; // Index in nodes array regarding the destination of this edge
841 uint32_t ID
; // Edge index in an array comprising all edges of the graph
844 /// A regular graph node or a spanning tree node
846 uint32_t NumInEdges
{0}; // Input edge count used to size InEdge
847 uint32_t NumOutEdges
{0}; // Output edge count used to size OutEdges
848 Edge
*InEdges
{nullptr}; // Created and managed by \p Graph
849 Edge
*OutEdges
{nullptr}; // ditto
852 /// Main class for CFG representation in memory. Manages object creation and
853 /// destruction, populates an array of CFG nodes as well as corresponding
854 /// spanning tree nodes.
858 Node
*SpanningTreeNodes
;
861 BumpPtrAllocator
&Alloc
;
862 const FunctionDescription
&D
;
864 /// Reads a list of edges from function description \p D and builds
865 /// the graph from it. Allocates several internal dynamic structures that are
866 /// later destroyed by ~Graph() and uses \p Alloc. D.LeafNodes contain all
867 /// spanning tree leaf nodes descriptions (their counters). They are the seed
868 /// used to compute the rest of the missing edge counts in a bottom-up
869 /// traversal of the spanning tree.
870 Graph(BumpPtrAllocator
&Alloc
, const FunctionDescription
&D
,
871 const uint64_t *Counters
, ProfileWriterContext
&Ctx
);
876 void computeEdgeFrequencies(const uint64_t *Counters
,
877 ProfileWriterContext
&Ctx
);
878 void dumpEdgeFreqs() const;
881 Graph::Graph(BumpPtrAllocator
&Alloc
, const FunctionDescription
&D
,
882 const uint64_t *Counters
, ProfileWriterContext
&Ctx
)
883 : Alloc(Alloc
), D(D
) {
884 DEBUG(reportNumber("G = 0x", (uint64_t)this, 16));
885 // First pass to determine number of nodes
886 int32_t MaxNodes
= -1;
889 for (int I
= 0; I
< D
.NumEdges
; ++I
) {
890 if (static_cast<int32_t>(D
.Edges
[I
].FromNode
) > MaxNodes
)
891 MaxNodes
= D
.Edges
[I
].FromNode
;
892 if (static_cast<int32_t>(D
.Edges
[I
].ToNode
) > MaxNodes
)
893 MaxNodes
= D
.Edges
[I
].ToNode
;
896 for (int I
= 0; I
< D
.NumLeafNodes
; ++I
)
897 if (static_cast<int32_t>(D
.LeafNodes
[I
].Node
) > MaxNodes
)
898 MaxNodes
= D
.LeafNodes
[I
].Node
;
900 for (int I
= 0; I
< D
.NumCalls
; ++I
)
901 if (static_cast<int32_t>(D
.Calls
[I
].FromNode
) > MaxNodes
)
902 MaxNodes
= D
.Calls
[I
].FromNode
;
904 // No nodes? Nothing to do
906 DEBUG(report("No nodes!\n"));
908 SpanningTreeNodes
= nullptr;
913 DEBUG(reportNumber("NumNodes = ", MaxNodes
, 10));
914 NumNodes
= static_cast<uint32_t>(MaxNodes
);
916 // Initial allocations
917 CFGNodes
= new (Alloc
) Node
[MaxNodes
];
919 DEBUG(reportNumber("G->CFGNodes = 0x", (uint64_t)CFGNodes
, 16));
920 SpanningTreeNodes
= new (Alloc
) Node
[MaxNodes
];
921 DEBUG(reportNumber("G->SpanningTreeNodes = 0x",
922 (uint64_t)SpanningTreeNodes
, 16));
924 // Figure out how much to allocate to each vector (in/out edge sets)
925 for (int I
= 0; I
< D
.NumEdges
; ++I
) {
926 CFGNodes
[D
.Edges
[I
].FromNode
].NumOutEdges
++;
927 CFGNodes
[D
.Edges
[I
].ToNode
].NumInEdges
++;
928 if (D
.Edges
[I
].Counter
!= 0xffffffff)
931 SpanningTreeNodes
[D
.Edges
[I
].FromNode
].NumOutEdges
++;
932 SpanningTreeNodes
[D
.Edges
[I
].ToNode
].NumInEdges
++;
935 // Allocate in/out edge sets
936 for (int I
= 0; I
< MaxNodes
; ++I
) {
937 if (CFGNodes
[I
].NumInEdges
> 0)
938 CFGNodes
[I
].InEdges
= new (Alloc
) Edge
[CFGNodes
[I
].NumInEdges
];
939 if (CFGNodes
[I
].NumOutEdges
> 0)
940 CFGNodes
[I
].OutEdges
= new (Alloc
) Edge
[CFGNodes
[I
].NumOutEdges
];
941 if (SpanningTreeNodes
[I
].NumInEdges
> 0)
942 SpanningTreeNodes
[I
].InEdges
=
943 new (Alloc
) Edge
[SpanningTreeNodes
[I
].NumInEdges
];
944 if (SpanningTreeNodes
[I
].NumOutEdges
> 0)
945 SpanningTreeNodes
[I
].OutEdges
=
946 new (Alloc
) Edge
[SpanningTreeNodes
[I
].NumOutEdges
];
947 CFGNodes
[I
].NumInEdges
= 0;
948 CFGNodes
[I
].NumOutEdges
= 0;
949 SpanningTreeNodes
[I
].NumInEdges
= 0;
950 SpanningTreeNodes
[I
].NumOutEdges
= 0;
953 // Fill in/out edge sets
954 for (int I
= 0; I
< D
.NumEdges
; ++I
) {
955 const uint32_t Src
= D
.Edges
[I
].FromNode
;
956 const uint32_t Dst
= D
.Edges
[I
].ToNode
;
957 Edge
*E
= &CFGNodes
[Src
].OutEdges
[CFGNodes
[Src
].NumOutEdges
++];
961 E
= &CFGNodes
[Dst
].InEdges
[CFGNodes
[Dst
].NumInEdges
++];
965 if (D
.Edges
[I
].Counter
!= 0xffffffff)
968 E
= &SpanningTreeNodes
[Src
]
969 .OutEdges
[SpanningTreeNodes
[Src
].NumOutEdges
++];
973 E
= &SpanningTreeNodes
[Dst
]
974 .InEdges
[SpanningTreeNodes
[Dst
].NumInEdges
++];
979 computeEdgeFrequencies(Counters
, Ctx
);
984 Alloc
.deallocate(CallFreqs
);
986 Alloc
.deallocate(EdgeFreqs
);
987 for (int I
= NumNodes
- 1; I
>= 0; --I
) {
988 if (SpanningTreeNodes
[I
].OutEdges
)
989 Alloc
.deallocate(SpanningTreeNodes
[I
].OutEdges
);
990 if (SpanningTreeNodes
[I
].InEdges
)
991 Alloc
.deallocate(SpanningTreeNodes
[I
].InEdges
);
992 if (CFGNodes
[I
].OutEdges
)
993 Alloc
.deallocate(CFGNodes
[I
].OutEdges
);
994 if (CFGNodes
[I
].InEdges
)
995 Alloc
.deallocate(CFGNodes
[I
].InEdges
);
997 if (SpanningTreeNodes
)
998 Alloc
.deallocate(SpanningTreeNodes
);
1000 Alloc
.deallocate(CFGNodes
);
1003 void Graph::dump() const {
1004 reportNumber("Dumping graph with number of nodes: ", NumNodes
, 10);
1005 report(" Full graph:\n");
1006 for (int I
= 0; I
< NumNodes
; ++I
) {
1007 const Node
*N
= &CFGNodes
[I
];
1008 reportNumber(" Node #", I
, 10);
1009 reportNumber(" InEdges total ", N
->NumInEdges
, 10);
1010 for (int J
= 0; J
< N
->NumInEdges
; ++J
)
1011 reportNumber(" ", N
->InEdges
[J
].Node
, 10);
1012 reportNumber(" OutEdges total ", N
->NumOutEdges
, 10);
1013 for (int J
= 0; J
< N
->NumOutEdges
; ++J
)
1014 reportNumber(" ", N
->OutEdges
[J
].Node
, 10);
1017 report(" Spanning tree:\n");
1018 for (int I
= 0; I
< NumNodes
; ++I
) {
1019 const Node
*N
= &SpanningTreeNodes
[I
];
1020 reportNumber(" Node #", I
, 10);
1021 reportNumber(" InEdges total ", N
->NumInEdges
, 10);
1022 for (int J
= 0; J
< N
->NumInEdges
; ++J
)
1023 reportNumber(" ", N
->InEdges
[J
].Node
, 10);
1024 reportNumber(" OutEdges total ", N
->NumOutEdges
, 10);
1025 for (int J
= 0; J
< N
->NumOutEdges
; ++J
)
1026 reportNumber(" ", N
->OutEdges
[J
].Node
, 10);
1031 void Graph::dumpEdgeFreqs() const {
1033 "Dumping edge frequencies for graph with num edges: ", D
.NumEdges
, 10);
1034 for (int I
= 0; I
< D
.NumEdges
; ++I
) {
1035 reportNumber("* Src: ", D
.Edges
[I
].FromNode
, 10);
1036 reportNumber(" Dst: ", D
.Edges
[I
].ToNode
, 10);
1037 reportNumber(" Cnt: ", EdgeFreqs
[I
], 10);
1041 /// Auxiliary map structure for fast lookups of which calls map to each node of
1042 /// the function CFG
1043 struct NodeToCallsMap
{
1049 BumpPtrAllocator
&Alloc
;
1050 const uint32_t NumNodes
;
1052 NodeToCallsMap(BumpPtrAllocator
&Alloc
, const FunctionDescription
&D
,
1054 : Alloc(Alloc
), NumNodes(NumNodes
) {
1055 Entries
= new (Alloc
, 0) MapEntry
[NumNodes
];
1056 for (int I
= 0; I
< D
.NumCalls
; ++I
) {
1057 DEBUG(reportNumber("Registering call in node ", D
.Calls
[I
].FromNode
, 10));
1058 ++Entries
[D
.Calls
[I
].FromNode
].NumCalls
;
1060 for (int I
= 0; I
< NumNodes
; ++I
) {
1061 Entries
[I
].Calls
= Entries
[I
].NumCalls
? new (Alloc
)
1062 uint32_t[Entries
[I
].NumCalls
]
1064 Entries
[I
].NumCalls
= 0;
1066 for (int I
= 0; I
< D
.NumCalls
; ++I
) {
1067 MapEntry
&Entry
= Entries
[D
.Calls
[I
].FromNode
];
1068 Entry
.Calls
[Entry
.NumCalls
++] = I
;
1072 /// Set the frequency of all calls in node \p NodeID to Freq. However, if
1073 /// the calls have their own counters and do not depend on the basic block
1074 /// counter, this means they have landing pads and throw exceptions. In this
1075 /// case, set their frequency with their counters and return the maximum
1076 /// value observed in such counters. This will be used as the new frequency
1077 /// at basic block entry. This is used to fix the CFG edge frequencies in the
1078 /// presence of exceptions.
1079 uint64_t visitAllCallsIn(uint32_t NodeID
, uint64_t Freq
, uint64_t *CallFreqs
,
1080 const FunctionDescription
&D
,
1081 const uint64_t *Counters
,
1082 ProfileWriterContext
&Ctx
) const {
1083 const MapEntry
&Entry
= Entries
[NodeID
];
1084 uint64_t MaxValue
= 0ull;
1085 for (int I
= 0, E
= Entry
.NumCalls
; I
!= E
; ++I
) {
1086 const uint32_t CallID
= Entry
.Calls
[I
];
1087 DEBUG(reportNumber(" Setting freq for call ID: ", CallID
, 10));
1088 const CallDescription
&CallDesc
= D
.Calls
[CallID
];
1089 if (CallDesc
.Counter
== 0xffffffff) {
1090 CallFreqs
[CallID
] = Freq
;
1091 DEBUG(reportNumber(" with : ", Freq
, 10));
1093 const uint64_t CounterVal
= Counters
[CallDesc
.Counter
];
1094 CallFreqs
[CallID
] = CounterVal
;
1095 MaxValue
= CounterVal
> MaxValue
? CounterVal
: MaxValue
;
1096 DEBUG(reportNumber(" with (private counter) : ", CounterVal
, 10));
1098 DEBUG(reportNumber(" Address: 0x", CallDesc
.TargetAddress
, 16));
1099 if (CallFreqs
[CallID
] > 0)
1100 Ctx
.CallFlowTable
->get(CallDesc
.TargetAddress
).Calls
+=
1107 for (int I
= NumNodes
- 1; I
>= 0; --I
)
1108 if (Entries
[I
].Calls
)
1109 Alloc
.deallocate(Entries
[I
].Calls
);
1110 Alloc
.deallocate(Entries
);
1114 /// Fill an array with the frequency of each edge in the function represented
1115 /// by G, as well as another array for each call.
1116 void Graph::computeEdgeFrequencies(const uint64_t *Counters
,
1117 ProfileWriterContext
&Ctx
) {
1121 EdgeFreqs
= D
.NumEdges
? new (Alloc
, 0) uint64_t [D
.NumEdges
] : nullptr;
1122 CallFreqs
= D
.NumCalls
? new (Alloc
, 0) uint64_t [D
.NumCalls
] : nullptr;
1124 // Setup a lookup for calls present in each node (BB)
1125 NodeToCallsMap
*CallMap
= new (Alloc
) NodeToCallsMap(Alloc
, D
, NumNodes
);
1127 // Perform a bottom-up, BFS traversal of the spanning tree in G. Edges in the
1128 // spanning tree don't have explicit counters. We must infer their value using
1129 // a linear combination of other counters (sum of counters of the outgoing
1130 // edges minus sum of counters of the incoming edges).
1131 uint32_t *Stack
= new (Alloc
) uint32_t [NumNodes
];
1132 uint32_t StackTop
= 0;
1133 enum Status
: uint8_t { S_NEW
= 0, S_VISITING
, S_VISITED
};
1134 Status
*Visited
= new (Alloc
, 0) Status
[NumNodes
];
1135 uint64_t *LeafFrequency
= new (Alloc
, 0) uint64_t[NumNodes
];
1136 uint64_t *EntryAddress
= new (Alloc
, 0) uint64_t[NumNodes
];
1138 // Setup a fast lookup for frequency of leaf nodes, which have special
1139 // basic block frequency instrumentation (they are not edge profiled).
1140 for (int I
= 0; I
< D
.NumLeafNodes
; ++I
) {
1141 LeafFrequency
[D
.LeafNodes
[I
].Node
] = Counters
[D
.LeafNodes
[I
].Counter
];
1143 if (Counters
[D
.LeafNodes
[I
].Counter
] > 0) {
1144 reportNumber("Leaf Node# ", D
.LeafNodes
[I
].Node
, 10);
1145 reportNumber(" Counter: ", Counters
[D
.LeafNodes
[I
].Counter
], 10);
1149 for (int I
= 0; I
< D
.NumEntryNodes
; ++I
) {
1150 EntryAddress
[D
.EntryNodes
[I
].Node
] = D
.EntryNodes
[I
].Address
;
1152 reportNumber("Entry Node# ", D
.EntryNodes
[I
].Node
, 10);
1153 reportNumber(" Address: ", D
.EntryNodes
[I
].Address
, 16);
1156 // Add all root nodes to the stack
1157 for (int I
= 0; I
< NumNodes
; ++I
)
1158 if (SpanningTreeNodes
[I
].NumInEdges
== 0)
1159 Stack
[StackTop
++] = I
;
1162 if (StackTop
== 0) {
1163 DEBUG(report("Empty stack!\n"));
1164 Alloc
.deallocate(EntryAddress
);
1165 Alloc
.deallocate(LeafFrequency
);
1166 Alloc
.deallocate(Visited
);
1167 Alloc
.deallocate(Stack
);
1168 CallMap
->~NodeToCallsMap();
1169 Alloc
.deallocate(CallMap
);
1171 Alloc
.deallocate(CallFreqs
);
1173 Alloc
.deallocate(EdgeFreqs
);
1174 EdgeFreqs
= nullptr;
1175 CallFreqs
= nullptr;
1178 // Add all known edge counts, will infer the rest
1179 for (int I
= 0; I
< D
.NumEdges
; ++I
) {
1180 const uint32_t C
= D
.Edges
[I
].Counter
;
1181 if (C
== 0xffffffff) // inferred counter - we will compute its value
1183 EdgeFreqs
[I
] = Counters
[C
];
1186 while (StackTop
> 0) {
1187 const uint32_t Cur
= Stack
[--StackTop
];
1189 if (Visited
[Cur
] == S_VISITING
)
1190 report("(visiting) ");
1193 reportNumber("Cur: ", Cur
, 10);
1196 // This shouldn't happen in a tree
1197 assert(Visited
[Cur
] != S_VISITED
, "should not have visited nodes in stack");
1198 if (Visited
[Cur
] == S_NEW
) {
1199 Visited
[Cur
] = S_VISITING
;
1200 Stack
[StackTop
++] = Cur
;
1201 assert(StackTop
<= NumNodes
, "stack grew too large");
1202 for (int I
= 0, E
= SpanningTreeNodes
[Cur
].NumOutEdges
; I
< E
; ++I
) {
1203 const uint32_t Succ
= SpanningTreeNodes
[Cur
].OutEdges
[I
].Node
;
1204 Stack
[StackTop
++] = Succ
;
1205 assert(StackTop
<= NumNodes
, "stack grew too large");
1209 Visited
[Cur
] = S_VISITED
;
1211 // Establish our node frequency based on outgoing edges, which should all be
1213 int64_t CurNodeFreq
= LeafFrequency
[Cur
];
1216 for (int I
= 0, E
= CFGNodes
[Cur
].NumOutEdges
; I
!= E
; ++I
) {
1217 const uint32_t SuccEdge
= CFGNodes
[Cur
].OutEdges
[I
].ID
;
1218 CurNodeFreq
+= EdgeFreqs
[SuccEdge
];
1221 if (CurNodeFreq
< 0)
1224 const uint64_t CallFreq
= CallMap
->visitAllCallsIn(
1225 Cur
, CurNodeFreq
> 0 ? CurNodeFreq
: 0, CallFreqs
, D
, Counters
, Ctx
);
1227 // Exception handling affected our output flow? Fix with calls info
1229 if (CallFreq
> CurNodeFreq
)
1230 report("Bumping node frequency with call info\n");
1232 CurNodeFreq
= CallFreq
> CurNodeFreq
? CallFreq
: CurNodeFreq
;
1234 if (CurNodeFreq
> 0) {
1235 if (uint64_t Addr
= EntryAddress
[Cur
]) {
1237 reportNumber(" Setting flow at entry point address 0x", Addr
, 16));
1238 DEBUG(reportNumber(" with: ", CurNodeFreq
, 10));
1239 Ctx
.CallFlowTable
->get(Addr
).Val
= CurNodeFreq
;
1243 // No parent? Reached a tree root, limit to call frequency updating.
1244 if (SpanningTreeNodes
[Cur
].NumInEdges
== 0)
1247 assert(SpanningTreeNodes
[Cur
].NumInEdges
== 1, "must have 1 parent");
1248 const uint32_t ParentEdge
= SpanningTreeNodes
[Cur
].InEdges
[0].ID
;
1250 // Calculate parent edge freq.
1251 int64_t ParentEdgeFreq
= CurNodeFreq
;
1252 for (int I
= 0, E
= CFGNodes
[Cur
].NumInEdges
; I
!= E
; ++I
) {
1253 const uint32_t PredEdge
= CFGNodes
[Cur
].InEdges
[I
].ID
;
1254 ParentEdgeFreq
-= EdgeFreqs
[PredEdge
];
1257 // Sometimes the conservative CFG that BOLT builds will lead to incorrect
1258 // flow computation. For example, in a BB that transitively calls the exit
1259 // syscall, BOLT will add a fall-through successor even though it should not
1260 // have any successors. So this block execution will likely be wrong. We
1261 // tolerate this imperfection since this case should be quite infrequent.
1262 if (ParentEdgeFreq
< 0) {
1263 DEBUG(dumpEdgeFreqs());
1264 DEBUG(report("WARNING: incorrect flow"));
1267 DEBUG(reportNumber(" Setting freq for ParentEdge: ", ParentEdge
, 10));
1268 DEBUG(reportNumber(" with ParentEdgeFreq: ", ParentEdgeFreq
, 10));
1269 EdgeFreqs
[ParentEdge
] = ParentEdgeFreq
;
1272 Alloc
.deallocate(EntryAddress
);
1273 Alloc
.deallocate(LeafFrequency
);
1274 Alloc
.deallocate(Visited
);
1275 Alloc
.deallocate(Stack
);
1276 CallMap
->~NodeToCallsMap();
1277 Alloc
.deallocate(CallMap
);
1278 DEBUG(dumpEdgeFreqs());
1281 /// Write to \p FD all of the edge profiles for function \p FuncDesc. Uses
1282 /// \p Alloc to allocate helper dynamic structures used to compute profile for
1283 /// edges that we do not explicitly instrument.
1284 const uint8_t *writeFunctionProfile(int FD
, ProfileWriterContext
&Ctx
,
1285 const uint8_t *FuncDesc
,
1286 BumpPtrAllocator
&Alloc
) {
1287 const FunctionDescription
F(FuncDesc
);
1288 const uint8_t *next
= FuncDesc
+ F
.getSize();
1290 #if !defined(__APPLE__)
1291 uint64_t *bolt_instr_locations
= __bolt_instr_locations
;
1293 uint64_t *bolt_instr_locations
= _bolt_instr_locations_getter();
1296 // Skip funcs we know are cold
1297 #ifndef ENABLE_DEBUG
1298 uint64_t CountersFreq
= 0;
1299 for (int I
= 0; I
< F
.NumLeafNodes
; ++I
)
1300 CountersFreq
+= bolt_instr_locations
[F
.LeafNodes
[I
].Counter
];
1302 if (CountersFreq
== 0) {
1303 for (int I
= 0; I
< F
.NumEdges
; ++I
) {
1304 const uint32_t C
= F
.Edges
[I
].Counter
;
1305 if (C
== 0xffffffff)
1307 CountersFreq
+= bolt_instr_locations
[C
];
1309 if (CountersFreq
== 0) {
1310 for (int I
= 0; I
< F
.NumCalls
; ++I
) {
1311 const uint32_t C
= F
.Calls
[I
].Counter
;
1312 if (C
== 0xffffffff)
1314 CountersFreq
+= bolt_instr_locations
[C
];
1316 if (CountersFreq
== 0)
1322 Graph
*G
= new (Alloc
) Graph(Alloc
, F
, bolt_instr_locations
, Ctx
);
1325 if (!G
->EdgeFreqs
&& !G
->CallFreqs
) {
1327 Alloc
.deallocate(G
);
1331 for (int I
= 0; I
< F
.NumEdges
; ++I
) {
1332 const uint64_t Freq
= G
->EdgeFreqs
[I
];
1335 const EdgeDescription
*Desc
= &F
.Edges
[I
];
1336 char LineBuf
[BufSize
];
1337 char *Ptr
= LineBuf
;
1338 Ptr
= serializeLoc(Ctx
, Ptr
, Desc
->From
, BufSize
);
1339 Ptr
= serializeLoc(Ctx
, Ptr
, Desc
->To
, BufSize
- (Ptr
- LineBuf
));
1340 Ptr
= strCopy(Ptr
, "0 ", BufSize
- (Ptr
- LineBuf
) - 22);
1341 Ptr
= intToStr(Ptr
, Freq
, 10);
1343 __write(FD
, LineBuf
, Ptr
- LineBuf
);
1346 for (int I
= 0; I
< F
.NumCalls
; ++I
) {
1347 const uint64_t Freq
= G
->CallFreqs
[I
];
1350 char LineBuf
[BufSize
];
1351 char *Ptr
= LineBuf
;
1352 const CallDescription
*Desc
= &F
.Calls
[I
];
1353 Ptr
= serializeLoc(Ctx
, Ptr
, Desc
->From
, BufSize
);
1354 Ptr
= serializeLoc(Ctx
, Ptr
, Desc
->To
, BufSize
- (Ptr
- LineBuf
));
1355 Ptr
= strCopy(Ptr
, "0 ", BufSize
- (Ptr
- LineBuf
) - 25);
1356 Ptr
= intToStr(Ptr
, Freq
, 10);
1358 __write(FD
, LineBuf
, Ptr
- LineBuf
);
1362 Alloc
.deallocate(G
);
1366 #if !defined(__APPLE__)
1367 const IndCallTargetDescription
*
1368 ProfileWriterContext::lookupIndCallTarget(uint64_t Target
) const {
1370 uint32_t E
= __bolt_instr_num_ind_targets
;
1374 uint32_t I
= (E
- B
) / 2 + B
;
1375 if (IndCallTargets
[I
].Address
== Target
)
1376 return &IndCallTargets
[I
];
1377 if (IndCallTargets
[I
].Address
< Target
)
1385 /// Write a single indirect call <src, target> pair to the fdata file
1386 void visitIndCallCounter(IndirectCallHashTable::MapEntry
&Entry
,
1387 int FD
, int CallsiteID
,
1388 ProfileWriterContext
*Ctx
) {
1391 DEBUG(reportNumber("Target func 0x", Entry
.Key
, 16));
1392 DEBUG(reportNumber("Target freq: ", Entry
.Val
, 10));
1393 const IndCallDescription
*CallsiteDesc
=
1394 &Ctx
->IndCallDescriptions
[CallsiteID
];
1395 const IndCallTargetDescription
*TargetDesc
=
1396 Ctx
->lookupIndCallTarget(Entry
.Key
- TextBaseAddress
);
1398 DEBUG(report("Failed to lookup indirect call target\n"));
1399 char LineBuf
[BufSize
];
1400 char *Ptr
= LineBuf
;
1401 Ptr
= serializeLoc(*Ctx
, Ptr
, *CallsiteDesc
, BufSize
);
1402 Ptr
= strCopy(Ptr
, "0 [unknown] 0 0 ", BufSize
- (Ptr
- LineBuf
) - 40);
1403 Ptr
= intToStr(Ptr
, Entry
.Val
, 10);
1405 __write(FD
, LineBuf
, Ptr
- LineBuf
);
1408 Ctx
->CallFlowTable
->get(TargetDesc
->Address
).Calls
+= Entry
.Val
;
1409 char LineBuf
[BufSize
];
1410 char *Ptr
= LineBuf
;
1411 Ptr
= serializeLoc(*Ctx
, Ptr
, *CallsiteDesc
, BufSize
);
1412 Ptr
= serializeLoc(*Ctx
, Ptr
, TargetDesc
->Loc
, BufSize
- (Ptr
- LineBuf
));
1413 Ptr
= strCopy(Ptr
, "0 ", BufSize
- (Ptr
- LineBuf
) - 25);
1414 Ptr
= intToStr(Ptr
, Entry
.Val
, 10);
1416 __write(FD
, LineBuf
, Ptr
- LineBuf
);
1419 /// Write to \p FD all of the indirect call profiles.
1420 void writeIndirectCallProfile(int FD
, ProfileWriterContext
&Ctx
) {
1421 for (int I
= 0; I
< __bolt_instr_num_ind_calls
; ++I
) {
1422 DEBUG(reportNumber("IndCallsite #", I
, 10));
1423 GlobalIndCallCounters
[I
].forEachElement(visitIndCallCounter
, FD
, I
, &Ctx
);
1427 /// Check a single call flow for a callee versus all known callers. If there are
1428 /// less callers than what the callee expects, write the difference with source
1429 /// [unknown] in the profile.
1430 void visitCallFlowEntry(CallFlowHashTable::MapEntry
&Entry
, int FD
,
1431 ProfileWriterContext
*Ctx
) {
1432 DEBUG(reportNumber("Call flow entry address: 0x", Entry
.Key
, 16));
1433 DEBUG(reportNumber("Calls: ", Entry
.Calls
, 10));
1434 DEBUG(reportNumber("Reported entry frequency: ", Entry
.Val
, 10));
1436 if (Entry
.Calls
> Entry
.Val
)
1437 report(" More calls than expected!\n");
1439 if (Entry
.Val
<= Entry
.Calls
)
1442 " Balancing calls with traffic: ", Entry
.Val
- Entry
.Calls
, 10));
1443 const IndCallTargetDescription
*TargetDesc
=
1444 Ctx
->lookupIndCallTarget(Entry
.Key
);
1446 // There is probably something wrong with this callee and this should be
1447 // investigated, but I don't want to assert and lose all data collected.
1448 DEBUG(report("WARNING: failed to look up call target!\n"));
1451 char LineBuf
[BufSize
];
1452 char *Ptr
= LineBuf
;
1453 Ptr
= strCopy(Ptr
, "0 [unknown] 0 ", BufSize
);
1454 Ptr
= serializeLoc(*Ctx
, Ptr
, TargetDesc
->Loc
, BufSize
- (Ptr
- LineBuf
));
1455 Ptr
= strCopy(Ptr
, "0 ", BufSize
- (Ptr
- LineBuf
) - 25);
1456 Ptr
= intToStr(Ptr
, Entry
.Val
- Entry
.Calls
, 10);
1458 __write(FD
, LineBuf
, Ptr
- LineBuf
);
1461 /// Open fdata file for writing and return a valid file descriptor, aborting
1462 /// program upon failure.
1464 // Build the profile name string by appending our PID
1466 uint64_t PID
= __getpid();
1467 char *Ptr
= strCopy(Buf
, __bolt_instr_filename
, BufSize
);
1468 if (__bolt_instr_use_pid
) {
1469 Ptr
= strCopy(Ptr
, ".", BufSize
- (Ptr
- Buf
+ 1));
1470 Ptr
= intToStr(Ptr
, PID
, 10);
1471 Ptr
= strCopy(Ptr
, ".fdata", BufSize
- (Ptr
- Buf
+ 1));
1474 uint64_t FD
= __open(Buf
, O_WRONLY
| O_TRUNC
| O_CREAT
,
1476 if (static_cast<int64_t>(FD
) < 0) {
1477 report("Error while trying to open profile file for writing: ");
1479 reportNumber("\nFailed with error number: 0x",
1480 0 - static_cast<int64_t>(FD
), 16);
1488 } // anonymous namespace
1490 #if !defined(__APPLE__)
1492 /// Reset all counters in case you want to start profiling a new phase of your
1493 /// program independently of prior phases.
1494 /// The address of this function is printed by BOLT and this can be called by
1495 /// any attached debugger during runtime. There is a useful oneliner for gdb:
1497 /// gdb -p $(pgrep -xo PROCESSNAME) -ex 'p ((void(*)())0xdeadbeef)()' \
1498 /// -ex 'set confirm off' -ex quit
1500 /// Where 0xdeadbeef is this function address and PROCESSNAME your binary file
1502 extern "C" void __bolt_instr_clear_counters() {
1503 memset(reinterpret_cast<char *>(__bolt_instr_locations
), 0,
1504 __bolt_num_counters
* 8);
1505 for (int I
= 0; I
< __bolt_instr_num_ind_calls
; ++I
)
1506 GlobalIndCallCounters
[I
].resetCounters();
1509 /// This is the entry point for profile writing.
1510 /// There are three ways of getting here:
1512 /// * Program execution ended, finalization methods are running and BOLT
1513 /// hooked into FINI from your binary dynamic section;
1514 /// * You used the sleep timer option and during initialization we forked
1515 /// a separate process that will call this function periodically;
1516 /// * BOLT prints this function address so you can attach a debugger and
1517 /// call this function directly to get your profile written to disk
1520 extern "C" void __attribute((force_align_arg_pointer
))
1521 __bolt_instr_data_dump(int FD
) {
1523 if (!GlobalWriteProfileMutex
->acquire())
1526 int ret
= __lseek(FD
, 0, SEEK_SET
);
1527 assert(ret
== 0, "Failed to lseek!");
1528 ret
= __ftruncate(FD
, 0);
1529 assert(ret
== 0, "Failed to ftruncate!");
1530 BumpPtrAllocator HashAlloc
;
1531 HashAlloc
.setMaxSize(0x6400000);
1532 ProfileWriterContext Ctx
= readDescriptions();
1533 Ctx
.CallFlowTable
= new (HashAlloc
, 0) CallFlowHashTable(HashAlloc
);
1535 DEBUG(printStats(Ctx
));
1537 BumpPtrAllocator Alloc
;
1538 Alloc
.setMaxSize(0x6400000);
1539 const uint8_t *FuncDesc
= Ctx
.FuncDescriptions
;
1540 for (int I
= 0, E
= __bolt_instr_num_funcs
; I
< E
; ++I
) {
1541 FuncDesc
= writeFunctionProfile(FD
, Ctx
, FuncDesc
, Alloc
);
1543 DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc
, 16));
1545 assert(FuncDesc
== (void *)Ctx
.Strings
,
1546 "FuncDesc ptr must be equal to stringtable");
1548 writeIndirectCallProfile(FD
, Ctx
);
1549 Ctx
.CallFlowTable
->forEachElement(visitCallFlowEntry
, FD
, &Ctx
);
1552 __munmap(Ctx
.MMapPtr
, Ctx
.MMapSize
);
1553 __close(Ctx
.FileDesc
);
1554 HashAlloc
.destroy();
1555 GlobalWriteProfileMutex
->release();
1556 DEBUG(report("Finished writing profile.\n"));
1559 /// Event loop for our child process spawned during setup to dump profile data
1560 /// at user-specified intervals
1561 void watchProcess() {
1563 uint64_t Ellapsed
= 0ull;
1564 int FD
= openProfile();
1566 if (__bolt_instr_wait_forks
) {
1567 // Store parent pgid
1568 ppid
= -__getpgid(0);
1569 // And leave parent process group
1575 // Parent already dead
1576 __bolt_instr_data_dump(FD
);
1584 __nanosleep(&ts
, &rem
);
1585 // This means our parent process or all its forks are dead,
1586 // so no need for us to keep dumping.
1587 if (__kill(ppid
, 0) < 0) {
1588 if (__bolt_instr_no_counters_clear
)
1589 __bolt_instr_data_dump(FD
);
1593 if (++Ellapsed
< __bolt_instr_sleep_time
)
1597 __bolt_instr_data_dump(FD
);
1598 if (__bolt_instr_no_counters_clear
== false)
1599 __bolt_instr_clear_counters();
1603 DEBUG(report("My parent process is dead, bye!\n"));
1608 extern "C" void __bolt_instr_indirect_call();
1609 extern "C" void __bolt_instr_indirect_tailcall();
1611 /// Initialization code
1612 extern "C" void __attribute((force_align_arg_pointer
)) __bolt_instr_setup() {
1613 __bolt_ind_call_counter_func_pointer
= __bolt_instr_indirect_call
;
1614 __bolt_ind_tailcall_counter_func_pointer
= __bolt_instr_indirect_tailcall
;
1615 TextBaseAddress
= getTextBaseAddress();
1617 const uint64_t CountersStart
=
1618 reinterpret_cast<uint64_t>(&__bolt_instr_locations
[0]);
1619 const uint64_t CountersEnd
= alignTo(
1620 reinterpret_cast<uint64_t>(&__bolt_instr_locations
[__bolt_num_counters
]),
1622 DEBUG(reportNumber("replace mmap start: ", CountersStart
, 16));
1623 DEBUG(reportNumber("replace mmap stop: ", CountersEnd
, 16));
1624 assert(CountersEnd
> CountersStart
, "no counters");
1626 const bool Shared
= !__bolt_instr_use_pid
;
1627 const uint64_t MapPrivateOrShared
= Shared
? MAP_SHARED
: MAP_PRIVATE
;
1630 __mmap(CountersStart
, CountersEnd
- CountersStart
, PROT_READ
| PROT_WRITE
,
1631 MAP_ANONYMOUS
| MapPrivateOrShared
| MAP_FIXED
, -1, 0);
1632 assert(Ret
!= MAP_FAILED
, "__bolt_instr_setup: Failed to mmap counters!");
1634 GlobalMetadataStorage
= __mmap(0, 4096, PROT_READ
| PROT_WRITE
,
1635 MapPrivateOrShared
| MAP_ANONYMOUS
, -1, 0);
1636 assert(GlobalMetadataStorage
!= MAP_FAILED
,
1637 "__bolt_instr_setup: failed to mmap page for metadata!");
1639 GlobalAlloc
= new (GlobalMetadataStorage
) BumpPtrAllocator
;
1640 // Conservatively reserve 100MiB
1641 GlobalAlloc
->setMaxSize(0x6400000);
1642 GlobalAlloc
->setShared(Shared
);
1643 GlobalWriteProfileMutex
= new (*GlobalAlloc
, 0) Mutex();
1644 if (__bolt_instr_num_ind_calls
> 0)
1645 GlobalIndCallCounters
=
1646 new (*GlobalAlloc
, 0) IndirectCallHashTable
[__bolt_instr_num_ind_calls
];
1648 if (__bolt_instr_sleep_time
!= 0) {
1649 // Separate instrumented process to the own process group
1650 if (__bolt_instr_wait_forks
)
1653 if (long PID
= __fork())
1659 extern "C" __attribute((force_align_arg_pointer
)) void
1660 instrumentIndirectCall(uint64_t Target
, uint64_t IndCallID
) {
1661 GlobalIndCallCounters
[IndCallID
].incrementVal(Target
, *GlobalAlloc
);
1664 /// We receive as in-stack arguments the identifier of the indirect call site
1665 /// as well as the target address for the call
1666 extern "C" __attribute((naked
)) void __bolt_instr_indirect_call()
1668 #if defined(__aarch64__)
1670 __asm__
__volatile__(SAVE_ALL
1671 "ldp x0, x1, [sp, #288]\n"
1672 "bl instrumentIndirectCall\n"
1679 __asm__
__volatile__(SAVE_ALL
1680 "mov 0xa0(%%rsp), %%rdi\n"
1681 "mov 0x98(%%rsp), %%rsi\n"
1682 "call instrumentIndirectCall\n"
1690 extern "C" __attribute((naked
)) void __bolt_instr_indirect_tailcall()
1692 #if defined(__aarch64__)
1694 __asm__
__volatile__(SAVE_ALL
1695 "ldp x0, x1, [sp, #288]\n"
1696 "bl instrumentIndirectCall\n"
1703 __asm__
__volatile__(SAVE_ALL
1704 "mov 0x98(%%rsp), %%rdi\n"
1705 "mov 0x90(%%rsp), %%rsi\n"
1706 "call instrumentIndirectCall\n"
1714 /// This is hooking ELF's entry, it needs to save all machine state.
1715 extern "C" __attribute((naked
)) void __bolt_instr_start()
1717 #if defined(__aarch64__)
1719 __asm__
__volatile__(SAVE_ALL
1720 "bl __bolt_instr_setup\n"
1722 "adrp x16, __bolt_start_trampoline\n"
1723 "add x16, x16, #:lo12:__bolt_start_trampoline\n"
1729 __asm__
__volatile__(SAVE_ALL
1730 "call __bolt_instr_setup\n"
1732 "jmp __bolt_start_trampoline\n"
1738 /// This is hooking into ELF's DT_FINI
1739 extern "C" void __bolt_instr_fini() {
1740 #if defined(__aarch64__)
1742 __asm__
__volatile__(SAVE_ALL
1743 "adrp x16, __bolt_fini_trampoline\n"
1744 "add x16, x16, #:lo12:__bolt_fini_trampoline\n"
1750 __asm__
__volatile__("call __bolt_fini_trampoline\n" :::);
1752 if (__bolt_instr_sleep_time
== 0) {
1753 int FD
= openProfile();
1754 __bolt_instr_data_dump(FD
);
1757 DEBUG(report("Finished.\n"));
1762 #if defined(__APPLE__)
1764 extern "C" void __bolt_instr_data_dump() {
1765 ProfileWriterContext Ctx
= readDescriptions();
1768 BumpPtrAllocator Alloc
;
1769 const uint8_t *FuncDesc
= Ctx
.FuncDescriptions
;
1770 uint32_t bolt_instr_num_funcs
= _bolt_instr_num_funcs_getter();
1772 for (int I
= 0, E
= bolt_instr_num_funcs
; I
< E
; ++I
) {
1773 FuncDesc
= writeFunctionProfile(FD
, Ctx
, FuncDesc
, Alloc
);
1775 DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc
, 16));
1777 assert(FuncDesc
== (void *)Ctx
.Strings
,
1778 "FuncDesc ptr must be equal to stringtable");
1781 // On OSX/iOS the final symbol name of an extern "C" function/variable contains
1782 // one extra leading underscore: _bolt_instr_setup -> __bolt_instr_setup.
1784 __attribute__((section("__TEXT,__setup")))
1785 __attribute__((force_align_arg_pointer
))
1786 void _bolt_instr_setup() {
1787 __asm__
__volatile__(SAVE_ALL :::);
1791 __asm__
__volatile__(RESTORE_ALL :::);
1795 __attribute__((section("__TEXT,__fini")))
1796 __attribute__((force_align_arg_pointer
))
1797 void _bolt_instr_fini() {
1799 __bolt_instr_data_dump();