//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch.
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

// utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption).
// "scratch contexts" are buffers that we return in certain cases - they are
// large enough to allow for memory safe counter access, but they don't link
// subcontexts below them (the runtime recognizes them and enforces that).
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}
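
// For illustration only: a minimal sketch of the inverse operations on a
// tainted pointer, mirroring the LSB convention above. `isTaintedExample` and
// `untaintExample` are hypothetical helpers, not part of the runtime (the
// real check is `isScratch`, declared in the header).
[[maybe_unused]] inline bool isTaintedExample(const ContextNode *Ctx) {
  return reinterpret_cast<uint64_t>(Ctx) & 1;
}
[[maybe_unused]] inline ContextNode *untaintExample(ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) &
                                         ~uint64_t(1));
}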

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}
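
// For illustration only: consuming a plain local behaves the same way as
// consuming the real TLS slots further down - the value is handed back and
// the slot is reset, so a stale value can't be observed twice.
// `exampleConsume` is a hypothetical helper, not part of the runtime.
[[maybe_unused]] inline int exampleConsume() {
  int Slot = 42;
  int Observed = consume(Slot); // Observed == 42, Slot is now 0
  return Observed + Slot;       // 42
}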

// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
// size.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}
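
// For illustration: with kBuffSize == 1MB, a (hypothetical) request of 5000
// bytes gets the default 1MB arena, while a 3MB request would get a 6MB
// arena, i.e. twice what was asked for.
static_assert(kBuffSize == (1 << 20), "arena pages default to 1MB");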

// verify the structural integrity of the context
bool validate(const ContextRoot *Root) {
  // all contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate that all
  // nonnull subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;
      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NumCounters,
                                     uint32_t NumCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NumCounters, NumCallsites, Next);
}

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}
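
// For illustration only: a hypothetical recursive walk over a context tree,
// mirroring the traversal validate() and resetContextNode() perform - each
// node owns a counters array and, per callsite, a linked list of subcontexts.
// `exampleCountNodes` is not part of the runtime.
[[maybe_unused]] uint64_t exampleCountNodes(const ContextNode &Node) {
  uint64_t Count = 1;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Sub = Node.subContexts()[I]; Sub; Sub = Sub->next())
      Count += exampleCountNodes(*Sub);
  return Count;
}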

void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }

} // namespace

// the scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, a la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
// the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))

__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                 nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;
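
// For illustration only: a sketch of what the compiler-inserted
// instrumentation is assumed to do at a callsite before transferring control
// to the callee. `exampleAnnounceCallsite` and its parameters are
// hypothetical; the real stores are emitted by the instrumentation lowering,
// not written by hand.
[[maybe_unused]] static void
exampleAnnounceCallsite(ContextNode *Ctx, uint32_t CallsiteIdx, void *Target) {
  // Advertise the expected callee and where its subcontext chain hangs off the
  // caller's context; the callee consume()s both slots in
  // __llvm_ctx_profile_get_context.
  __llvm_ctx_profile_expected_callee[0] = Target;
  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[CallsiteIdx];
}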

Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

// If this is the first time we hit a callsite with this (Guid) particular
// callee, we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NumCounters, uint32_t NumCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // if we failed to allocate on the current arena, allocate a new arena,
    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
    // find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NumCounters, NumCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}

ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
                                            uint32_t NumCounters,
                                            uint32_t NumCallsites) {
  // fast "out" if we're not even doing contextual collection.
  if (!__llvm_ctx_profile_current_context_root)
    return TheScratchContext;

  // also fast "out" if the caller is scratch. We can see if it's scratch by
  // looking at the interior pointer into the subcontexts vector that the
  // caller provided: if the context is scratch, so is that interior pointer
  // (because all the address calculations use even - more precisely,
  // 8-aligned - values).
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return TheScratchContext;

  // if the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
  // Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we would lose counter values for
  // the normal subgraph, this time around. That should be very unlikely, but if
  // it happens too frequently, we should be able to detect discrepancies in
  // entry counts (caller-callee). At the moment, the design goes on the
  // assumption that this is so infrequent that it's not worth doing more
  // for that case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return TheScratchContext;

  auto *Callsite = *CallsiteContext;
  // in the case of indirect calls, all the targets seen so far form a linked
  // list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite
                  ? Callsite
                  : getCallsiteSlow(Guid, CallsiteContext, NumCounters,
                                    NumCallsites);
  if (Ret->callsites_size() != NumCallsites ||
      Ret->counters_size() != NumCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NumCallsites,
                        NumCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}

// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters,
                  uint32_t NumCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken the lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NumCounters, NumCallsites);
  AllContextRoots.PushBack(Root);
}

ContextNode *__llvm_ctx_profile_start_context(
    ContextRoot *Root, GUID Guid, uint32_t Counters,
    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}

void __llvm_ctx_profile_release_context(ContextRoot *Root)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (__llvm_ctx_profile_current_context_root) {
    __llvm_ctx_profile_current_context_root = nullptr;
    Root->Taken.Unlock();
  }
}
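
// For illustration only: a minimal sketch of how an instrumented root function
// is assumed to drive the start/release pair above. `ExampleRootStorage`,
// `kExampleGuid` and `exampleRootFunction` are hypothetical; in reality these
// calls are emitted by the instrumentation lowering.
[[maybe_unused]] static ContextRoot ExampleRootStorage;
[[maybe_unused]] static void exampleRootFunction() {
  constexpr GUID kExampleGuid = 0x1234;
  // Either the root's real first node, or the scratch context if another
  // thread already holds this root.
  ContextNode *Ctx = __llvm_ctx_profile_start_context(
      &ExampleRootStorage, kExampleGuid, /*Counters=*/2, /*Callsites=*/1);
  (void)Ctx; // the instrumented body would bump Ctx's counters here
  __llvm_ctx_profile_release_context(&ExampleRootStorage);
}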

void __llvm_ctx_profile_start_collection() {
  size_t NumMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NumMemUnits;

    resetContextNode(*Root->FirstNode);
  }
  __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
}

bool __llvm_ctx_profile_fetch(void *Data,
                              bool (*Writer)(void *W, const ContextNode &)) {
  assert(Writer);
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    if (!Writer(Data, *Root->FirstNode))
      return false;
  }
  return true;
}
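
// For illustration only: a minimal Writer callback a consumer might pass to
// __llvm_ctx_profile_fetch. `exampleCountingWriter` is hypothetical; a real
// writer would serialize each root's tree instead of just counting roots.
[[maybe_unused]] static bool exampleCountingWriter(void *Data,
                                                   const ContextNode &) {
  ++*reinterpret_cast<size_t *>(Data); // Data is the caller's accumulator
  return true;                         // keep fetching the remaining roots
}
// Hypothetical usage:
//   size_t NumRoots = 0;
//   __llvm_ctx_profile_fetch(&NumRoots, exampleCountingWriter);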

void __llvm_ctx_profile_free() {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
      auto *C = A;
      A = A->next();
      __sanitizer::InternalFree(C);
    }
  AllContextRoots.Reset();
}