//===-- xray_function_call_trie.h ------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This file defines the interface for a function call trie.
//
//===----------------------------------------------------------------------===//
#ifndef XRAY_FUNCTION_CALL_TRIE_H
#define XRAY_FUNCTION_CALL_TRIE_H

#include "xray_buffer_queue.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <cstddef> // For std::byte.
#include <limits>  // For std::numeric_limits.
#include <memory>  // For placement new.
#include <utility> // For std::move.

namespace __xray {

/// A FunctionCallTrie represents the stack traces of XRay instrumented
/// functions that we've encountered, where a node corresponds to a function
/// and the path from the root to the node is its stack trace. Each node in
/// the trie will contain some useful values, including:
///
///   * The cumulative amount of time spent in this particular node/stack.
///   * The number of times this stack has appeared.
///   * A histogram of latencies for that particular node.
///
/// Each node in the trie will also contain a list of callees, represented
/// using an Array<NodeIdPair> -- each NodeIdPair instance will contain the
/// function ID of the callee, and a pointer to the node.
///
/// If we visualise this data structure, we'll find the following potential
/// representation:
///
///   [function id node] -> [callees] [cumulative time]
///                         [call counter] [latency histogram]
///
/// As an example, when we have a function in this pseudocode:
///
///   func f(N) {
///     g()
///     h()
///     for i := 1..N { j() }
///   }
///
/// We may end up with a trie of the following form:
///
///   f -> [ g, h, j ] [...] [1] [...]
///   g -> [ ... ] [...] [1] [...]
///   h -> [ ... ] [...] [1] [...]
///   j -> [ ... ] [...] [N] [...]
///
/// If for instance the function g() called j() like so:
///
///   func g() {
///     for i := 1..10 { j() }
///   }
///
/// We'll find the following updated trie:
///
///   f -> [ g, h, j ] [...] [1] [...]
///   g -> [ j' ] [...] [1] [...]
///   h -> [ ... ] [...] [1] [...]
///   j -> [ ... ] [...] [N] [...]
///   j' -> [ ... ] [...] [10] [...]
///
/// Note that we'll have a new node representing the path `f -> g -> j'` with
/// isolated data. This isolation gives us a means of representing the stack
/// traces as a path, as opposed to a key in a table. The alternative
/// implementation here would be to use a separate table for the path, and use
/// hashes of the path as an identifier to accumulate the information. We've
/// moved away from this approach as it takes a lot of time to compute the hash
/// every time we need to update a function's call information as we're
/// handling the entry and exit events.
///
/// This approach allows us to maintain a shadow stack, which represents the
/// currently executing path, and on function exits quickly compute the amount
/// of time elapsed from the entry, then update the counters for the node
/// already represented in the trie. This necessitates an efficient
/// representation of the various data structures (the list of callees must be
/// cache-aware and efficient to look up, and the histogram must be compact and
/// quick to update) to enable us to keep the overheads of this implementation
/// to the minimum.
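///
/// A minimal usage sketch follows (the function IDs, TSC values, and CPU
/// numbers are made up for illustration, not taken from a real trace):
///
/// \code
///   auto Allocators = FunctionCallTrie::InitAllocators();
///   FunctionCallTrie Trie(Allocators);
///   Trie.enterFunction(/*FId=*/1, /*TSC=*/1000, /*CPU=*/0); // enter f
///   Trie.enterFunction(/*FId=*/2, /*TSC=*/1010, /*CPU=*/0); // f calls g
///   Trie.exitFunction(/*FId=*/2, /*TSC=*/1090, /*CPU=*/0);  // g returns
///   Trie.exitFunction(/*FId=*/1, /*TSC=*/1100, /*CPU=*/0);  // f returns
///   // Trie.getRoots() now holds a single root for f, with g as a callee.
/// \endcode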
class FunctionCallTrie {
public:
  struct Node;

  // We use a NodeIdPair type instead of a std::pair<...> to not rely on the
  // standard library types in this header.
  struct NodeIdPair {
    Node *NodePtr;
    int32_t FId;
  };

  using NodeIdPairArray = Array<NodeIdPair>;
  using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType;

  // A Node in the FunctionCallTrie gives us a list of callees, the cumulative
  // number of times this node actually appeared, the cumulative amount of time
  // for this particular node including its children call times, and just the
  // local time spent on this node. Each Node will have the ID of the XRay
  // instrumented function that it is associated to.
  struct Node {
    Node *Parent;
    NodeIdPairArray Callees;
    uint64_t CallCount;
    uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
    int32_t FId;

    // TODO: Include the compact histogram.
  };

private:
  // A ShadowStackEntry records the state at function entry: the TSC at entry,
  // the node in the trie associated with the entered function, and the CPU on
  // which the entry event was recorded.
  struct ShadowStackEntry {
    uint64_t EntryTSC;
    Node *NodePtr;
    uint16_t EntryCPU;
  };

  using NodeArray = Array<Node>;
  using RootArray = Array<Node *>;
  using ShadowStackArray = Array<ShadowStackEntry>;

public:
  // We collate the allocators we need into a single struct, as a convenience
  // to allow us to initialize these as a group.
  struct Allocators {
    using NodeAllocatorType = NodeArray::AllocatorType;
    using RootAllocatorType = RootArray::AllocatorType;
    using ShadowStackAllocatorType = ShadowStackArray::AllocatorType;

    // Use hosted aligned storage members to allow for trivial move and init.
    // This also allows us to sidestep the potentially-failing allocation
    // issue.
    alignas(NodeAllocatorType) std::byte
        NodeAllocatorStorage[sizeof(NodeAllocatorType)];
    alignas(RootAllocatorType) std::byte
        RootAllocatorStorage[sizeof(RootAllocatorType)];
    alignas(ShadowStackAllocatorType) std::byte
        ShadowStackAllocatorStorage[sizeof(ShadowStackAllocatorType)];
    alignas(NodeIdPairAllocatorType) std::byte
        NodeIdPairAllocatorStorage[sizeof(NodeIdPairAllocatorType)];

    NodeAllocatorType *NodeAllocator = nullptr;
    RootAllocatorType *RootAllocator = nullptr;
    ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
    NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;

    Allocators() = default;
    Allocators(const Allocators &) = delete;
    Allocators &operator=(const Allocators &) = delete;

    struct Buffers {
      BufferQueue::Buffer NodeBuffer;
      BufferQueue::Buffer RootsBuffer;
      BufferQueue::Buffer ShadowStackBuffer;
      BufferQueue::Buffer NodeIdPairBuffer;
    };

    explicit Allocators(Buffers &B) XRAY_NEVER_INSTRUMENT {
      new (&NodeAllocatorStorage)
          NodeAllocatorType(B.NodeBuffer.Data, B.NodeBuffer.Size);
      NodeAllocator =
          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);

      new (&RootAllocatorStorage)
          RootAllocatorType(B.RootsBuffer.Data, B.RootsBuffer.Size);
      RootAllocator =
          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);

      new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(
          B.ShadowStackBuffer.Data, B.ShadowStackBuffer.Size);
      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
          &ShadowStackAllocatorStorage);

      new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(
          B.NodeIdPairBuffer.Data, B.NodeIdPairBuffer.Size);
      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
          &NodeIdPairAllocatorStorage);
    }

    explicit Allocators(uptr Max) XRAY_NEVER_INSTRUMENT {
      new (&NodeAllocatorStorage) NodeAllocatorType(Max);
      NodeAllocator =
          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);

      new (&RootAllocatorStorage) RootAllocatorType(Max);
      RootAllocator =
          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);

      new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(Max);
      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
          &ShadowStackAllocatorStorage);

      new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(Max);
      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
          &NodeIdPairAllocatorStorage);
    }

    Allocators(Allocators &&O) XRAY_NEVER_INSTRUMENT {
      // Here we rely on the safety of memcpy'ing contents of the storage
      // members, and then pointing the source pointers to nullptr.
      internal_memcpy(&NodeAllocatorStorage, &O.NodeAllocatorStorage,
                      sizeof(NodeAllocatorType));
      internal_memcpy(&RootAllocatorStorage, &O.RootAllocatorStorage,
                      sizeof(RootAllocatorType));
      internal_memcpy(&ShadowStackAllocatorStorage,
                      &O.ShadowStackAllocatorStorage,
                      sizeof(ShadowStackAllocatorType));
      internal_memcpy(&NodeIdPairAllocatorStorage,
                      &O.NodeIdPairAllocatorStorage,
                      sizeof(NodeIdPairAllocatorType));

      NodeAllocator =
          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
      RootAllocator =
          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
          &ShadowStackAllocatorStorage);
      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
          &NodeIdPairAllocatorStorage);

      O.NodeAllocator = nullptr;
      O.RootAllocator = nullptr;
      O.ShadowStackAllocator = nullptr;
      O.NodeIdPairAllocator = nullptr;
    }

    Allocators &operator=(Allocators &&O) XRAY_NEVER_INSTRUMENT {
      // When moving into an existing instance, we ensure that we clean up the
      // current allocators.
      if (NodeAllocator)
        NodeAllocator->~NodeAllocatorType();
      if (O.NodeAllocator) {
        new (&NodeAllocatorStorage)
            NodeAllocatorType(std::move(*O.NodeAllocator));
        NodeAllocator =
            reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
        O.NodeAllocator = nullptr;
      } else {
        NodeAllocator = nullptr;
      }

      if (RootAllocator)
        RootAllocator->~RootAllocatorType();
      if (O.RootAllocator) {
        new (&RootAllocatorStorage)
            RootAllocatorType(std::move(*O.RootAllocator));
        RootAllocator =
            reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
        O.RootAllocator = nullptr;
      } else {
        RootAllocator = nullptr;
      }

      if (ShadowStackAllocator)
        ShadowStackAllocator->~ShadowStackAllocatorType();
      if (O.ShadowStackAllocator) {
        new (&ShadowStackAllocatorStorage)
            ShadowStackAllocatorType(std::move(*O.ShadowStackAllocator));
        ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
            &ShadowStackAllocatorStorage);
        O.ShadowStackAllocator = nullptr;
      } else {
        ShadowStackAllocator = nullptr;
      }

      if (NodeIdPairAllocator)
        NodeIdPairAllocator->~NodeIdPairAllocatorType();
      if (O.NodeIdPairAllocator) {
        new (&NodeIdPairAllocatorStorage)
            NodeIdPairAllocatorType(std::move(*O.NodeIdPairAllocator));
        NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
            &NodeIdPairAllocatorStorage);
        O.NodeIdPairAllocator = nullptr;
      } else {
        NodeIdPairAllocator = nullptr;
      }

      return *this;
    }

    ~Allocators() XRAY_NEVER_INSTRUMENT {
      if (NodeAllocator != nullptr)
        NodeAllocator->~NodeAllocatorType();
      if (RootAllocator != nullptr)
        RootAllocator->~RootAllocatorType();
      if (ShadowStackAllocator != nullptr)
        ShadowStackAllocator->~ShadowStackAllocatorType();
      if (NodeIdPairAllocator != nullptr)
        NodeIdPairAllocator->~NodeIdPairAllocatorType();
    }
  };

  static Allocators InitAllocators() XRAY_NEVER_INSTRUMENT {
    return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max);
  }

  static Allocators InitAllocatorsCustom(uptr Max) XRAY_NEVER_INSTRUMENT {
    Allocators A(Max);
    return A;
  }

  static Allocators
  InitAllocatorsFromBuffers(Allocators::Buffers &Bufs) XRAY_NEVER_INSTRUMENT {
    Allocators A(Bufs);
    return A;
  }
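
  // A sketch of wiring up the buffer-backed variant. This assumes a
  // BufferQueue instance `BQ` (see xray_buffer_queue.h) from which four
  // buffers can be obtained; error handling is elided for brevity:
  //
  //   FunctionCallTrie::Allocators::Buffers Bufs;
  //   BQ.getBuffer(Bufs.NodeBuffer);
  //   BQ.getBuffer(Bufs.RootsBuffer);
  //   BQ.getBuffer(Bufs.ShadowStackBuffer);
  //   BQ.getBuffer(Bufs.NodeIdPairBuffer);
  //   auto A = FunctionCallTrie::InitAllocatorsFromBuffers(Bufs);
  //   FunctionCallTrie Trie(A);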

private:
  NodeArray Nodes;
  RootArray Roots;
  ShadowStackArray ShadowStack;
  NodeIdPairAllocatorType *NodeIdPairAllocator;
  uint32_t OverflowedFunctions;

public:
  explicit FunctionCallTrie(const Allocators &A) XRAY_NEVER_INSTRUMENT
      : Nodes(*A.NodeAllocator),
        Roots(*A.RootAllocator),
        ShadowStack(*A.ShadowStackAllocator),
        NodeIdPairAllocator(A.NodeIdPairAllocator),
        OverflowedFunctions(0) {}

  FunctionCallTrie() = delete;
  FunctionCallTrie(const FunctionCallTrie &) = delete;
  FunctionCallTrie &operator=(const FunctionCallTrie &) = delete;

  FunctionCallTrie(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT
      : Nodes(std::move(O.Nodes)),
        Roots(std::move(O.Roots)),
        ShadowStack(std::move(O.ShadowStack)),
        NodeIdPairAllocator(O.NodeIdPairAllocator),
        OverflowedFunctions(O.OverflowedFunctions) {}

  FunctionCallTrie &operator=(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT {
    Nodes = std::move(O.Nodes);
    Roots = std::move(O.Roots);
    ShadowStack = std::move(O.ShadowStack);
    NodeIdPairAllocator = O.NodeIdPairAllocator;
    OverflowedFunctions = O.OverflowedFunctions;
    return *this;
  }

  ~FunctionCallTrie() XRAY_NEVER_INSTRUMENT {}

  void enterFunction(const int32_t FId, uint64_t TSC,
                     uint16_t CPU) XRAY_NEVER_INSTRUMENT {
    // If we've already overflowed the function call stack, do not bother
    // attempting to record any more function entries.
    if (UNLIKELY(OverflowedFunctions)) {
      ++OverflowedFunctions;
      return;
    }

    // If this is the first function we've encountered, we want to set up the
    // node(s) and treat it as a root.
    if (UNLIKELY(ShadowStack.empty())) {
      auto *NewRoot = Nodes.AppendEmplace(
          nullptr, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
      if (UNLIKELY(NewRoot == nullptr))
        return;
      if (Roots.AppendEmplace(NewRoot) == nullptr) {
        Nodes.trim(1);
        return;
      }
      if (ShadowStack.AppendEmplace(TSC, NewRoot, CPU) == nullptr) {
        Nodes.trim(1);
        Roots.trim(1);
        ++OverflowedFunctions;
        return;
      }
      return;
    }

    // From this point on, we require that the stack is not empty.
    DCHECK(!ShadowStack.empty());
    auto TopNode = ShadowStack.back().NodePtr;
    DCHECK_NE(TopNode, nullptr);

    // If we've seen this callee before, then we access that node and place it
    // on the top of the stack.
    auto *Callee = TopNode->Callees.find_element(
        [FId](const NodeIdPair &NR) { return NR.FId == FId; });
    if (Callee != nullptr) {
      CHECK_NE(Callee->NodePtr, nullptr);
      if (ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU) == nullptr)
        ++OverflowedFunctions;
      return;
    }

    // This means we've never seen this stack before; create a new node here.
    auto *NewNode = Nodes.AppendEmplace(
        TopNode, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
    if (UNLIKELY(NewNode == nullptr))
      return;
    DCHECK_NE(NewNode, nullptr);
    TopNode->Callees.AppendEmplace(NewNode, FId);
    if (ShadowStack.AppendEmplace(TSC, NewNode, CPU) == nullptr)
      ++OverflowedFunctions;
  }

  void exitFunction(int32_t FId, uint64_t TSC,
                    uint16_t CPU) XRAY_NEVER_INSTRUMENT {
    // If we're exiting functions that have "overflowed" or don't fit into the
    // stack due to allocator constraints, we then decrement that count first.
    if (OverflowedFunctions) {
      --OverflowedFunctions;
      return;
    }

    // When we exit a function, we look up the ShadowStack to see whether we've
    // entered this function before. We do as little processing here as we can,
    // since most of the hard work would have already been done at function
    // entry.
    uint64_t CumulativeTreeTime = 0;

    while (!ShadowStack.empty()) {
      const auto &Top = ShadowStack.back();
      auto TopNode = Top.NodePtr;
      DCHECK_NE(TopNode, nullptr);

      // We may encounter overflow on the TSC we're provided, which may end up
      // being less than the TSC when we first entered the function.
      //
      // To get an accurate measurement of cycles, we need to check whether
      // we've overflowed (TSC < Top.EntryTSC) and then account the difference
      // between the entry TSC and the max for the TSC counter (max of
      // uint64_t), then add the value of TSC. We can prove that the maximum
      // delta we will get is at most the 64-bit unsigned value, since the
      // difference between a TSC of 0 and a Top.EntryTSC of 1 is
      // (numeric_limits<uint64_t>::max() - 1) + 1.
      //
      // NOTE: This assumes that TSCs are synchronised across CPUs.
      // TODO: Count the number of times we've seen CPU migrations.
      uint64_t LocalTime =
          Top.EntryTSC > TSC
              ? (std::numeric_limits<uint64_t>::max() - Top.EntryTSC) + TSC
              : TSC - Top.EntryTSC;
      TopNode->CallCount++;
      TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
      CumulativeTreeTime += LocalTime;
      ShadowStack.trim(1);

      // TODO: Update the histogram for the node.
      if (TopNode->FId == FId)
        break;
    }
  }

  const RootArray &getRoots() const XRAY_NEVER_INSTRUMENT { return Roots; }

  // The deepCopyInto operation will update the provided FunctionCallTrie by
  // re-creating the contents of this particular FunctionCallTrie in the other
  // FunctionCallTrie. It will do this using a Depth First Traversal from the
  // roots, and while doing so recreating the traversal in the provided
  // FunctionCallTrie.
  //
  // This operation will *not* destroy the state in `O`, and thus may cause
  // some duplicate entries in `O` if it is not empty.
  //
  // This function is *not* thread-safe, and may require external
  // synchronisation of both "this" and |O|.
  //
  // This function must *not* be called with a non-empty FunctionCallTrie |O|.
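  //
  // A usage sketch (assumes `Trie` is a populated FunctionCallTrie; the copy
  // gets its own, freshly initialised allocators):
  //
  //   auto CopyAllocators = FunctionCallTrie::InitAllocators();
  //   FunctionCallTrie Copy(CopyAllocators);
  //   Trie.deepCopyInto(Copy);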
  void deepCopyInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT {
    DCHECK(O.getRoots().empty());

    // We then push the root into a stack, to use as the parent marker for new
    // nodes we push in as we're traversing depth-first down the call tree.
    struct NodeAndParent {
      FunctionCallTrie::Node *Node;
      FunctionCallTrie::Node *NewNode;
    };
    using Stack = Array<NodeAndParent>;

    typename Stack::AllocatorType StackAllocator(
        profilingFlags()->stack_allocator_max);
    Stack DFSStack(StackAllocator);

    for (const auto Root : getRoots()) {
      // Add a node in O for this root.
      auto NewRoot = O.Nodes.AppendEmplace(
          nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), Root->CallCount,
          Root->CumulativeLocalTime, Root->FId);

      // Because we cannot allocate more memory we should bail out right away.
      if (UNLIKELY(NewRoot == nullptr))
        return;

      if (UNLIKELY(O.Roots.Append(NewRoot) == nullptr))
        return;

      // TODO: Figure out what to do if we fail to allocate any more stack
      // space. Maybe warn or report once?
      if (DFSStack.AppendEmplace(Root, NewRoot) == nullptr)
        return;
      while (!DFSStack.empty()) {
        NodeAndParent NP = DFSStack.back();
        DCHECK_NE(NP.Node, nullptr);
        DCHECK_NE(NP.NewNode, nullptr);
        DFSStack.trim(1);
        for (const auto Callee : NP.Node->Callees) {
          auto NewNode = O.Nodes.AppendEmplace(
              NP.NewNode, NodeIdPairArray(*O.NodeIdPairAllocator),
              Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime,
              Callee.FId);
          if (UNLIKELY(NewNode == nullptr))
            return;
          if (UNLIKELY(NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId) ==
                       nullptr))
            return;
          if (UNLIKELY(DFSStack.AppendEmplace(Callee.NodePtr, NewNode) ==
                       nullptr))
            return;
        }
      }
    }
  }

  // The mergeInto operation will update the provided FunctionCallTrie by
  // traversing the current trie's roots and updating (i.e. merging) the data
  // in the nodes with the data in the target's nodes. If the node doesn't
  // exist in the provided trie, we add a new one in the right position, and
  // inherit the data from the original (current) trie, along with all its
  // callees.
  //
  // This function is *not* thread-safe, and may require external
  // synchronisation of both "this" and |O|.
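  //
  // A usage sketch (assumes per-thread tries `T1` and `T2` being collapsed
  // into a single accumulator `Global`, each trie with its own Allocators,
  // and with external synchronisation handled by the caller):
  //
  //   T1.mergeInto(Global);
  //   T2.mergeInto(Global);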
  void mergeInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT {
    struct NodeAndTarget {
      FunctionCallTrie::Node *OrigNode;
      FunctionCallTrie::Node *TargetNode;
    };
    using Stack = Array<NodeAndTarget>;
    typename Stack::AllocatorType StackAllocator(
        profilingFlags()->stack_allocator_max);
    Stack DFSStack(StackAllocator);

    for (const auto Root : getRoots()) {
      Node *TargetRoot = nullptr;
      auto R = O.Roots.find_element(
          [&](const Node *Node) { return Node->FId == Root->FId; });
      if (R == nullptr) {
        TargetRoot = O.Nodes.AppendEmplace(
            nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u,
            Root->FId);
        if (UNLIKELY(TargetRoot == nullptr))
          return;

        O.Roots.Append(TargetRoot);
      } else {
        TargetRoot = *R;
      }

      DFSStack.AppendEmplace(Root, TargetRoot);
      while (!DFSStack.empty()) {
        NodeAndTarget NT = DFSStack.back();
        DCHECK_NE(NT.OrigNode, nullptr);
        DCHECK_NE(NT.TargetNode, nullptr);
        DFSStack.trim(1);
        // TODO: Update the histogram as well when we have it ready.
        NT.TargetNode->CallCount += NT.OrigNode->CallCount;
        NT.TargetNode->CumulativeLocalTime += NT.OrigNode->CumulativeLocalTime;
        for (const auto Callee : NT.OrigNode->Callees) {
          auto TargetCallee = NT.TargetNode->Callees.find_element(
              [&](const FunctionCallTrie::NodeIdPair &C) {
                return C.FId == Callee.FId;
              });
          if (TargetCallee == nullptr) {
            auto NewTargetNode = O.Nodes.AppendEmplace(
                NT.TargetNode, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u,
                Callee.FId);

            if (UNLIKELY(NewTargetNode == nullptr))
              return;

            TargetCallee =
                NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId);
          }
          DFSStack.AppendEmplace(Callee.NodePtr, TargetCallee->NodePtr);
        }
      }
    }
  }
};

} // namespace __xray

#endif // XRAY_FUNCTION_CALL_TRIE_H