//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch.
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

// utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption).
// "scratch contexts" are buffers that we return in certain cases - they are
// large enough to allow for memory safe counter access, but they don't link
// subcontexts below them (the runtime recognizes them and enforces that).
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}
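
// For illustration only: a minimal sketch of the inverse operations on a
// tainted pointer, mirroring the LSB convention above. `isTaintedExample` and
// `untaintExample` are hypothetical helpers, not part of the runtime (the
// real check is `isScratch`, declared in the header).
[[maybe_unused]] inline bool isTaintedExample(const ContextNode *Ctx) {
  return reinterpret_cast<uint64_t>(Ctx) & 1;
}
[[maybe_unused]] inline ContextNode *untaintExample(ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) &
                                         ~uint64_t(1));
}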

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}
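
// For illustration only: consuming a plain local behaves the same way as
// consuming the real TLS slots further down - the value is handed back and
// the slot is reset, so a stale value can't be observed twice.
// `exampleConsume` is a hypothetical helper, not part of the runtime.
[[maybe_unused]] inline int exampleConsume() {
  int Slot = 42;
  int Observed = consume(Slot); // Observed == 42, Slot is now 0
  return Observed + Slot;       // 42
}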

// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
// size.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}
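
// For illustration: with kBuffSize == 1MB, a (hypothetical) request of 5000
// bytes gets the default 1MB arena, while a 3MB request would get a 6MB
// arena, i.e. twice what was asked for.
static_assert(kBuffSize == (1 << 20), "arena pages default to 1MB");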

// verify the structural integrity of the context
bool validate(const ContextRoot *Root) {
  // all contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate that all
  // nonnull subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;
      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NumCounters,
                                     uint32_t NumCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NumCounters, NumCallsites, Next);
}

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}
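
// For illustration only: a hypothetical recursive walk over a context tree,
// mirroring the traversal validate() and resetContextNode() perform - each
// node owns a counters array and, per callsite, a linked list of subcontexts.
// `exampleCountNodes` is not part of the runtime.
[[maybe_unused]] uint64_t exampleCountNodes(const ContextNode &Node) {
  uint64_t Count = 1;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Sub = Node.subContexts()[I]; Sub; Sub = Sub->next())
      Count += exampleCountNodes(*Sub);
  return Count;
}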

void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }

} // namespace

// the scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, a la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
// the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))

__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                 nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;
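
// For illustration only: a sketch of what the compiler-inserted
// instrumentation is assumed to do at a callsite before transferring control
// to the callee. `exampleAnnounceCallsite` and its parameters are
// hypothetical; the real stores are emitted by the instrumentation lowering,
// not written by hand.
[[maybe_unused]] static void
exampleAnnounceCallsite(ContextNode *Ctx, uint32_t CallsiteIdx, void *Target) {
  // Advertise the expected callee and where its subcontext chain hangs off the
  // caller's context; the callee consume()s both slots in
  // __llvm_ctx_profile_get_context.
  __llvm_ctx_profile_expected_callee[0] = Target;
  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[CallsiteIdx];
}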

Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

// If this is the first time we hit a callsite with this (Guid) particular
// callee, we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NumCounters, uint32_t NumCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // if we failed to allocate on the current arena, allocate a new arena,
    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
    // find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NumCounters, NumCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}

ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
                                            uint32_t NumCounters,
                                            uint32_t NumCallsites) {
  // fast "out" if we're not even doing contextual collection.
  if (!__llvm_ctx_profile_current_context_root)
    return TheScratchContext;

  // also fast "out" if the caller is scratch. We can see if it's scratch by
  // looking at the interior pointer into the subcontexts vector that the
  // caller provided: if the context is scratch, so is that interior pointer
  // (because all the address calculations use even - more precisely,
  // 8-aligned - values).
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return TheScratchContext;

  // if the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
  // Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we would lose counter values for
  // the normal subgraph, this time around. That should be very unlikely, but if
  // it happens too frequently, we should be able to detect discrepancies in
  // entry counts (caller-callee). At the moment, the design goes on the
  // assumption that this is so infrequent that it's not worth doing more
  // for that case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return TheScratchContext;

  auto *Callsite = *CallsiteContext;
  // in the case of indirect calls, all the targets seen so far form a linked
  // list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite
                  ? Callsite
                  : getCallsiteSlow(Guid, CallsiteContext, NumCounters,
                                    NumCallsites);
  if (Ret->callsites_size() != NumCallsites ||
      Ret->counters_size() != NumCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NumCallsites,
                        NumCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}

// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters,
                  uint32_t NumCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken the lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NumCounters, NumCallsites);
  AllContextRoots.PushBack(Root);
}

ContextNode *__llvm_ctx_profile_start_context(
    ContextRoot *Root, GUID Guid, uint32_t Counters,
    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}

void __llvm_ctx_profile_release_context(ContextRoot *Root)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (__llvm_ctx_profile_current_context_root) {
    __llvm_ctx_profile_current_context_root = nullptr;
    Root->Taken.Unlock();
  }
}
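
// For illustration only: a minimal sketch of how an instrumented root function
// is assumed to drive the start/release pair above. `ExampleRootStorage`,
// `kExampleGuid` and `exampleRootFunction` are hypothetical; in reality these
// calls are emitted by the instrumentation lowering.
[[maybe_unused]] static ContextRoot ExampleRootStorage;
[[maybe_unused]] static void exampleRootFunction() {
  constexpr GUID kExampleGuid = 0x1234;
  // Either the root's real first node, or the scratch context if another
  // thread already holds this root.
  ContextNode *Ctx = __llvm_ctx_profile_start_context(
      &ExampleRootStorage, kExampleGuid, /*Counters=*/2, /*Callsites=*/1);
  (void)Ctx; // the instrumented body would bump Ctx's counters here
  __llvm_ctx_profile_release_context(&ExampleRootStorage);
}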

void __llvm_ctx_profile_start_collection() {
  size_t NumMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NumMemUnits;

    resetContextNode(*Root->FirstNode);
  }
  __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
}

bool __llvm_ctx_profile_fetch(void *Data,
                              bool (*Writer)(void *W, const ContextNode &)) {
  assert(Writer);
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    if (!Writer(Data, *Root->FirstNode))
      return false;
  }
  return true;
}
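
// For illustration only: a minimal Writer callback a consumer might pass to
// __llvm_ctx_profile_fetch. `exampleCountingWriter` is hypothetical; a real
// writer would serialize each root's tree instead of just counting roots.
[[maybe_unused]] static bool exampleCountingWriter(void *Data,
                                                   const ContextNode &) {
  ++*reinterpret_cast<size_t *>(Data); // Data is the caller's accumulator
  return true;                         // keep fetching the remaining roots
}
// Hypothetical usage:
//   size_t NumRoots = 0;
//   __llvm_ctx_profile_fetch(&NumRoots, exampleCountingWriter);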

void __llvm_ctx_profile_free() {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
      auto *C = A;
      A = A->next();
      __sanitizer::InternalFree(C);
    }
  AllContextRoots.Reset();
}