libc/benchmarks/LibcBenchmark.h

   1 //===-- Benchmark function --------------------------------------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 // This file mainly defines a `Benchmark` function.
  10 //
  11 // The benchmarking process is as follows:
  12 // - We start by measuring the time it takes to run the function
  13 // `InitialIterations` times. This is called a Sample. From this we can derive
  14 // the time it took to run a single iteration.
  15 //
  16 // - We repeat the previous step with a greater number of iterations to lower
  17 // the impact of the measurement. We can derive a more precise estimation of the
  18 // runtime for a single iteration.
  19 //
  20 // - Each sample gives a more accurate estimation of the runtime for a single
  21 // iteration but also takes more time to run. We stop the process when:
  22 //   * The measure stabilize under a certain precision (Epsilon),
  23 //   * The overall benchmarking time is greater than MaxDuration,
  24 //   * The overall sample count is greater than MaxSamples,
  25 //   * The last sample used more than MaxIterations iterations.
  26 //
  27 // - We also makes sure that the benchmark doesn't run for a too short period of
  28 // time by defining MinDuration and MinSamples.
  29
  30 #ifndef LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
  31 #define LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
  32
  33 #include "benchmark/benchmark.h"
  34 #include "llvm/ADT/ArrayRef.h"
  35 #include "llvm/ADT/SmallVector.h"
  36 #include <array>
  37 #include <chrono>
  38 #include <cmath>
  39 #include <cstdint>
  40 #include <optional>
  41
  42 namespace llvm {
  43 namespace libc_benchmarks {
  44
  45 using Duration = std::chrono::duration<double>;
  46
  47 enum class BenchmarkLog {
  48   None, // Don't keep the internal state of the benchmark.
  49   Last, // Keep only the last batch.
  50   Full  // Keep all iterations states, useful for testing or debugging.
  51 };
  52
  53 // An object to configure the benchmark stopping conditions.
  54 // See documentation at the beginning of the file for the overall algorithm and
  55 // meaning of each field.
  56 struct BenchmarkOptions {
  57   // The minimum time for which the benchmark is running.
  58   Duration MinDuration = std::chrono::seconds(0);
  59   // The maximum time for which the benchmark is running.
  60   Duration MaxDuration = std::chrono::seconds(10);
  61   // The number of iterations in the first sample.
  62   uint32_t InitialIterations = 1;
  63   // The maximum number of iterations for any given sample.
  64   uint32_t MaxIterations = 10000000;
  65   // The minimum number of samples.
  66   uint32_t MinSamples = 4;
  67   // The maximum number of samples.
  68   uint32_t MaxSamples = 1000;
  69   // The benchmark will stop if the relative difference between the current and
  70   // the last estimation is less than epsilon. This is 1% by default.
  71   double Epsilon = 0.01;
  72   // The number of iterations grows exponentially between each sample.
  73   // Must be greater or equal to 1.
  74   double ScalingFactor = 1.4;
  75   BenchmarkLog Log = BenchmarkLog::None;
  76 };
  77
  78 // The state of a benchmark.
  79 enum class BenchmarkStatus {
  80   Running,
  81   MaxDurationReached,
  82   MaxIterationsReached,
  83   MaxSamplesReached,
  84   PrecisionReached,
  85 };
  86
  87 // The internal state of the benchmark, useful to debug, test or report
  88 // statistics.
  89 struct BenchmarkState {
  90   size_t LastSampleIterations;
  91   Duration LastBatchElapsed;
  92   BenchmarkStatus CurrentStatus;
  93   Duration CurrentBestGuess; // The time estimation for a single run of `foo`.
  94   double ChangeRatio; // The change in time estimation between previous and
  95                       // current samples.
  96 };
  97
  98 // A lightweight result for a benchmark.
  99 struct BenchmarkResult {
 100   BenchmarkStatus TerminationStatus = BenchmarkStatus::Running;
 101   Duration BestGuess = {};
 102   std::optional<llvm::SmallVector<BenchmarkState, 16>> MaybeBenchmarkLog;
 103 };
 104
 105 // Stores information about a cache in the host memory system.
 106 struct CacheInfo {
 107   std::string Type; //  e.g. "Instruction", "Data", "Unified".
 108   int Level;        // 0 is closest to processing unit.
 109   int Size;         // In bytes.
 110   int NumSharing;   // The number of processing units (Hyper-Threading Thread)
 111                     // with which this cache is shared.
 112 };
 113
 114 // Stores information about the host.
 115 struct HostState {
 116   std::string CpuName; // returns a string compatible with the -march option.
 117   double CpuFrequency; // in Hertz.
 118   std::vector<CacheInfo> Caches;
 119
 120   static HostState get();
 121 };
 122
 123 namespace internal {
 124
 125 struct Measurement {
 126   size_t Iterations = 0;
 127   Duration Elapsed = {};
 128 };
 129
 130 // Updates the estimation of the elapsed time for a single iteration.
 131 class RefinableRuntimeEstimation {
 132   Duration TotalTime = {};
 133   size_t TotalIterations = 0;
 134
 135 public:
 136   Duration update(const Measurement &M) {
 137     assert(M.Iterations > 0);
 138     // Duration is encoded as a double (see definition).
 139     // `TotalTime` and `M.Elapsed` are of the same magnitude so we don't expect
 140     // loss of precision due to radically different scales.
 141     TotalTime += M.Elapsed;
 142     TotalIterations += M.Iterations;
 143     return TotalTime / TotalIterations;
 144   }
 145 };
 146
 147 // This class tracks the progression of the runtime estimation.
 148 class RuntimeEstimationProgression {
 149   RefinableRuntimeEstimation RRE;
 150
 151 public:
 152   Duration CurrentEstimation = {};
 153
 154   // Returns the change ratio between our best guess so far and the one from the
 155   // new measurement.
 156   double computeImprovement(const Measurement &M) {
 157     const Duration NewEstimation = RRE.update(M);
 158     const double Ratio = fabs(((CurrentEstimation / NewEstimation) - 1.0));
 159     CurrentEstimation = NewEstimation;
 160     return Ratio;
 161   }
 162 };
 163
 164 } // namespace internal
 165
 166 // Measures the runtime of `foo` until conditions defined by `Options` are met.
 167 //
 168 // To avoid measurement's imprecisions we measure batches of `foo`.
 169 // The batch size is growing by `ScalingFactor` to minimize the effect of
 170 // measuring.
 171 //
 172 // Note: The benchmark is not responsible for serializing the executions of
 173 // `foo`. It is not suitable for measuring, very small & side effect free
 174 // functions, as the processor is free to execute several executions in
 175 // parallel.
 176 //
 177 // - Options: A set of parameters controlling the stopping conditions for the
 178 //     benchmark.
 179 // - foo: The function under test. It takes one value and returns one value.
 180 //     The input value is used to randomize the execution of `foo` as part of a
 181 //     batch to mitigate the effect of the branch predictor. Signature:
 182 //     `ProductType foo(ParameterProvider::value_type value);`
 183 //     The output value is a product of the execution of `foo` and prevents the
 184 //     compiler from optimizing out foo's body.
 185 // - ParameterProvider: An object responsible for providing a range of
 186 //     `Iterations` values to use as input for `foo`. The `value_type` of the
 187 //     returned container has to be compatible with `foo` argument.
 188 //     Must implement one of:
 189 //     `Container<ParameterType> generateBatch(size_t Iterations);`
 190 //     `const Container<ParameterType>& generateBatch(size_t Iterations);`
 191 // - Clock: An object providing the current time. Must implement:
 192 //     `std::chrono::time_point now();`
 193 template <typename Function, typename ParameterProvider,
 194           typename BenchmarkClock = const std::chrono::high_resolution_clock>
 195 BenchmarkResult benchmark(const BenchmarkOptions &Options,
 196                           ParameterProvider &PP, Function foo,
 197                           BenchmarkClock &Clock = BenchmarkClock()) {
 198   BenchmarkResult Result;
 199   internal::RuntimeEstimationProgression REP;
 200   Duration TotalBenchmarkDuration = {};
 201   size_t Iterations = std::max(Options.InitialIterations, uint32_t(1));
 202   size_t Samples = 0;
 203   if (Options.ScalingFactor < 1.0)
 204     report_fatal_error("ScalingFactor should be >= 1");
 205   if (Options.Log != BenchmarkLog::None)
 206     Result.MaybeBenchmarkLog.emplace();
 207   for (;;) {
 208     // Request a new Batch of size `Iterations`.
 209     const auto &Batch = PP.generateBatch(Iterations);
 210
 211     // Measuring this Batch.
 212     const auto StartTime = Clock.now();
 213     for (const auto Parameter : Batch) {
 214       auto Production = foo(Parameter);
 215       benchmark::DoNotOptimize(Production);
 216     }
 217     const auto EndTime = Clock.now();
 218     const Duration Elapsed = EndTime - StartTime;
 219
 220     // Updating statistics.
 221     ++Samples;
 222     TotalBenchmarkDuration += Elapsed;
 223     const double ChangeRatio = REP.computeImprovement({Iterations, Elapsed});
 224     Result.BestGuess = REP.CurrentEstimation;
 225
 226     // Stopping condition.
 227     if (TotalBenchmarkDuration >= Options.MinDuration &&
 228         Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon)
 229       Result.TerminationStatus = BenchmarkStatus::PrecisionReached;
 230     else if (Samples >= Options.MaxSamples)
 231       Result.TerminationStatus = BenchmarkStatus::MaxSamplesReached;
 232     else if (TotalBenchmarkDuration >= Options.MaxDuration)
 233       Result.TerminationStatus = BenchmarkStatus::MaxDurationReached;
 234     else if (Iterations >= Options.MaxIterations)
 235       Result.TerminationStatus = BenchmarkStatus::MaxIterationsReached;
 236
 237     if (Result.MaybeBenchmarkLog) {
 238       auto &BenchmarkLog = *Result.MaybeBenchmarkLog;
 239       if (Options.Log == BenchmarkLog::Last && !BenchmarkLog.empty())
 240         BenchmarkLog.pop_back();
 241       BenchmarkState BS;
 242       BS.LastSampleIterations = Iterations;
 243       BS.LastBatchElapsed = Elapsed;
 244       BS.CurrentStatus = Result.TerminationStatus;
 245       BS.CurrentBestGuess = Result.BestGuess;
 246       BS.ChangeRatio = ChangeRatio;
 247       BenchmarkLog.push_back(BS);
 248     }
 249
 250     if (Result.TerminationStatus != BenchmarkStatus::Running)
 251       return Result;
 252
 253     if (Options.ScalingFactor > 1 &&
 254         Iterations * Options.ScalingFactor == Iterations)
 255       report_fatal_error(
 256           "`Iterations *= ScalingFactor` is idempotent, increase ScalingFactor "
 257           "or InitialIterations.");
 258
 259     Iterations *= Options.ScalingFactor;
 260   }
 261 }
 262
 263 // Interprets `Array` as a circular buffer of `Size` elements.
 264 template <typename T> class CircularArrayRef {
 265   llvm::ArrayRef<T> Array;
 266   size_t Size;
 267
 268 public:
 269   using value_type = T;
 270   using reference = T &;
 271   using const_reference = const T &;
 272   using difference_type = ssize_t;
 273   using size_type = size_t;
 274
 275   class const_iterator {
 276     using iterator_category = std::input_iterator_tag;
 277     llvm::ArrayRef<T> Array;
 278     size_t Index;
 279     size_t Offset;
 280
 281   public:
 282     explicit const_iterator(llvm::ArrayRef<T> Array, size_t Index = 0)
 283         : Array(Array), Index(Index), Offset(Index % Array.size()) {}
 284     const_iterator &operator++() {
 285       ++Index;
 286       ++Offset;
 287       if (Offset == Array.size())
 288         Offset = 0;
 289       return *this;
 290     }
 291     bool operator==(const_iterator Other) const { return Index == Other.Index; }
 292     bool operator!=(const_iterator Other) const { return !(*this == Other); }
 293     const T &operator*() const { return Array[Offset]; }
 294   };
 295
 296   CircularArrayRef(llvm::ArrayRef<T> Array, size_t Size)
 297       : Array(Array), Size(Size) {
 298     assert(Array.size() > 0);
 299   }
 300
 301   const_iterator begin() const { return const_iterator(Array); }
 302   const_iterator end() const { return const_iterator(Array, Size); }
 303 };
 304
 305 // A convenient helper to produce a CircularArrayRef from an ArrayRef.
 306 template <typename T>
 307 CircularArrayRef<T> cycle(llvm::ArrayRef<T> Array, size_t Size) {
 308   return {Array, Size};
 309 }
 310
 311 // Creates an std::array which storage size is constrained under `Bytes`.
 312 template <typename T, size_t Bytes>
 313 using ByteConstrainedArray = std::array<T, Bytes / sizeof(T)>;
 314
 315 // A convenient helper to produce a CircularArrayRef from a
 316 // ByteConstrainedArray.
 317 template <typename T, size_t N>
 318 CircularArrayRef<T> cycle(const std::array<T, N> &Container, size_t Size) {
 319   return {llvm::ArrayRef<T>(Container.cbegin(), Container.cend()), Size};
 320 }
 321
 322 // Makes sure the binary was compiled in release mode and that frequency
 323 // governor is set on performance.
 324 void checkRequirements();
 325
 326 } // namespace libc_benchmarks
 327 } // namespace llvm
 328
 329 #endif // LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H