libc/benchmarks/gpu/LibcGpuBenchmark.h

   1 #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
   2 #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
   3
   4 #include "benchmarks/gpu/BenchmarkLogger.h"
   5 #include "benchmarks/gpu/timing/timing.h"
   6 #include "src/__support/CPP/array.h"
   7 #include "src/__support/CPP/functional.h"
   8 #include "src/__support/CPP/limits.h"
   9 #include "src/__support/CPP/string_view.h"
  10 #include "src/__support/CPP/type_traits.h"
  11 #include "src/__support/FPUtil/FPBits.h"
  12 #include "src/__support/macros/config.h"
  13 #include "src/stdlib/rand.h"
  14 #include "src/time/clock.h"
  15
  16 #include <stdint.h>
  17
  18 namespace LIBC_NAMESPACE_DECL {
  19
  20 namespace benchmarks {
  21
  22 struct BenchmarkOptions {
  23   uint32_t initial_iterations = 1;
  24   uint32_t min_iterations = 1;
  25   uint32_t max_iterations = 10000000;
  26   uint32_t min_samples = 4;
  27   uint32_t max_samples = 1000;
  28   int64_t min_duration = 500 * 1000;         // 500 * 1000 nanoseconds = 500 us
  29   int64_t max_duration = 1000 * 1000 * 1000; // 1e9 nanoseconds = 1 second
  30   double epsilon = 0.0001;
  31   double scaling_factor = 1.4;
  32 };
  33
  34 struct Measurement {
  35   uint32_t iterations = 0;
  36   uint64_t elapsed_cycles = 0;
  37 };
  38
  39 class RefinableRuntimeEstimation {
  40   uint64_t total_cycles = 0;
  41   uint32_t total_iterations = 0;
  42
  43 public:
  44   uint64_t update(const Measurement &M) {
  45     total_cycles += M.elapsed_cycles;
  46     total_iterations += M.iterations;
  47     return total_cycles / total_iterations;
  48   }
  49 };
  50
  51 // Tracks the progression of the runtime estimation
  52 class RuntimeEstimationProgression {
  53   RefinableRuntimeEstimation rre;
  54
  55 public:
  56   uint64_t current_estimation = 0;
  57
  58   double compute_improvement(const Measurement &M) {
  59     const uint64_t new_estimation = rre.update(M);
  60     double ratio =
  61         (static_cast<double>(current_estimation) / new_estimation) - 1.0;
  62
  63     // Get absolute value
  64     if (ratio < 0)
  65       ratio *= -1;
  66
  67     current_estimation = new_estimation;
  68     return ratio;
  69   }
  70 };
  71
  72 struct BenchmarkResult {
  73   uint64_t cycles = 0;
  74   double standard_deviation = 0;
  75   uint64_t min = UINT64_MAX;
  76   uint64_t max = 0;
  77   uint32_t samples = 0;
  78   uint32_t total_iterations = 0;
  79   clock_t total_time = 0;
  80 };
  81
  82 BenchmarkResult benchmark(const BenchmarkOptions &options,
  83                           cpp::function<uint64_t(void)> wrapper_func);
  84
  85 class Benchmark {
  86   const cpp::function<uint64_t(void)> func;
  87   const cpp::string_view suite_name;
  88   const cpp::string_view test_name;
  89   const uint32_t num_threads;
  90
  91 public:
  92   Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
  93             char const *test_name, uint32_t num_threads)
  94       : func(func), suite_name(suite_name), test_name(test_name),
  95         num_threads(num_threads) {
  96     add_benchmark(this);
  97   }
  98
  99   static void run_benchmarks();
 100   const cpp::string_view get_suite_name() const { return suite_name; }
 101   const cpp::string_view get_test_name() const { return test_name; }
 102
 103 protected:
 104   static void add_benchmark(Benchmark *benchmark);
 105
 106 private:
 107   BenchmarkResult run() {
 108     BenchmarkOptions options;
 109     return benchmark(options, func);
 110   }
 111 };
 112
 113 // We want our random values to be approximately
 114 // Output: a random number with the exponent field between min_exp and max_exp,
 115 // i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
 116 // Caveats:
 117 //   -EXP_BIAS corresponding to denormal values,
 118 //   EXP_BIAS + 1 corresponding to inf or nan.
 119 template <typename T>
 120 static T
 121 get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
 122                int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
 123   using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
 124
 125   // Required to correctly instantiate FPBits for floats and doubles.
 126   using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
 127                                                uint64_t, uint32_t>;
 128   RandType bits;
 129   if constexpr (cpp::is_same_v<T, uint64_t>)
 130     bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
 131            static_cast<uint64_t>(LIBC_NAMESPACE::rand());
 132   else
 133     bits = LIBC_NAMESPACE::rand();
 134   double scale =
 135       static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
 136   FPBits fp(bits);
 137   fp.set_biased_exponent(
 138       static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
 139   return fp.get_val();
 140 }
 141
 142 template <typename T> class MathPerf {
 143   using FPBits = fputil::FPBits<T>;
 144   using StorageType = typename FPBits::StorageType;
 145   static constexpr StorageType UIntMax =
 146       cpp::numeric_limits<StorageType>::max();
 147
 148 public:
 149   template <size_t N = 1>
 150   static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
 151     cpp::array<T, N> inputs;
 152     for (size_t i = 0; i < N; ++i)
 153       inputs[i] = get_rand_input<T>(min_exp, max_exp);
 154
 155     uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
 156
 157     return total_time / N;
 158   }
 159
 160   // Throughput benchmarking for functions that take 2 inputs.
 161   template <size_t N = 1>
 162   static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
 163                                           int arg1_max_exp, int arg2_min_exp,
 164                                           int arg2_max_exp) {
 165     cpp::array<T, N> inputs1;
 166     cpp::array<T, N> inputs2;
 167     for (size_t i = 0; i < N; ++i) {
 168       inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp);
 169       inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp);
 170     }
 171
 172     uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
 173
 174     return total_time / N;
 175   }
 176 };
 177
 178 } // namespace benchmarks
 179 } // namespace LIBC_NAMESPACE_DECL
 180
 181 // Passing -1 indicates the benchmark should be run with as many threads as
 182 // allocated by the user in the benchmark's CMake.
 183 #define BENCHMARK(SuiteName, TestName, Func)                                   \
 184   LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
 185       Func, #SuiteName, #TestName, -1)
 186
 187 #define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads)             \
 188   LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
 189       Func, #SuiteName, #TestName, NumThreads)
 190
 191 #define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func)                   \
 192   BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)
 193
 194 #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
 195   BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
 196                       LIBC_NAMESPACE::gpu::get_lane_size())
 197 #endif