[libc] Switch to using the generic `<gpuintrin.h>` implementations (#121810)
[llvm-project.git] / libc / benchmarks / gpu / LibcGpuBenchmark.h
blobf2cfbfbfdcdf0db1ac92a232f13f6152a3091369
1 #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
2 #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
4 #include "benchmarks/gpu/BenchmarkLogger.h"
5 #include "benchmarks/gpu/timing/timing.h"
6 #include "src/__support/CPP/array.h"
7 #include "src/__support/CPP/functional.h"
8 #include "src/__support/CPP/limits.h"
9 #include "src/__support/CPP/string_view.h"
10 #include "src/__support/CPP/type_traits.h"
11 #include "src/__support/FPUtil/FPBits.h"
12 #include "src/__support/macros/config.h"
13 #include "src/stdlib/rand.h"
14 #include "src/time/clock.h"
16 #include <stdint.h>
18 namespace LIBC_NAMESPACE_DECL {
20 namespace benchmarks {
// Tunable parameters for a benchmark run. Every field has a default, so a
// default-constructed BenchmarkOptions is directly usable.
struct BenchmarkOptions {
  // Iteration counts: starting value and lower/upper bounds.
  uint32_t initial_iterations = 1;
  uint32_t min_iterations = 1;
  uint32_t max_iterations = 10000000;

  // Lower/upper bounds on the number of samples.
  uint32_t min_samples = 4;
  uint32_t max_samples = 1000;

  // Duration bounds, expressed in nanoseconds.
  int64_t min_duration = 500 * 1000;         // 500 * 1000 ns = 500 us
  int64_t max_duration = 1000 * 1000 * 1000; // 1e9 ns = 1 second

  // NOTE(review): presumably the convergence threshold that the value returned
  // by RuntimeEstimationProgression::compute_improvement is compared against;
  // confirm in the definition of benchmark().
  double epsilon = 0.0001;

  // NOTE(review): presumably the factor by which the iteration count grows
  // between samples; confirm in the definition of benchmark().
  double scaling_factor = 1.4;
};
// One timing sample: the number of iterations that were executed and the
// number of cycles those iterations took altogether.
struct Measurement {
  uint32_t iterations = 0;
  uint64_t elapsed_cycles = 0;
};
39 class RefinableRuntimeEstimation {
40 uint64_t total_cycles = 0;
41 uint32_t total_iterations = 0;
43 public:
44 uint64_t update(const Measurement &M) {
45 total_cycles += M.elapsed_cycles;
46 total_iterations += M.iterations;
47 return total_cycles / total_iterations;
51 // Tracks the progression of the runtime estimation
52 class RuntimeEstimationProgression {
53 RefinableRuntimeEstimation rre;
55 public:
56 uint64_t current_estimation = 0;
58 double compute_improvement(const Measurement &M) {
59 const uint64_t new_estimation = rre.update(M);
60 double ratio =
61 (static_cast<double>(current_estimation) / new_estimation) - 1.0;
63 // Get absolute value
64 if (ratio < 0)
65 ratio *= -1;
67 current_estimation = new_estimation;
68 return ratio;
// Statistics reported for a benchmark. The fields are filled in by code
// outside this header; only their defaults are defined here.
struct BenchmarkResult {
  uint64_t cycles = 0;
  double standard_deviation = 0;
  // min starts at the largest representable value and max at the smallest, so
  // any sample compared against them replaces the default.
  uint64_t min = UINT64_MAX;
  uint64_t max = 0;
  uint32_t samples = 0;
  uint32_t total_iterations = 0;
  clock_t total_time = 0;
};
82 BenchmarkResult benchmark(const BenchmarkOptions &options,
83 cpp::function<uint64_t(void)> wrapper_func);
85 class Benchmark {
86 const cpp::function<uint64_t(void)> func;
87 const cpp::string_view suite_name;
88 const cpp::string_view test_name;
89 const uint32_t num_threads;
91 public:
92 Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
93 char const *test_name, uint32_t num_threads)
94 : func(func), suite_name(suite_name), test_name(test_name),
95 num_threads(num_threads) {
96 add_benchmark(this);
99 static void run_benchmarks();
100 const cpp::string_view get_suite_name() const { return suite_name; }
101 const cpp::string_view get_test_name() const { return test_name; }
103 protected:
104 static void add_benchmark(Benchmark *benchmark);
106 private:
107 BenchmarkResult run() {
108 BenchmarkOptions options;
109 return benchmark(options, func);
113 // We want our random values to be approximately
114 // Output: a random number with the exponent field between min_exp and max_exp,
115 // i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
116 // Caveats:
117 // -EXP_BIAS corresponding to denormal values,
118 // EXP_BIAS + 1 corresponding to inf or nan.
119 template <typename T>
120 static T
121 get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
122 int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
123 using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
125 // Required to correctly instantiate FPBits for floats and doubles.
126 using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
127 uint64_t, uint32_t>;
128 RandType bits;
129 if constexpr (cpp::is_same_v<T, uint64_t>)
130 bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
131 static_cast<uint64_t>(LIBC_NAMESPACE::rand());
132 else
133 bits = LIBC_NAMESPACE::rand();
134 double scale =
135 static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
136 FPBits fp(bits);
137 fp.set_biased_exponent(
138 static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
139 return fp.get_val();
142 template <typename T> class MathPerf {
143 using FPBits = fputil::FPBits<T>;
144 using StorageType = typename FPBits::StorageType;
145 static constexpr StorageType UIntMax =
146 cpp::numeric_limits<StorageType>::max();
148 public:
149 template <size_t N = 1>
150 static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
151 cpp::array<T, N> inputs;
152 for (size_t i = 0; i < N; ++i)
153 inputs[i] = get_rand_input<T>(min_exp, max_exp);
155 uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
157 return total_time / N;
160 // Throughput benchmarking for functions that take 2 inputs.
161 template <size_t N = 1>
162 static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
163 int arg1_max_exp, int arg2_min_exp,
164 int arg2_max_exp) {
165 cpp::array<T, N> inputs1;
166 cpp::array<T, N> inputs2;
167 for (size_t i = 0; i < N; ++i) {
168 inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp);
169 inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp);
172 uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
174 return total_time / N;
178 } // namespace benchmarks
179 } // namespace LIBC_NAMESPACE_DECL
// Each macro below defines a Benchmark object named
// <SuiteName>_<TestName>_Instance; the suite and test names are also passed to
// the constructor as string literals.

// Passing -1 indicates the benchmark should be run with as many threads as
// allocated by the user in the benchmark's CMake.
#define BENCHMARK(SuiteName, TestName, Func)                                   \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, -1)

// Same as BENCHMARK, with an explicit thread count.
#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads)             \
  LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance(     \
      Func, #SuiteName, #TestName, NumThreads)

// Benchmark with a thread count of 1.
#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func)                   \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)

// Benchmark with a thread count of LIBC_NAMESPACE::gpu::get_lane_size().
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
                      LIBC_NAMESPACE::gpu::get_lane_size())
197 #endif