1 #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
2 #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
4 #include "benchmarks/gpu/BenchmarkLogger.h"
5 #include "benchmarks/gpu/timing/timing.h"
6 #include "src/__support/CPP/array.h"
7 #include "src/__support/CPP/functional.h"
8 #include "src/__support/CPP/limits.h"
9 #include "src/__support/CPP/string_view.h"
10 #include "src/__support/CPP/type_traits.h"
11 #include "src/__support/FPUtil/FPBits.h"
12 #include "src/__support/macros/config.h"
13 #include "src/stdlib/rand.h"
14 #include "src/time/clock.h"
18 namespace LIBC_NAMESPACE_DECL
{
20 namespace benchmarks
{
// Settings handed to benchmark(). Every field has a default, so a
// default-constructed BenchmarkOptions is directly usable.
struct BenchmarkOptions {
  // Iteration-count settings.
  uint32_t initial_iterations{1};
  uint32_t min_iterations{1};
  uint32_t max_iterations{10000000};

  // Sample-count settings.
  uint32_t min_samples{4};
  uint32_t max_samples{1000};

  // Duration settings, expressed in nanoseconds.
  int64_t min_duration{500 * 1000};         // 500 us
  int64_t max_duration{1000 * 1000 * 1000}; // 1 second

  double epsilon{0.0001};
  double scaling_factor{1.4};
};
// NOTE(review): the header of the enclosing struct is not visible in this
// view. These look like the fields of Measurement, because update() reads
// M.iterations and M.elapsed_cycles from a Measurement — confirm.
// Iteration count of one measurement; defaults to 0.
35 uint32_t iterations
= 0;
// Cycle count of one measurement; defaults to 0.
36 uint64_t elapsed_cycles
= 0;
39 class RefinableRuntimeEstimation
{
40 uint64_t total_cycles
= 0;
41 uint32_t total_iterations
= 0;
44 uint64_t update(const Measurement
&M
) {
45 total_cycles
+= M
.elapsed_cycles
;
46 total_iterations
+= M
.iterations
;
47 return total_cycles
/ total_iterations
;
// Owns a RefinableRuntimeEstimation and remembers the latest estimate it
// produced, so that consecutive estimates can be related to each other.
// NOTE(review): several lines of this class are not visible in this view; the
// comments below describe only the statements that are.
51 // Tracks the progression of the runtime estimation
52 class RuntimeEstimationProgression
{
53 RefinableRuntimeEstimation rre
;
// Latest value returned by rre.update(); 0 before the first update.
56 uint64_t current_estimation
= 0;
// Feeds M into rre to obtain a new estimate, evaluates
// (current_estimation / new_estimation) - 1.0 in double precision, and then
// stores the new estimate in current_estimation.
// NOTE(review): how the ratio is post-processed and returned is not visible
// here — confirm against the full source.
58 double compute_improvement(const Measurement
&M
) {
59 const uint64_t new_estimation
= rre
.update(M
);
61 (static_cast<double>(current_estimation
) / new_estimation
) - 1.0;
67 current_estimation
= new_estimation
;
// Aggregate returned by benchmark() and Benchmark::run().
// NOTE(review): some fields of this struct are not visible in this view.
72 struct BenchmarkResult
{
74 double standard_deviation
= 0;
// Starts at UINT64_MAX — presumably so that the first recorded value always
// replaces it; confirm against the code that fills in the result.
75 uint64_t min
= UINT64_MAX
;
78 uint32_t total_iterations
= 0;
// Stored as clock_t; the unit is not established in this header.
79 clock_t total_time
= 0;
82 BenchmarkResult
benchmark(const BenchmarkOptions
&options
,
83 cpp::function
<uint64_t(void)> wrapper_func
);
// Immutable per-benchmark state; each member is initialized from the
// constructor argument of the same name.
// Callable under test; handed to benchmark() by run().
86 const cpp::function
<uint64_t(void)> func
;
// Returned by get_suite_name().
87 const cpp::string_view suite_name
;
// Returned by get_test_name().
88 const cpp::string_view test_name
;
// Thread count; the BENCHMARK macro passes -1 here, which converts to the
// largest uint32_t value.
89 const uint32_t num_threads
;
// Constructs a benchmark: copies func and num_threads into the members of
// the same name and initializes the suite_name / test_name string_view
// members from the given C strings.
// NOTE(review): the constructor body is not visible in this view.
92 Benchmark(cpp::function
<uint64_t(void)> func
, char const *suite_name
,
93 char const *test_name
, uint32_t num_threads
)
94 : func(func
), suite_name(suite_name
), test_name(test_name
),
95 num_threads(num_threads
) {
// Static entry point; declared here and defined elsewhere.
// NOTE(review): presumably runs every benchmark handed to add_benchmark() —
// confirm against the definition.
99 static void run_benchmarks();
100 const cpp::string_view
get_suite_name() const { return suite_name
; }
101 const cpp::string_view
get_test_name() const { return test_name
; }
// Static hook that receives a pointer to a Benchmark; declared here and
// defined elsewhere.
// NOTE(review): presumably records the benchmark so run_benchmarks() can
// find it — confirm against the definition.
104 static void add_benchmark(Benchmark
*benchmark
);
107 BenchmarkResult
run() {
108 BenchmarkOptions options
;
109 return benchmark(options
, func
);
// Produces a random floating-point value of type T.
// Output: a random number with the exponent field between min_exp and max_exp,
// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1).
// Special exponent values:
//   -EXP_BIAS corresponds to denormal values,
//   EXP_BIAS + 1 corresponds to inf or nan.
// Parameters (note the order: max_exp comes first, then min_exp):
//   max_exp - defaults to FPBits<T>::EXP_BIAS.
//   min_exp - defaults to -FPBits<T>::EXP_BIAS.
// NOTE(review): several lines of this function are not visible in this view;
// the comments below describe only the statements that are.
119 template <typename T
>
121 get_rand_input(int max_exp
= LIBC_NAMESPACE::fputil::FPBits
<T
>::EXP_BIAS
,
122 int min_exp
= -LIBC_NAMESPACE::fputil::FPBits
<T
>::EXP_BIAS
) {
123 using FPBits
= LIBC_NAMESPACE::fputil::FPBits
<T
>;
125 // Required to correctly instantiate FPBits for floats and doubles.
126 using RandType
= typename
cpp::conditional_t
<(cpp::is_same_v
<T
, double>),
// Fills `bits` from rand(): two calls combined into one 64-bit value in the
// first branch, a single call in the other.
// NOTE(review): the condition compares T with uint64_t, while T is used as
// the FPBits type parameter in this function; presumably RandType was meant
// — confirm.
129 if constexpr (cpp::is_same_v
<T
, uint64_t>)
130 bits
= (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
131 static_cast<uint64_t>(LIBC_NAMESPACE::rand());
133 bits
= LIBC_NAMESPACE::rand();
// (max_exp - min_exp + 1) / (2 * EXP_BIAS + 1), evaluated in double; this is
// presumably the `scale` factor used below.
135 static_cast<double>(max_exp
- min_exp
+ 1) / (2 * FPBits::EXP_BIAS
+ 1);
// Replaces fp's biased exponent with (old biased exponent * scale + min_exp),
// converted to uint32_t.
137 fp
.set_biased_exponent(
138 static_cast<uint32_t>(fp
.get_biased_exponent() * scale
+ min_exp
));
// Helper class template, parameterized on the type T that the benchmarked
// functions take and return; it hosts the run_throughput_in_range helpers.
142 template <typename T
> class MathPerf
{
// Bit-level view of T.
143 using FPBits
= fputil::FPBits
<T
>;
// Storage type that FPBits uses for T.
144 using StorageType
= typename
FPBits::StorageType
;
// Largest value representable in StorageType.
145 static constexpr StorageType UIntMax
=
146 cpp::numeric_limits
<StorageType
>::max();
149 template <size_t N
= 1>
150 static uint64_t run_throughput_in_range(T
f(T
), int min_exp
, int max_exp
) {
151 cpp::array
<T
, N
> inputs
;
152 for (size_t i
= 0; i
< N
; ++i
)
153 inputs
[i
] = get_rand_input
<T
>(min_exp
, max_exp
);
155 uint64_t total_time
= LIBC_NAMESPACE::throughput(f
, inputs
);
157 return total_time
/ N
;
160 // Throughput benchmarking for functions that take 2 inputs.
161 template <size_t N
= 1>
162 static uint64_t run_throughput_in_range(T
f(T
, T
), int arg1_min_exp
,
163 int arg1_max_exp
, int arg2_min_exp
,
165 cpp::array
<T
, N
> inputs1
;
166 cpp::array
<T
, N
> inputs2
;
167 for (size_t i
= 0; i
< N
; ++i
) {
168 inputs1
[i
] = get_rand_input
<T
>(arg1_min_exp
, arg1_max_exp
);
169 inputs2
[i
] = get_rand_input
<T
>(arg2_min_exp
, arg2_max_exp
);
172 uint64_t total_time
= LIBC_NAMESPACE::throughput(f
, inputs1
, inputs2
);
174 return total_time
/ N
;
178 } // namespace benchmarks
179 } // namespace LIBC_NAMESPACE_DECL
// Defines a Benchmark object named <SuiteName>_<TestName>_Instance, built
// from Func and the stringified suite and test names. The -1 thread count is
// converted to the constructor's uint32_t num_threads parameter.
181 // Passing -1 indicates the benchmark should be run with as many threads as
182 // allocated by the user in the benchmark's CMake.
183 #define BENCHMARK(SuiteName, TestName, Func) \
184 LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
185 Func, #SuiteName, #TestName, -1)
// Defines a Benchmark object named <SuiteName>_<TestName>_Instance, built
// from Func and the stringified suite and test names, with NumThreads passed
// as the constructor's num_threads argument.
187 #define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads) \
188 LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
189 Func, #SuiteName, #TestName, NumThreads)
// Shorthand for BENCHMARK_N_THREADS with a thread count of 1.
191 #define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func) \
192 BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)
// Shorthand for BENCHMARK_N_THREADS with the thread count taken from
// LIBC_NAMESPACE::gpu::get_lane_size().
// NOTE(review): no GPU utility header is included directly by this file;
// presumably it arrives through another include — confirm.
194 #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
195 BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
196 LIBC_NAMESPACE::gpu::get_lane_size())