1 #include "LibcGpuBenchmark.h"
2 #include "src/__support/CPP/algorithm.h"
3 #include "src/__support/CPP/array.h"
4 #include "src/__support/CPP/atomic.h"
5 #include "src/__support/CPP/string.h"
6 #include "src/__support/FPUtil/sqrt.h"
7 #include "src/__support/GPU/utils.h"
8 #include "src/__support/fixedvector.h"
9 #include "src/__support/macros/config.h"
10 #include "src/stdio/printf.h"
11 #include "src/time/gpu/time_utils.h"
13 namespace LIBC_NAMESPACE_DECL
{
14 namespace benchmarks
{
16 FixedVector
<Benchmark
*, 64> benchmarks
;
18 void Benchmark::add_benchmark(Benchmark
*benchmark
) {
19 benchmarks
.push_back(benchmark
);
22 struct AtomicBenchmarkSums
{
23 cpp::Atomic
<uint64_t> cycles_sum
= 0;
24 cpp::Atomic
<uint64_t> standard_deviation_sum
= 0;
25 cpp::Atomic
<uint64_t> min
= UINT64_MAX
;
26 cpp::Atomic
<uint64_t> max
= 0;
27 cpp::Atomic
<uint32_t> samples_sum
= 0;
28 cpp::Atomic
<uint32_t> iterations_sum
= 0;
29 cpp::Atomic
<clock_t> time_sum
= 0;
30 cpp::Atomic
<uint64_t> active_threads
= 0;
33 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE
);
34 active_threads
.store(0, cpp::MemoryOrder::RELAXED
);
35 cycles_sum
.store(0, cpp::MemoryOrder::RELAXED
);
36 standard_deviation_sum
.store(0, cpp::MemoryOrder::RELAXED
);
37 min
.store(UINT64_MAX
, cpp::MemoryOrder::RELAXED
);
38 max
.store(0, cpp::MemoryOrder::RELAXED
);
39 samples_sum
.store(0, cpp::MemoryOrder::RELAXED
);
40 iterations_sum
.store(0, cpp::MemoryOrder::RELAXED
);
41 time_sum
.store(0, cpp::MemoryOrder::RELAXED
);
42 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE
);
45 void update(const BenchmarkResult
&result
) {
46 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE
);
47 active_threads
.fetch_add(1, cpp::MemoryOrder::RELAXED
);
49 cycles_sum
.fetch_add(result
.cycles
, cpp::MemoryOrder::RELAXED
);
50 standard_deviation_sum
.fetch_add(
51 static_cast<uint64_t>(result
.standard_deviation
),
52 cpp::MemoryOrder::RELAXED
);
54 // Perform a CAS loop to atomically update the min
55 uint64_t orig_min
= min
.load(cpp::MemoryOrder::RELAXED
);
56 while (!min
.compare_exchange_strong(
57 orig_min
, cpp::min(orig_min
, result
.min
), cpp::MemoryOrder::ACQUIRE
,
58 cpp::MemoryOrder::RELAXED
))
61 // Perform a CAS loop to atomically update the max
62 uint64_t orig_max
= max
.load(cpp::MemoryOrder::RELAXED
);
63 while (!max
.compare_exchange_strong(
64 orig_max
, cpp::max(orig_max
, result
.max
), cpp::MemoryOrder::ACQUIRE
,
65 cpp::MemoryOrder::RELAXED
))
68 samples_sum
.fetch_add(result
.samples
, cpp::MemoryOrder::RELAXED
);
69 iterations_sum
.fetch_add(result
.total_iterations
,
70 cpp::MemoryOrder::RELAXED
);
71 time_sum
.fetch_add(result
.total_time
, cpp::MemoryOrder::RELAXED
);
72 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE
);
76 AtomicBenchmarkSums all_results
;
77 constexpr auto GREEN
= "\033[32m";
78 constexpr auto RESET
= "\033[0m";
80 void print_results(Benchmark
*b
) {
81 BenchmarkResult result
;
82 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE
);
83 int num_threads
= all_results
.active_threads
.load(cpp::MemoryOrder::RELAXED
);
85 all_results
.cycles_sum
.load(cpp::MemoryOrder::RELAXED
) / num_threads
;
86 result
.standard_deviation
=
87 all_results
.standard_deviation_sum
.load(cpp::MemoryOrder::RELAXED
) /
89 result
.min
= all_results
.min
.load(cpp::MemoryOrder::RELAXED
);
90 result
.max
= all_results
.max
.load(cpp::MemoryOrder::RELAXED
);
92 all_results
.samples_sum
.load(cpp::MemoryOrder::RELAXED
) / num_threads
;
93 result
.total_iterations
=
94 all_results
.iterations_sum
.load(cpp::MemoryOrder::RELAXED
) / num_threads
;
95 const uint64_t duration_ns
=
96 all_results
.time_sum
.load(cpp::MemoryOrder::RELAXED
) / num_threads
;
97 const uint64_t duration_us
= duration_ns
/ 1000;
98 const uint64_t duration_ms
= duration_ns
/ (1000 * 1000);
99 uint64_t converted_duration
= duration_ns
;
100 const char *time_unit
;
101 if (duration_ms
!= 0) {
102 converted_duration
= duration_ms
;
104 } else if (duration_us
!= 0) {
105 converted_duration
= duration_us
;
108 converted_duration
= duration_ns
;
111 result
.total_time
= converted_duration
;
112 // result.total_time =
113 // all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
114 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE
);
116 LIBC_NAMESPACE::printf(
117 "%-20s |%8ld |%8ld |%8ld |%11d |%9ld %2s |%9ld |%9d |\n",
118 b
->get_test_name().data(), result
.cycles
, result
.min
, result
.max
,
119 result
.total_iterations
, result
.total_time
, time_unit
,
120 static_cast<uint64_t>(result
.standard_deviation
), num_threads
);
123 void print_header() {
124 LIBC_NAMESPACE::printf("%s", GREEN
);
125 LIBC_NAMESPACE::printf("Running Suite: %-10s\n",
126 benchmarks
[0]->get_suite_name().data());
127 LIBC_NAMESPACE::printf("%s", RESET
);
128 LIBC_NAMESPACE::printf("Benchmark | Cycles | Min | Max | "
130 "Time | Stddev | Threads |\n");
131 LIBC_NAMESPACE::printf(
132 "---------------------------------------------------------------------"
133 "--------------------------------\n");
136 void Benchmark::run_benchmarks() {
137 uint64_t id
= gpu::get_thread_id();
144 for (Benchmark
*b
: benchmarks
) {
149 if (b
->num_threads
== static_cast<uint32_t>(-1) || id
< b
->num_threads
) {
150 auto current_result
= b
->run();
151 all_results
.update(current_result
);
161 BenchmarkResult
benchmark(const BenchmarkOptions
&options
,
162 cpp::function
<uint64_t(void)> wrapper_func
) {
163 BenchmarkResult result
;
164 RuntimeEstimationProgression rep
;
165 uint32_t total_iterations
= 0;
166 uint32_t iterations
= options
.initial_iterations
;
170 uint32_t samples
= 0;
171 uint64_t total_time
= 0;
172 uint64_t best_guess
= 0;
173 uint64_t cycles_squared
= 0;
174 uint64_t min
= UINT64_MAX
;
177 uint64_t overhead
= UINT64_MAX
;
178 int overhead_iterations
= 10;
179 for (int i
= 0; i
< overhead_iterations
; i
++)
180 overhead
= cpp::min(overhead
, LIBC_NAMESPACE::overhead());
182 for (int64_t time_budget
= options
.max_duration
; time_budget
>= 0;) {
183 uint64_t sample_cycles
= 0;
184 const clock_t start
= static_cast<double>(clock());
185 for (uint32_t i
= 0; i
< iterations
; i
++) {
186 auto wrapper_intermediate
= wrapper_func();
187 uint64_t current_result
= wrapper_intermediate
- overhead
;
188 max
= cpp::max(max
, current_result
);
189 min
= cpp::min(min
, current_result
);
190 sample_cycles
+= current_result
;
192 const clock_t end
= clock();
193 const clock_t duration_ns
=
194 ((end
- start
) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC
;
195 total_time
+= duration_ns
;
196 time_budget
-= duration_ns
;
198 cycles_squared
+= sample_cycles
* sample_cycles
;
200 total_iterations
+= iterations
;
201 const double change_ratio
=
202 rep
.compute_improvement({iterations
, sample_cycles
});
203 best_guess
= rep
.current_estimation
;
205 if (samples
>= options
.max_samples
|| iterations
>= options
.max_iterations
)
207 if (total_time
>= options
.min_duration
&& samples
>= options
.min_samples
&&
208 change_ratio
< options
.epsilon
)
211 iterations
*= options
.scaling_factor
;
213 result
.cycles
= best_guess
;
214 result
.standard_deviation
= fputil::sqrt
<double>(
215 static_cast<double>(cycles_squared
) / total_iterations
-
216 static_cast<double>(best_guess
* best_guess
));
219 result
.samples
= samples
;
220 result
.total_iterations
= total_iterations
;
221 result
.total_time
= total_time
;
225 } // namespace benchmarks
226 } // namespace LIBC_NAMESPACE_DECL