1 #include "LibcGpuBenchmark.h"
2 #include "src/__support/CPP/algorithm.h"
3 #include "src/__support/CPP/array.h"
4 #include "src/__support/CPP/atomic.h"
5 #include "src/__support/CPP/string.h"
6 #include "src/__support/FPUtil/sqrt.h"
7 #include "src/__support/GPU/utils.h"
8 #include "src/__support/fixedvector.h"
9 #include "src/__support/macros/config.h"
10 #include "src/stdio/printf.h"
11 #include "src/stdlib/srand.h"
12 #include "src/time/gpu/time_utils.h"
14 namespace LIBC_NAMESPACE_DECL
{
15 namespace benchmarks
{
17 FixedVector
<Benchmark
*, 64> benchmarks
;
19 void Benchmark::add_benchmark(Benchmark
*benchmark
) {
20 benchmarks
.push_back(benchmark
);
23 struct AtomicBenchmarkSums
{
24 cpp::Atomic
<uint64_t> cycles_sum
= 0;
25 cpp::Atomic
<uint64_t> standard_deviation_sum
= 0;
26 cpp::Atomic
<uint64_t> min
= UINT64_MAX
;
27 cpp::Atomic
<uint64_t> max
= 0;
28 cpp::Atomic
<uint32_t> samples_sum
= 0;
29 cpp::Atomic
<uint32_t> iterations_sum
= 0;
30 cpp::Atomic
<clock_t> time_sum
= 0;
31 cpp::Atomic
<uint64_t> active_threads
= 0;
34 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE
);
35 active_threads
.store(0, cpp::MemoryOrder::RELAXED
);
36 cycles_sum
.store(0, cpp::MemoryOrder::RELAXED
);
37 standard_deviation_sum
.store(0, cpp::MemoryOrder::RELAXED
);
38 min
.store(UINT64_MAX
, cpp::MemoryOrder::RELAXED
);
39 max
.store(0, cpp::MemoryOrder::RELAXED
);
40 samples_sum
.store(0, cpp::MemoryOrder::RELAXED
);
41 iterations_sum
.store(0, cpp::MemoryOrder::RELAXED
);
42 time_sum
.store(0, cpp::MemoryOrder::RELAXED
);
43 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE
);
46 void update(const BenchmarkResult
&result
) {
47 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE
);
48 active_threads
.fetch_add(1, cpp::MemoryOrder::RELAXED
);
50 cycles_sum
.fetch_add(result
.cycles
, cpp::MemoryOrder::RELAXED
);
51 standard_deviation_sum
.fetch_add(
52 static_cast<uint64_t>(result
.standard_deviation
),
53 cpp::MemoryOrder::RELAXED
);
55 // Perform a CAS loop to atomically update the min
56 uint64_t orig_min
= min
.load(cpp::MemoryOrder::RELAXED
);
57 while (!min
.compare_exchange_strong(
58 orig_min
, cpp::min(orig_min
, result
.min
), cpp::MemoryOrder::ACQUIRE
,
59 cpp::MemoryOrder::RELAXED
))
62 // Perform a CAS loop to atomically update the max
63 uint64_t orig_max
= max
.load(cpp::MemoryOrder::RELAXED
);
64 while (!max
.compare_exchange_strong(
65 orig_max
, cpp::max(orig_max
, result
.max
), cpp::MemoryOrder::ACQUIRE
,
66 cpp::MemoryOrder::RELAXED
))
69 samples_sum
.fetch_add(result
.samples
, cpp::MemoryOrder::RELAXED
);
70 iterations_sum
.fetch_add(result
.total_iterations
,
71 cpp::MemoryOrder::RELAXED
);
72 time_sum
.fetch_add(result
.total_time
, cpp::MemoryOrder::RELAXED
);
73 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE
);
77 AtomicBenchmarkSums all_results
;
// ANSI escape sequences used by print_header(): GREEN switches the terminal
// foreground colour to green, RESET restores the default attributes.
constexpr const char *GREEN = "\033[32m";
constexpr const char *RESET = "\033[0m";
81 void print_results(Benchmark
*b
) {
82 BenchmarkResult result
;
83 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE
);
84 int num_threads
= all_results
.active_threads
.load(cpp::MemoryOrder::RELAXED
);
86 all_results
.cycles_sum
.load(cpp::MemoryOrder::RELAXED
) / num_threads
;
87 result
.standard_deviation
=
88 all_results
.standard_deviation_sum
.load(cpp::MemoryOrder::RELAXED
) /
90 result
.min
= all_results
.min
.load(cpp::MemoryOrder::RELAXED
);
91 result
.max
= all_results
.max
.load(cpp::MemoryOrder::RELAXED
);
93 all_results
.samples_sum
.load(cpp::MemoryOrder::RELAXED
) / num_threads
;
94 result
.total_iterations
=
95 all_results
.iterations_sum
.load(cpp::MemoryOrder::RELAXED
) / num_threads
;
96 const uint64_t duration_ns
=
97 all_results
.time_sum
.load(cpp::MemoryOrder::RELAXED
) / num_threads
;
98 const uint64_t duration_us
= duration_ns
/ 1000;
99 const uint64_t duration_ms
= duration_ns
/ (1000 * 1000);
100 uint64_t converted_duration
= duration_ns
;
101 const char *time_unit
;
102 if (duration_ms
!= 0) {
103 converted_duration
= duration_ms
;
105 } else if (duration_us
!= 0) {
106 converted_duration
= duration_us
;
109 converted_duration
= duration_ns
;
112 result
.total_time
= converted_duration
;
113 // result.total_time =
114 // all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads;
115 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE
);
117 LIBC_NAMESPACE::printf(
118 "%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
119 b
->get_test_name().data(), result
.cycles
, result
.min
, result
.max
,
120 result
.total_iterations
, result
.total_time
, time_unit
,
121 static_cast<uint64_t>(result
.standard_deviation
), num_threads
);
124 void print_header() {
125 LIBC_NAMESPACE::printf("%s", GREEN
);
126 LIBC_NAMESPACE::printf("Running Suite: %-10s\n",
127 benchmarks
[0]->get_suite_name().data());
128 LIBC_NAMESPACE::printf("%s", RESET
);
130 "Benchmark | Cycles | Min | Max | "
131 "Iterations | Time / Iteration | Stddev | Threads |\n";
132 LIBC_NAMESPACE::printf(titles
.data());
134 cpp::string
separator(titles
.size(), '-');
135 separator
[titles
.size() - 1] = '\n';
136 LIBC_NAMESPACE::printf(separator
.data());
// Runs the registered benchmarks on the calling GPU thread and folds each
// result into the shared `all_results` accumulator.
// NOTE(review): the embedded line numbers jump (140 -> 144 -> 149 -> 154, and
// 156 -> 166), so several statements of this function are not visible in this
// listing. Anything done on those lines (header printing, clearing the
// accumulator, thread synchronization, printing results) must be confirmed
// against the complete file.
139 void Benchmark::run_benchmarks() {
// Index of the calling thread; compared against each benchmark's thread count
// below.
140 uint64_t id
= gpu::get_thread_id();
// Seed the libc random number generator from the processor clock.
144 LIBC_NAMESPACE::srand(gpu::processor_clock());
// Visit every benchmark in the global registry, in registration order.
149 for (Benchmark
*b
: benchmarks
) {
// This thread takes part when num_threads equals uint32_t(-1) (presumably
// the "use every thread" setting -- confirm) or when its id is below the
// requested thread count.
154 if (b
->num_threads
== static_cast<uint32_t>(-1) || id
< b
->num_threads
) {
// Execute the benchmark on this thread.
155 auto current_result
= b
->run();
// Merge this thread's measurements into the shared totals.
156 all_results
.update(current_result
);
166 BenchmarkResult
benchmark(const BenchmarkOptions
&options
,
167 cpp::function
<uint64_t(void)> wrapper_func
) {
168 BenchmarkResult result
;
169 RuntimeEstimationProgression rep
;
170 uint32_t total_iterations
= 0;
171 uint32_t iterations
= options
.initial_iterations
;
175 uint32_t samples
= 0;
176 uint64_t total_time
= 0;
177 uint64_t best_guess
= 0;
178 uint64_t cycles_squared
= 0;
179 uint64_t min
= UINT64_MAX
;
182 uint64_t overhead
= UINT64_MAX
;
183 int overhead_iterations
= 10;
184 for (int i
= 0; i
< overhead_iterations
; i
++)
185 overhead
= cpp::min(overhead
, LIBC_NAMESPACE::overhead());
187 for (int64_t time_budget
= options
.max_duration
; time_budget
>= 0;) {
188 uint64_t sample_cycles
= 0;
189 const clock_t start
= static_cast<double>(clock());
190 for (uint32_t i
= 0; i
< iterations
; i
++) {
191 auto wrapper_intermediate
= wrapper_func();
192 uint64_t current_result
= wrapper_intermediate
- overhead
;
193 max
= cpp::max(max
, current_result
);
194 min
= cpp::min(min
, current_result
);
195 sample_cycles
+= current_result
;
197 const clock_t end
= clock();
198 const clock_t duration_ns
=
199 ((end
- start
) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC
;
200 total_time
+= duration_ns
;
201 time_budget
-= duration_ns
;
203 cycles_squared
+= sample_cycles
* sample_cycles
;
205 total_iterations
+= iterations
;
206 const double change_ratio
=
207 rep
.compute_improvement({iterations
, sample_cycles
});
208 best_guess
= rep
.current_estimation
;
210 if (samples
>= options
.max_samples
|| iterations
>= options
.max_iterations
)
212 if (total_time
>= options
.min_duration
&& samples
>= options
.min_samples
&&
213 total_iterations
>= options
.min_iterations
&&
214 change_ratio
< options
.epsilon
)
217 iterations
*= options
.scaling_factor
;
219 result
.cycles
= best_guess
;
220 result
.standard_deviation
= fputil::sqrt
<double>(
221 static_cast<double>(cycles_squared
) / total_iterations
-
222 static_cast<double>(best_guess
* best_guess
));
225 result
.samples
= samples
;
226 result
.total_iterations
= total_iterations
;
227 result
.total_time
= total_time
/ total_iterations
;
231 } // namespace benchmarks
232 } // namespace LIBC_NAMESPACE_DECL