1 #include "benchmark_helpers.hpp"
7 #include "../simd_binary_arithmetic.hpp"
// Three buffers of type aligned_array<float, 64>, named out, in1 and in2.
// Their begin() results are what the run_bench(...) calls further down pass
// to every bench_N function as its out / in1 / in2 arguments.
// NOTE(review): aligned_array is declared outside this excerpt; presumably it
// provides storage aligned suitably for _mm_load_ps/_mm_store_ps and 64 is the
// element count -- confirm against its definition.
12 aligned_array
<float, 64> out
, in1
, in2
;
// bench_1: hand-written SSE addition, one vector per visible step.
//
// Visible work: load four packed floats from in1 and from in2, add them
// lane-wise, and store the four sums to out. _mm_load_ps / _mm_store_ps are
// the aligned variants, so all three pointers must be 16-byte aligned.
//
// Parameters:
//   out - destination for the sums
//   in1 - left-hand operand
//   in2 - right-hand operand
//   n   - not referenced by any statement visible here
//
// The parameter names shadow the out/in1/in2 buffers declared above.
//
// NOTE(review): this listing omits source lines between the signature and the
// first load, and after the store (embedded numbering jumps 15 -> 21 and
// 24 -> 32). Braces, and possibly a loop over n with pointer increments, live
// there -- confirm against the full file before relying on this description.
// NOTE(review): __noinline__ comes from outside this excerpt; presumably a
// macro that keeps the compiler from inlining the function -- TODO confirm.
15 void __noinline__
bench_1(float * out
, float * in1
, float * in2
, unsigned int n
)
// Aligned load of in1[0..3].
21 __m128 lhs
= _mm_load_ps(in1
);
// Aligned load of in2[0..3].
22 __m128 rhs
= _mm_load_ps(in2
);
// Lane-wise single-precision add.
23 __m128 result
= _mm_add_ps(lhs
, rhs
);
// Aligned store to out[0..3].
24 _mm_store_ps(out
, result
);
// bench_2: hand-written SSE addition, two vectors (8 floats) per visible step.
//
// Visible work: load in1[0..3], in1[4..7], in2[0..3], in2[4..7]; add the
// matching pairs lane-wise; store the sums to out[0..3] and out[4..7].
// All loads are issued before the adds, and both adds before the stores.
// _mm_load_ps / _mm_store_ps are the aligned variants, so every address used
// (base and base + 4 floats) must be 16-byte aligned.
//
// Parameters:
//   out - destination for the sums
//   in1 - left-hand operand
//   in2 - right-hand operand
//   n   - not referenced by any statement visible here
//
// NOTE(review): source lines are missing from this listing around the body
// (embedded numbering jumps 32 -> 38 and 45 -> 53); braces and any loop over n
// are not shown -- confirm against the full file.
// NOTE(review): __noinline__ is defined outside this excerpt; presumably it
// prevents inlining -- TODO confirm.
32 void __noinline__
bench_2(float * out
, float * in1
, float * in2
, unsigned int n
)
// Left-hand operand: first and second group of four floats.
38 __m128 lhs1
= _mm_load_ps(in1
);
39 __m128 lhs2
= _mm_load_ps(in1
+4);
// Right-hand operand: first and second group of four floats.
40 __m128 rhs1
= _mm_load_ps(in2
);
41 __m128 rhs2
= _mm_load_ps(in2
+4);
// Two independent lane-wise adds.
42 __m128 result1
= _mm_add_ps(lhs1
, rhs1
);
43 __m128 result2
= _mm_add_ps(lhs2
, rhs2
);
// Write both result vectors back.
44 _mm_store_ps(out
, result1
);
45 _mm_store_ps(out
+4, result2
);
// bench_3: hand-written SSE addition, four vectors (16 floats) per visible
// step.
//
// Visible work: load four 4-float groups from in1 (offsets 0, 4, 8, 12) and
// the same four groups from in2, add matching groups lane-wise, then store the
// four result vectors to out at offsets 0, 4, 8, 12. All eight loads come
// first, then the four adds, then the four stores.
// _mm_load_ps / _mm_store_ps are the aligned variants, so every address used
// must be 16-byte aligned.
//
// Parameters:
//   out - destination for the sums
//   in1 - left-hand operand
//   in2 - right-hand operand
//   n   - not referenced by any statement visible here
//
// NOTE(review): source lines are missing from this listing around the body
// (embedded numbering jumps 53 -> 59 and 74 -> 84); braces and any loop over n
// are not shown -- confirm against the full file.
// NOTE(review): __noinline__ is defined outside this excerpt; presumably it
// prevents inlining -- TODO confirm.
53 void __noinline__
bench_3(float * out
, float * in1
, float * in2
, unsigned int n
)
// Left-hand operand: four consecutive 4-float groups.
59 __m128 lhs1
= _mm_load_ps(in1
);
60 __m128 lhs2
= _mm_load_ps(in1
+4);
61 __m128 lhs3
= _mm_load_ps(in1
+8);
62 __m128 lhs4
= _mm_load_ps(in1
+12);
// Right-hand operand: the matching four groups.
63 __m128 rhs1
= _mm_load_ps(in2
);
64 __m128 rhs2
= _mm_load_ps(in2
+4);
65 __m128 rhs3
= _mm_load_ps(in2
+8);
66 __m128 rhs4
= _mm_load_ps(in2
+12);
// Four independent lane-wise adds.
67 __m128 result1
= _mm_add_ps(lhs1
, rhs1
);
68 __m128 result2
= _mm_add_ps(lhs2
, rhs2
);
69 __m128 result3
= _mm_add_ps(lhs3
, rhs3
);
70 __m128 result4
= _mm_add_ps(lhs4
, rhs4
);
// Write the four result vectors back at the same offsets.
71 _mm_store_ps(out
, result1
);
72 _mm_store_ps(out
+4, result2
);
73 _mm_store_ps(out
+8, result3
);
74 _mm_store_ps(out
+12, result4
);
// bench_4: library counterpart of the hand-written benchmarks above.
//
// Visible work: a single call to nova::plus_vec_simd<8>(out, in1, in2).
//
// Parameters:
//   out - forwarded as the first argument
//   in1 - forwarded as the second argument
//   in2 - forwarded as the third argument
//   n   - not referenced by any statement visible here
//
// NOTE(review): plus_vec_simd is declared outside this excerpt (likely in
// ../simd_binary_arithmetic.hpp); presumably the template argument 8 is the
// number of floats added per call, making it comparable to bench_2 -- confirm
// against the header.
// NOTE(review): source lines are missing from this listing around the call
// (embedded numbering jumps 84 -> 90 and 90 -> 98); braces and any loop over n
// are not shown.
84 void __noinline__
bench_4(float * out
, float * in1
, float * in2
, unsigned int n
)
90 nova::plus_vec_simd
<8>(out
, in1
, in2
);
// bench_5: same shape as bench_4 but instantiated with 16 instead of 8.
//
// Visible work: a single call to nova::plus_vec_simd<16>(out, in1, in2).
//
// Parameters:
//   out - forwarded as the first argument
//   in1 - forwarded as the second argument
//   in2 - forwarded as the third argument
//   n   - not referenced by any statement visible here
//
// NOTE(review): plus_vec_simd is declared outside this excerpt; presumably the
// template argument 16 is the number of floats added per call, making it
// comparable to bench_3 -- confirm against the header.
// NOTE(review): source lines are missing from this listing around the call
// (embedded numbering jumps 98 -> 104 and 104 -> 120); braces and any loop
// over n are not shown.
98 void __noinline__
bench_5(float * out
, float * in1
, float * in2
, unsigned int n
)
104 nova::plus_vec_simd
<16>(out
, in1
, in2
);
// Benchmark driver statements. The enclosing function header is not visible in
// this listing.
//
// Each call binds one bench_N function to the same four arguments -- the
// begin() of the out, in1 and in2 buffers plus the literal 64 for n -- and
// hands the bound callable to run_bench together with `iterations`.
// NOTE(review): run_bench is declared outside this excerpt; presumably it
// invokes the callable `iterations` times and reports the elapsed time --
// confirm in benchmark_helpers.hpp.
// NOTE(review): 64 matches the second template argument of the
// aligned_array<float, 64> buffers, so it is presumably the element count.
120 const unsigned int iterations
= 100000000;
// Hand-written SSE variants: 1, 2 and 4 vectors per visible step.
123 run_bench(boost::bind(bench_1
, out
.begin(), in1
.begin(), in2
.begin(), 64), iterations
);
124 run_bench(boost::bind(bench_2
, out
.begin(), in1
.begin(), in2
.begin(), 64), iterations
);
125 run_bench(boost::bind(bench_3
, out
.begin(), in1
.begin(), in2
.begin(), 64), iterations
);
// Library variants: nova::plus_vec_simd<8> and <16>.
127 run_bench(boost::bind(bench_4
, out
.begin(), in1
.begin(), in2
.begin(), 64), iterations
);
128 run_bench(boost::bind(bench_5
, out
.begin(), in1
.begin(), in2
.begin(), 64), iterations
);