memcpy: hide some memory latencies
[nova-simd.git] / benchmarks / simd_unroll_benchmarks2.cpp
blob3ddfa07e726d58c681026866984b0288fce6c020
1 #include "benchmark_helpers.hpp"
3 #ifdef __SSE__
4 #include <xmmintrin.h>
5 #endif
7 #include "../simd_binary_arithmetic.hpp"
9 using namespace nova;
10 using namespace std;
// Global buffers shared by every benchmark in this file: `out` receives the
// results, `in1` and `in2` are the two operands. main() hands their begin()
// pointers to each benchmark together with n = 64.
// NOTE(review): the SSE kernels use aligned loads/stores, so this relies on
// aligned_array providing at least 16-byte alignment -- confirm in its definition.
12 aligned_array<float, 64> out, in1, in2;
14 #ifdef __SSE__
// bench_1: baseline kernel without unrolling. Each pass loads one __m128
// (four floats) from in1 and from in2, adds them and stores the sum to out,
// then advances all three pointers by four floats.
//
// out, in1, in2: float buffers; _mm_load_ps/_mm_store_ps require them to be
//                16-byte aligned.
// n:             element count; divided by 4 below to get the pass count.
//
// NOTE(review): the loop-opening lines are not visible in this extract. The
// trailing `while (--n);` suggests a do/while, in which case n < 4 would make
// the unsigned counter wrap around -- confirm. The caller shown in main()
// always passes 64.
15 void __noinline__ bench_1(float * out, float * in1, float * in2, unsigned int n)
// Convert the element count into the number of 4-float passes.
17 n /= 4;
21 __m128 lhs = _mm_load_ps(in1);
22 __m128 rhs = _mm_load_ps(in2);
23 __m128 result = _mm_add_ps(lhs, rhs);
24 _mm_store_ps(out, result);
// Step every pointer to the next 4-float vector.
25 in1 += 4;
26 in2 += 4;
27 out += 4;
// Pre-decrement: the loop continues while passes remain.
29 while (--n);
// bench_2: same addition as bench_1, unrolled by two. Each pass handles
// eight floats as two __m128 vectors; all four loads are written before the
// two adds, and the adds before the two stores.
//
// out, in1, in2: float buffers; _mm_load_ps/_mm_store_ps require them to be
//                16-byte aligned.
// n:             element count; divided by 8 below to get the pass count.
//
// NOTE(review): the loop-opening lines are not visible in this extract. The
// trailing `while (--n);` suggests a do/while, in which case n < 8 would make
// the unsigned counter wrap around -- confirm. The caller shown in main()
// always passes 64.
32 void __noinline__ bench_2(float * out, float * in1, float * in2, unsigned int n)
// Convert the element count into the number of 8-float passes.
34 n /= 8;
// Load both vectors of each operand up front.
38 __m128 lhs1 = _mm_load_ps(in1);
39 __m128 lhs2 = _mm_load_ps(in1+4);
40 __m128 rhs1 = _mm_load_ps(in2);
41 __m128 rhs2 = _mm_load_ps(in2+4);
42 __m128 result1 = _mm_add_ps(lhs1, rhs1);
43 __m128 result2 = _mm_add_ps(lhs2, rhs2);
44 _mm_store_ps(out, result1);
45 _mm_store_ps(out+4, result2);
// Step every pointer past the eight floats just processed.
46 in1 += 8;
47 in2 += 8;
48 out += 8;
// Pre-decrement: the loop continues while passes remain.
50 while (--n);
// bench_3: same addition as bench_1, unrolled by four. Each pass handles
// sixteen floats as four __m128 vectors; all eight loads are written first,
// then the four adds, then the four stores.
//
// out, in1, in2: float buffers; _mm_load_ps/_mm_store_ps require them to be
//                16-byte aligned.
// n:             element count; divided by 16 below to get the pass count.
//
// NOTE(review): the loop-opening lines are not visible in this extract. The
// trailing `while (--n);` suggests a do/while, in which case n < 16 would make
// the unsigned counter wrap around -- confirm. The caller shown in main()
// always passes 64.
53 void __noinline__ bench_3(float * out, float * in1, float * in2, unsigned int n)
// Convert the element count into the number of 16-float passes.
55 n /= 16;
// Load the four vectors of the left operand...
59 __m128 lhs1 = _mm_load_ps(in1);
60 __m128 lhs2 = _mm_load_ps(in1+4);
61 __m128 lhs3 = _mm_load_ps(in1+8);
62 __m128 lhs4 = _mm_load_ps(in1+12);
// ...and the four vectors of the right operand.
63 __m128 rhs1 = _mm_load_ps(in2);
64 __m128 rhs2 = _mm_load_ps(in2+4);
65 __m128 rhs3 = _mm_load_ps(in2+8);
66 __m128 rhs4 = _mm_load_ps(in2+12);
67 __m128 result1 = _mm_add_ps(lhs1, rhs1);
68 __m128 result2 = _mm_add_ps(lhs2, rhs2);
69 __m128 result3 = _mm_add_ps(lhs3, rhs3);
70 __m128 result4 = _mm_add_ps(lhs4, rhs4);
71 _mm_store_ps(out, result1);
72 _mm_store_ps(out+4, result2);
73 _mm_store_ps(out+8, result3);
74 _mm_store_ps(out+12, result4);
// Step every pointer past the sixteen floats just processed.
75 in1 += 16;
76 in2 += 16;
77 out += 16;
// Pre-decrement: the loop continues while passes remain.
79 while (--n);
81 #endif
// bench_4: library counterpart of the hand-written kernels. Each pass calls
// nova::plus_vec_simd<8>(out, in1, in2) and then advances all three pointers
// by eight floats. This function sits outside the __SSE__ guard.
//
// out, in1, in2: float buffers passed straight through to plus_vec_simd.
// n:             element count; divided by 8 below to get the pass count.
//
// NOTE(review): plus_vec_simd<8> presumably adds eight floats of in1 and in2
// into out -- confirm in ../simd_binary_arithmetic.hpp.
// NOTE(review): the loop-opening lines are not visible in this extract. The
// trailing `while (--n);` suggests a do/while, in which case n < 8 would make
// the unsigned counter wrap around -- confirm.
84 void __noinline__ bench_4(float * out, float * in1, float * in2, unsigned int n)
// Convert the element count into the number of 8-float passes.
86 n /= 8;
90 nova::plus_vec_simd<8>(out, in1, in2);
91 in1 += 8;
92 in2 += 8;
93 out += 8;
// Pre-decrement: the loop continues while passes remain.
95 while (--n);
// bench_5: like bench_4 but with a chunk size of sixteen. Each pass calls
// nova::plus_vec_simd<16>(out, in1, in2) and then advances all three pointers
// by sixteen floats. This function sits outside the __SSE__ guard.
//
// out, in1, in2: float buffers passed straight through to plus_vec_simd.
// n:             element count; divided by 16 below to get the pass count.
//
// NOTE(review): plus_vec_simd<16> presumably adds sixteen floats of in1 and
// in2 into out -- confirm in ../simd_binary_arithmetic.hpp.
// NOTE(review): the loop-opening lines are not visible in this extract. The
// trailing `while (--n);` suggests a do/while, in which case n < 16 would make
// the unsigned counter wrap around -- confirm.
98 void __noinline__ bench_5(float * out, float * in1, float * in2, unsigned int n)
// Convert the element count into the number of 16-float passes.
100 n /= 16;
104 nova::plus_vec_simd<16>(out, in1, in2);
105 in1 += 16;
106 in2 += 16;
107 out += 16;
// Pre-decrement: the loop continues while passes remain.
109 while (--n);
// Benchmark driver: fills the three global buffers with 0.f, then hands each
// benchmark function to run_bench, bound to the buffers' begin() pointers and
// an element count of 64.
// NOTE(review): run_bench presumably invokes the bound callable `iterations`
// times and reports the timing -- confirm in benchmark_helpers.hpp.
114 int main(void)
// Give every buffer defined contents before any kernel reads it.
116 out.assign(0.f);
117 in1.assign(0.f);
118 in2.assign(0.f);
// Second argument passed to every run_bench call below.
120 const unsigned int iterations = 100000000;
// The hand-written intrinsic kernels only exist when __SSE__ is defined.
122 #ifdef __SSE__
123 run_bench(boost::bind(bench_1, out.begin(), in1.begin(), in2.begin(), 64), iterations);
124 run_bench(boost::bind(bench_2, out.begin(), in1.begin(), in2.begin(), 64), iterations);
125 run_bench(boost::bind(bench_3, out.begin(), in1.begin(), in2.begin(), 64), iterations);
126 #endif
// The plus_vec_simd-based kernels are run regardless of __SSE__.
127 run_bench(boost::bind(bench_4, out.begin(), in1.begin(), in2.begin(), 64), iterations);
128 run_bench(boost::bind(bench_5, out.begin(), in1.begin(), in2.begin(), 64), iterations);