memcpy: hide some memory latencies
[nova-simd.git] / detail / unroll_helpers.hpp
blob ff1f9611939d3985e97449b77efa2848f7c761ac
1 // unroll helpers
2 // Copyright (C) 2010 Tim Blechmann
3 //
4 // This program is free software; you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation; either version 2 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with this program; see the file COPYING. If not, write to
16 // the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 // Boston, MA 02111-1307, USA.
19 #ifndef NOVA_SIMD_DETAIL_UNROLL_HELPERS_HPP
20 #define NOVA_SIMD_DETAIL_UNROLL_HELPERS_HPP
22 #include "../vec.hpp"
// always_inline: plain `inline` by default; on GCC-compatible compilers with NDEBUG
// defined it additionally carries the always_inline attribute.
// The macro is local to this header and is #undef'd again at the end of the file.
#if !defined(__GNUC__) || !defined(NDEBUG)
#define always_inline inline
#else
#define always_inline inline __attribute__((always_inline))
#endif
30 namespace nova {
31 namespace detail {
33 template <typename FloatType,
34 int N
36 struct compile_time_unroller
38 typedef vec<FloatType> vec_type;
40 static const int offset = vec_type::size;
42 template <typename arg1_type,
43 typename Functor
45 static always_inline void run(FloatType * out, arg1_type & in1, Functor const & f)
47 compile_time_unroller<FloatType, N>::mp_iteration_1(out, in1.consume(), in1, f);
50 template <typename arg1_type,
51 typename arg2_type,
52 typename Functor
54 static always_inline void run(FloatType * out, arg1_type & in1, arg2_type & in2, Functor const & f)
56 compile_time_unroller<FloatType, N>::mp_iteration_2(out, in1.consume(), in1, in2.consume(), in2, f);
59 template <typename arg1_type,
60 typename arg2_type,
61 typename arg3_type,
62 typename Functor
64 static always_inline void run(FloatType * out, arg1_type & in1, arg2_type & in2,
65 arg3_type & in3, Functor const & f)
67 compile_time_unroller<FloatType, N>::mp_iteration_3(out, in1.consume(), in1, in2.consume(), in2, in3.consume(), in3, f);
70 template <typename arg1_type,
71 typename arg2_type,
72 typename arg3_type,
73 typename arg4_type,
74 typename Functor
76 static always_inline void run(FloatType * out, arg1_type & in1, arg2_type & in2,
77 arg3_type & in3, arg4_type & in4, Functor const & f)
79 compile_time_unroller<FloatType, N>::mp_iteration_4(out, in1.consume(), in1, in2.consume(), in2,
80 in3.consume(), in3, in4.consume(), in4, f);
83 private:
84 friend struct compile_time_unroller<FloatType, vec_type::size + N>;
86 template <typename arg1_type,
87 typename Functor
89 static always_inline void mp_iteration_1(FloatType * out, vec_type loaded_in1, arg1_type & in1, Functor const & f)
91 vec_type loaded_next_in1;
92 if (N != offset)
93 loaded_next_in1 = in1.consume();
95 vec_type result = f(loaded_in1);
96 result.store_aligned(out);
97 compile_time_unroller<FloatType, N-offset>::mp_iteration_1(out+offset, loaded_next_in1, in1, f);
100 template <typename arg1_type,
101 typename arg2_type,
102 typename Functor
104 static always_inline void mp_iteration_2(FloatType * out, vec_type loaded_in1, arg1_type & in1,
105 vec_type loaded_in2, arg2_type & in2, Functor const & f)
107 vec_type loaded_next_in1;
108 if (N != offset)
109 loaded_next_in1 = in1.consume();
111 vec_type loaded_next_in2;
112 if (N != offset)
113 loaded_next_in2 = in2.consume();
115 vec_type result = f(loaded_in1, loaded_in2);
116 result.store_aligned(out);
117 compile_time_unroller<FloatType, N-offset>::mp_iteration_2(out+offset, loaded_next_in1, in1, loaded_next_in2, in2, f);
120 template <typename arg1_type,
121 typename arg2_type,
122 typename arg3_type,
123 typename Functor
125 static always_inline void mp_iteration_3(FloatType * out, vec_type loaded_in1, arg1_type & in1,
126 vec_type loaded_in2, arg2_type & in2,
127 vec_type loaded_in3, arg3_type & in3, Functor const & f)
129 vec_type loaded_next_in1;
130 if (N != offset)
131 loaded_next_in1 = in1.consume();
133 vec_type loaded_next_in2;
134 if (N != offset)
135 loaded_next_in2 = in2.consume();
137 vec_type loaded_next_in3;
138 if (N != offset)
139 loaded_next_in3 = in3.consume();
141 vec_type result = f(loaded_in1, loaded_in2, loaded_in3);
142 result.store_aligned(out);
143 compile_time_unroller<FloatType, N-offset>::mp_iteration_3(out+offset, loaded_next_in1, in1, loaded_next_in2, in2,
144 loaded_next_in3, in3, f);
147 template <typename arg1_type,
148 typename arg2_type,
149 typename arg3_type,
150 typename arg4_type,
151 typename Functor
153 static always_inline void mp_iteration_4(FloatType * out, vec_type loaded_in1, arg1_type & in1, vec_type loaded_in2, arg2_type & in2,
154 vec_type loaded_in3, arg3_type & in3, vec_type loaded_in4, arg4_type & in4, Functor const & f)
156 vec_type loaded_next_in1;
157 if (N != offset)
158 loaded_next_in1 = in1.consume();
160 vec_type loaded_next_in2;
161 if (N != offset)
162 loaded_next_in2 = in2.consume();
164 vec_type loaded_next_in3;
165 if (N != offset)
166 loaded_next_in3 = in3.consume();
168 vec_type loaded_next_in4;
169 if (N != offset)
170 loaded_next_in4 = in4.consume();
172 vec_type result = f(loaded_in1, loaded_in2, loaded_in3, loaded_in4);
173 result.store_aligned(out);
175 compile_time_unroller<FloatType, N-offset>::mp_iteration_4(out+offset, loaded_next_in1, in1, loaded_next_in2, in2,
176 loaded_next_in3, in3, loaded_next_in4, in4, f);
180 template <typename FloatType>
181 struct compile_time_unroller<FloatType, 0>
183 friend struct compile_time_unroller<FloatType, vec<FloatType>::size>;
185 private:
186 template <typename LoadedArg1, typename Arg1,
187 typename Functor
189 static always_inline void mp_iteration_1(FloatType * out, LoadedArg1 const &, Arg1 const &, Functor const & f)
192 template <typename LoadedArg1, typename Arg1,
193 typename LoadedArg2, typename Arg2,
194 typename Functor
196 static always_inline void mp_iteration_2(FloatType * out, LoadedArg1 const &, Arg1 const &,
197 LoadedArg2 const &, Arg2 const &, Functor const & f)
200 template <typename LoadedArg1, typename Arg1,
201 typename LoadedArg2, typename Arg2,
202 typename LoadedArg3, typename Arg3,
203 typename Functor
205 static always_inline void mp_iteration_3(FloatType * out, LoadedArg1 const &, Arg1 const &,
206 LoadedArg2 const &, Arg2 const &, LoadedArg3 const &, Arg3 const &,
207 Functor const & f)
210 template <typename LoadedArg1, typename Arg1,
211 typename LoadedArg2, typename Arg2,
212 typename LoadedArg3, typename Arg3,
213 typename LoadedArg4, typename Arg4,
214 typename Functor
216 static always_inline void mp_iteration_4(FloatType * out, LoadedArg1 const &, Arg1 const &,
217 LoadedArg2 const &, Arg2 const &, LoadedArg3 const &, Arg3 const &,
218 LoadedArg4 const &, Arg4 const &, Functor const & f)
223 template <typename float_type,
224 typename Arg1,
225 typename Functor
227 always_inline void generate_simd_loop(float_type * out, Arg1 arg1, unsigned int n, Functor const & f)
229 const unsigned int per_loop = vec<float_type>::objects_per_cacheline;
230 n /= per_loop;
231 do {
232 detail::compile_time_unroller<float_type, per_loop>::run(out, arg1, f);
233 out += per_loop;
234 } while (--n);
237 template <typename float_type,
238 typename Arg1,
239 typename Arg2,
240 typename Functor
242 always_inline void generate_simd_loop(float_type * out, Arg1 arg1, Arg2 arg2, unsigned int n, Functor const & f)
244 const unsigned int per_loop = vec<float_type>::objects_per_cacheline;
245 n /= per_loop;
246 do {
247 detail::compile_time_unroller<float_type, per_loop>::run(out, arg1, arg2, f);
248 out += per_loop;
249 } while (--n);
252 template <typename float_type,
253 typename Arg1,
254 typename Arg2,
255 typename Arg3,
256 typename Functor
258 always_inline void generate_simd_loop(float_type * out, Arg1 arg1, Arg2 arg2, Arg3 arg3, unsigned int n, Functor const & f)
260 const unsigned int per_loop = vec<float_type>::objects_per_cacheline;
261 n /= per_loop;
262 do {
263 detail::compile_time_unroller<float_type, per_loop>::run(out, arg1, arg2, arg3, f);
264 out += per_loop;
265 } while (--n);
268 template <typename float_type,
269 typename Arg1,
270 typename Arg2,
271 typename Arg3,
272 typename Arg4,
273 typename Functor
275 always_inline void generate_simd_loop(float_type * out, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, unsigned int n, Functor const & f)
277 const unsigned int per_loop = vec<float_type>::objects_per_cacheline;
278 n /= per_loop;
279 do {
280 detail::compile_time_unroller<float_type, per_loop>::run(out, arg1, arg2, arg3, arg4, f);
281 out += per_loop;
282 } while (--n);
288 #undef always_inline
290 #endif /* NOVA_SIMD_DETAIL_UNROLL_HELPERS_HPP */