memcpy: hide some memory latencies
[nova-simd.git] / simd_memory.hpp
blobebcd8c0ebe9695e9d68b55595deb1d7e7951cf0f
1 // templated memory simd functions
2 // Copyright (C) 2010 Tim Blechmann <tim@klingt.org>
3 //
4 // This program is free software; you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation; either version 2 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with this program; see the file COPYING. If not, write to
16 // the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 // Boston, MA 02111-1307, USA.
20 #ifndef SIMD_MEMORY_HPP
21 #define SIMD_MEMORY_HPP
23 #include <cassert>
24 #include <cstring>
26 #include "vec.hpp"
28 #if defined(__GNUC__) && defined(NDEBUG)
29 #define always_inline inline __attribute__((always_inline))
30 #else
31 #define always_inline inline
32 #endif
34 namespace nova {
/** Clear a buffer: set all n elements of dest to zero bytes. */
template <typename F>
inline void zerovec(F * dest, unsigned int n)
{
    std::memset(dest, 0, n * sizeof(F));
}
/** Fill all n elements of dest with the value f.  n must be nonzero. */
template <typename F>
inline void setvec(F * dest, F f, unsigned int n)
{
    assert(n);
    for (unsigned int i = 0; i != n; ++i)
        dest[i] = f;
}
51 namespace detail
54 template <bool aligned, typename F>
55 inline void store_aligned(vec<F> const & value, F * dest)
57 if (aligned)
58 value.store_aligned(dest);
59 else
60 value.store(dest);
63 template <typename F, unsigned int n, bool aligned>
64 struct setvec
66 static const int offset = vec<F>::size;
68 static always_inline void mp_iteration(F * dst, vec<F> const & val)
70 store_aligned<aligned>(val, dst);
71 setvec<F, n-offset, aligned>::mp_iteration(dst+offset, val);
75 template <typename F, bool aligned>
76 struct setvec<F, 0, aligned>
78 static always_inline void mp_iteration(F * dst, vec<F> const & val)
82 template <typename F, bool aligned>
83 inline void setvec_simd(F * dest, vec<F> const & val, unsigned int n)
85 const unsigned int offset = vec<F>::objects_per_cacheline;
86 unsigned int unroll = n / offset;
90 setvec<F, offset, aligned>::mp_iteration(dest, val);
91 dest += offset;
93 while (--unroll);
96 } /* namespace detail */
98 template <typename F>
99 inline void zerovec_simd(F * dest, unsigned int n)
101 vec<F> zero; zero.clear();
102 detail::setvec_simd<F, true>(dest, zero, n);
105 template <unsigned int n, typename F>
106 inline void zerovec_simd(F *dest)
108 vec<F> zero; zero.clear();
109 detail::setvec<F, n, true>::mp_iteration(dest, zero);
112 template <typename F>
113 inline void zerovec_na_simd(F * dest, unsigned int n)
115 vec<F> zero; zero.clear();
116 detail::setvec_simd<F, false>(dest, zero, n);
119 template <unsigned int n, typename F>
120 inline void zerovec_na_simd(F *dest)
122 vec<F> zero; zero.clear();
123 detail::setvec<F, n, false>::mp_iteration(dest, zero);
128 template <typename F>
129 inline void setvec_simd(F * dest, F f, unsigned int n)
131 vec<F> val(f);
132 detail::setvec_simd<F, true>(dest, val, n);
135 template <unsigned int n, typename F>
136 inline void setvec_simd(F *dest, F f)
138 vec<F> val(f);
139 detail::setvec<F, n, true>::mp_iteration(dest, val);
142 template <typename F>
143 inline void setvec_na_simd(F * dest, F f, unsigned int n)
145 vec<F> val(f);
146 detail::setvec_simd<F, false>(dest, val, n);
149 template <unsigned int n, typename F>
150 inline void setvec_na_simd(F *dest, F f)
152 vec<F> val(f);
153 detail::setvec<F, n, false>::mp_iteration(dest, val);
157 namespace detail
160 template <typename F, unsigned int n>
161 struct set_ramp
163 static const int offset = vec<F>::size;
165 static always_inline void slope_mp_iteration(F * dst, vec<F> & vbase, vec<F> const & vslope)
167 vbase.store_aligned(dst);
168 vbase += vslope;
169 set_ramp<F, n-offset>::slope_mp_iteration(dst+offset, vbase, vslope);
172 static always_inline void exp_mp_iteration(F * dst, vec<F> & vbase, vec<F> const & vcurve)
174 vbase.store_aligned(dst);
175 vbase *= vcurve;
176 set_ramp<F, n-offset>::exp_mp_iteration(dst+offset, vbase, vcurve);
180 template <typename F>
181 struct set_ramp<F, 0>
183 static always_inline void slope_mp_iteration(F * dst, vec<F> & vbase, vec<F> const & vslope)
185 static always_inline void exp_mp_iteration(F * dst, vec<F> & vbase, vec<F> const & curve)
189 } /* namespace detail */
/** Write a linear ramp into dest: dest[i] = f + i * slope.
 *  n must be nonzero. */
template <typename F>
inline void set_slope_vec(F * dest, F f, F slope, unsigned int n)
{
    assert(n);
    for (unsigned int i = 0; i != n; ++i) {
        dest[i] = f;
        f += slope;
    }
}
201 template <typename F>
202 inline void set_slope_vec_simd(F * dest, F f, F slope, unsigned int n)
204 vec<F> vbase, vslope;
205 vbase.set_slope(f, slope);
206 vslope.set_vec(vec<F>::size * slope);
208 unsigned int unroll = n / vec<F>::objects_per_cacheline;
211 detail::set_ramp<F, vec<F>::objects_per_cacheline>::slope_mp_iteration(dest, vbase, vslope);
212 dest += vec<F>::objects_per_cacheline;
213 } while(--unroll);
/** Write an exponential ramp into dest: dest[i] = f * curve^i.
 *  n must be nonzero. */
template <typename F>
inline void set_exp_vec(F * dest, F f, F curve, unsigned int n)
{
    assert(n);
    for (unsigned int i = 0; i != n; ++i) {
        dest[i] = f;
        f *= curve;
    }
}
225 namespace detail {
/** Compile-time integer power: pow_i<E, T>::run(base) == base^E,
 *  expanded to E-1 multiplications by template recursion. */
template <int Exponent, typename Type>
class pow_i
{
public:
    static Type run(Type const & base)
    {
        return base * pow_i<Exponent - 1, Type>::run(base);
    }
};

/** base^1 == base */
template <typename Type>
class pow_i<1, Type>
{
public:
    static Type run(Type const & base)
    {
        return base;
    }
};

/** base^0 == 1.  Added so a zero exponent compiles to a constant
 *  instead of recursing without end at instantiation time. */
template <typename Type>
class pow_i<0, Type>
{
public:
    static Type run(Type const &)
    {
        return Type(1);
    }
};

/** Convenience wrapper: ipow<E>(base) == base^E. */
template <size_t Exponent, typename Type>
Type ipow(Type const & base)
{
    return pow_i<Exponent, Type>::run(base);
}
255 template <typename F>
256 inline void set_exp_vec_simd(F * dest, F f, F curve, unsigned int n)
258 vec<F> vbase, vcurve(detail::ipow<vec<F>::size, F>(curve));
259 vbase.set_exp(f, curve);
261 unsigned int unroll = n / vec<F>::objects_per_cacheline;
264 detail::set_ramp<F, vec<F>::objects_per_cacheline>::exp_mp_iteration(dest, vbase, vcurve);
265 dest += vec<F>::objects_per_cacheline;
266 } while(--unroll);
/** Copy n elements from src to dest.  Delegates to memcpy, so the
 *  regions must not overlap (memcpy's contract). */
template <typename F>
inline void copyvec(F * dest, const F * src, unsigned int n)
{
    std::memcpy(dest, src, n * sizeof(F));
}
276 namespace detail {
278 template <typename F, bool src_aligned, bool dst_aligned, unsigned int n>
279 struct copyvec
281 static const int offset = vec<F>::size;
283 static always_inline void mp_iteration(F * dst, const F * src)
285 vec<F> val;
286 if (src_aligned)
287 val.load_aligned(src);
288 else
289 val.load(src);
291 mp_iteration(dst, src + offset, val);
294 static always_inline void mp_iteration(F * dst, const F * src, vec<F> const & loaded_value)
296 vec<F> val;
298 if (src_aligned)
299 val.load_aligned(src);
300 else
301 val.load(src);
303 store_aligned<dst_aligned>(loaded_value, dst);
304 copyvec<F, src_aligned, dst_aligned, n-offset>::mp_iteration(dst+offset, src+offset, val);
308 template <typename F, bool src_aligned, bool dst_aligned>
309 struct copyvec<F, src_aligned, dst_aligned, 0>
311 static always_inline void mp_iteration(F * dst, const F * src, vec<F> loaded_value)
/* Generates the two copy entry points for one (src, dst) alignment
 * combination:
 *  - runtime length: copies one cache line per loop iteration; n must
 *    be a nonzero multiple of vec<F>::objects_per_cacheline (asserted:
 *    the unsigned do/while counter would otherwise wrap around)
 *  - compile-time length: fully unrolled for n elements            */
#define COPYVEC_FUNCTION(name, src_aligned, dst_aligned)                \
template <typename F>                                                   \
inline void name##_simd(F * dest, const F * src, unsigned int n)        \
{                                                                       \
    const int per_loop = vec<F>::objects_per_cacheline;                 \
    n /= per_loop;                                                      \
    assert(n);                                                          \
    do                                                                  \
    {                                                                   \
        detail::copyvec<F, src_aligned, dst_aligned, per_loop>::mp_iteration(dest, src); \
        dest += per_loop; src += per_loop;                              \
    }                                                                   \
    while (--n);                                                        \
}                                                                       \
                                                                        \
template <unsigned int n, typename F>                                   \
inline void name##_simd(F * dest, const F * src)                        \
{                                                                       \
    detail::copyvec<F, src_aligned, dst_aligned, n>::mp_iteration(dest, src); \
}
337 COPYVEC_FUNCTION(copyvec_aa, true, true)
338 COPYVEC_FUNCTION(copyvec_na, false, true)
339 COPYVEC_FUNCTION(copyvec_an, true, false)
340 COPYVEC_FUNCTION(copyvec_nn, false, false)
/** Copy n elements; both pointers assumed vector-aligned
 *  (forwards to copyvec_aa_simd). */
template <typename F>
inline void copyvec_simd(F * dest, const F * src, unsigned int n)
{
    copyvec_aa_simd(dest, src, n);
}
348 template <unsigned int n, typename F>
349 inline void copyvec_simd(F * dest, const F * src)
351 copyvec_aa_simd<n, F>(dest, src);
/** Element-wise accumulate: out[i] += in[i] for n elements.
 *  n must be nonzero (asserted, consistent with setvec /
 *  set_slope_vec; the unsigned do/while would otherwise wrap). */
template <typename F>
inline void addvec(F * out, const F * in, unsigned int n)
{
    assert(n);
    do {
        *out++ += *in++;
    } while (--n);
}
/** Accumulate a constant: out[i] += in for n elements.
 *  n must be nonzero (asserted; unsigned do/while wrap otherwise). */
template <typename F>
inline void addvec(F * out, const F in, unsigned int n)
{
    assert(n);
    do {
        *out++ += in;
    } while (--n);
}
/** Accumulate a linear ramp: out[i] += in + i * slope for n elements.
 *  Fix: `in` was declared `const F in` but is advanced by `slope`
 *  each step — assigning to a const parameter does not compile; the
 *  top-level const is dropped (invisible to callers).
 *  n must be nonzero (asserted; unsigned do/while wrap otherwise). */
template <typename F>
inline void addvec(F * out, F in, const F slope, unsigned int n)
{
    assert(n);
    do {
        *out++ += in;
        in += slope;
    } while (--n);
}
378 namespace detail
381 template <typename F, unsigned int n>
382 struct addvec
384 static const int offset = vec<F>::size;
386 static always_inline void mp_iteration(F * dst, const F * src)
388 vec<F> v1, v2;
389 v1.load_aligned(dst);
390 v2.load_aligned(src);
391 v1 += v2;
392 v1.store_aligned(dst);
393 addvec<F, n-offset>::mp_iteration(dst+offset, src+offset);
396 static always_inline void mp_iteration(F * dst, vec<F> const & in)
398 vec<F> v1;
399 v1.load_aligned(dst);
400 v1 += in;
401 v1.store_aligned(dst);
402 addvec<F, n-offset>::mp_iteration(dst+offset, in);
405 static always_inline void mp_iteration(F * dst, vec<F> & in, vec<F> const & vslope)
407 vec<F> v1;
408 v1.load_aligned(dst);
409 v1 += in;
410 v1.store_aligned(dst);
411 in += vslope;
412 addvec<F, n-offset>::mp_iteration(dst+offset, in, vslope);
416 template <typename F>
417 struct addvec<F, 0>
419 static always_inline void mp_iteration(F * dst, const F * src)
422 static always_inline void mp_iteration(F * dst, vec<F> const & in)
425 static always_inline void mp_iteration(F * dst, vec<F> & in, vec<F> const & vslope)
431 template <typename F>
432 inline void addvec_simd(F * out, const F * in, unsigned int n)
434 const int per_loop = vec<F>::objects_per_cacheline;
435 n /= per_loop;
438 detail::addvec<F, per_loop>::mp_iteration(out, in);
439 out += per_loop; in += per_loop;
441 while (--n);
444 template <typename F>
445 inline void addvec_simd(F * out, const F in, unsigned int n)
447 const int per_loop = vec<F>::objects_per_cacheline;
448 vec<F> vin(in);
449 n /= per_loop;
452 detail::addvec<F, per_loop>::mp_iteration(out, vin);
453 out += per_loop;
455 while (--n);
458 template <typename F>
459 inline void addvec_simd(F * out, const F in, const F slope, unsigned int n)
461 const int per_loop = vec<F>::objects_per_cacheline;
462 vec<F> vin; vin.set_slope(in, slope);
463 vec<F> vslope; vslope.set(slope * vec<F>::size);
464 n /= per_loop;
467 detail::addvec<F, per_loop>::mp_iteration(out, vin, vslope);
468 out += per_loop;
470 while (--n);
473 template <unsigned int n, typename F>
474 inline void addvec_simd(F * out, const F * in)
476 detail::addvec<F, n>::mp_iteration(out, in);
479 template <unsigned int n, typename F>
480 inline void addvec_simd(F * out, const F in)
482 vec<F> vin(in);
483 detail::addvec<F, n>::mp_iteration(out, vin);
486 template <unsigned int n, typename F>
487 inline void addvec_simd(F * out, const F in, const F slope)
489 vec<F> vin; vin.set_slope(in, slope);
490 vec<F> vslope; vslope.set(slope * vec<F>::size);
491 detail::addvec<F, n>::mp_iteration(out, vin, vslope);
495 } /* namespace nova */
497 #undef always_inline
499 #endif /* SIMD_MEMORY_HPP */