1 // templated memory simd functions
2 // Copyright (C) 2010 Tim Blechmann <tim@klingt.org>
4 // This program is free software; you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation; either version 2 of the License, or
7 // (at your option) any later version.
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with this program; see the file COPYING. If not, write to
16 // the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 // Boston, MA 02111-1307, USA.
20 #ifndef SIMD_MEMORY_HPP
21 #define SIMD_MEMORY_HPP
28 #if defined(__GNUC__) && defined(NDEBUG)
29 #define always_inline inline __attribute__((always_inline))
31 #define always_inline inline
/// Set the first n elements of dest to zero (scalar fallback, no SIMD).
template <typename F>
inline void zerovec(F * dest, unsigned int n)
{
    std::memset(dest, 0, n*sizeof(F));
}
/// Fill the first n elements of dest with the value f (scalar fallback).
/// NOTE(review): loop body was missing from this extract; reconstructed as a
/// plain fill loop (also safe for n == 0).
template <typename F>
inline void setvec(F * dest, F f, unsigned int n)
{
    for (unsigned int i = 0; i != n; ++i)
        dest[i] = f;
}
54 template <bool aligned
, typename F
>
55 inline void store_aligned(vec
<F
> const & value
, F
* dest
)
58 value
.store_aligned(dest
);
63 template <typename F
, unsigned int n
, bool aligned
>
66 static const int offset
= vec
<F
>::size
;
68 static always_inline
void mp_iteration(F
* dst
, vec
<F
> const & val
)
70 store_aligned
<aligned
>(val
, dst
);
71 setvec
<F
, n
-offset
, aligned
>::mp_iteration(dst
+offset
, val
);
75 template <typename F
, bool aligned
>
76 struct setvec
<F
, 0, aligned
>
78 static always_inline
void mp_iteration(F
* dst
, vec
<F
> const & val
)
82 template <typename F
, bool aligned
>
83 inline void setvec_simd(F
* dest
, vec
<F
> const & val
, unsigned int n
)
85 const unsigned int offset
= vec
<F
>::objects_per_cacheline
;
86 unsigned int unroll
= n
/ offset
;
90 setvec
<F
, offset
, aligned
>::mp_iteration(dest
, val
);
96 } /* namespace detail */
99 inline void zerovec_simd(F
* dest
, unsigned int n
)
101 vec
<F
> zero
; zero
.clear();
102 detail::setvec_simd
<F
, true>(dest
, zero
, n
);
105 template <unsigned int n
, typename F
>
106 inline void zerovec_simd(F
*dest
)
108 vec
<F
> zero
; zero
.clear();
109 detail::setvec
<F
, n
, true>::mp_iteration(dest
, zero
);
112 template <typename F
>
113 inline void zerovec_na_simd(F
* dest
, unsigned int n
)
115 vec
<F
> zero
; zero
.clear();
116 detail::setvec_simd
<F
, false>(dest
, zero
, n
);
119 template <unsigned int n
, typename F
>
120 inline void zerovec_na_simd(F
*dest
)
122 vec
<F
> zero
; zero
.clear();
123 detail::setvec
<F
, n
, false>::mp_iteration(dest
, zero
);
128 template <typename F
>
129 inline void setvec_simd(F
* dest
, F f
, unsigned int n
)
132 detail::setvec_simd
<F
, true>(dest
, val
, n
);
135 template <unsigned int n
, typename F
>
136 inline void setvec_simd(F
*dest
, F f
)
139 detail::setvec
<F
, n
, true>::mp_iteration(dest
, val
);
142 template <typename F
>
143 inline void setvec_na_simd(F
* dest
, F f
, unsigned int n
)
146 detail::setvec_simd
<F
, false>(dest
, val
, n
);
149 template <unsigned int n
, typename F
>
150 inline void setvec_na_simd(F
*dest
, F f
)
153 detail::setvec
<F
, n
, false>::mp_iteration(dest
, val
);
160 template <typename F
, unsigned int n
>
163 static const int offset
= vec
<F
>::size
;
165 static always_inline
void slope_mp_iteration(F
* dst
, vec
<F
> & vbase
, vec
<F
> const & vslope
)
167 vbase
.store_aligned(dst
);
169 set_ramp
<F
, n
-offset
>::slope_mp_iteration(dst
+offset
, vbase
, vslope
);
172 static always_inline
void exp_mp_iteration(F
* dst
, vec
<F
> & vbase
, vec
<F
> const & vcurve
)
174 vbase
.store_aligned(dst
);
176 set_ramp
<F
, n
-offset
>::exp_mp_iteration(dst
+offset
, vbase
, vcurve
);
180 template <typename F
>
181 struct set_ramp
<F
, 0>
183 static always_inline
void slope_mp_iteration(F
* dst
, vec
<F
> & vbase
, vec
<F
> const & vslope
)
185 static always_inline
void exp_mp_iteration(F
* dst
, vec
<F
> & vbase
, vec
<F
> const & curve
)
189 } /* namespace detail */
/// Write a linear ramp: dest[i] = f + i*slope for i in [0, n).
/// NOTE(review): the loop statement was missing from this extract; the
/// per-element work matches the visible line, and the for-loop also
/// tolerates n == 0.
template <typename F>
inline void set_slope_vec(F * dest, F f, F slope, unsigned int n)
{
    for (unsigned int i = 0; i != n; ++i) {
        *dest++ = f; f += slope;
    }
}
201 template <typename F
>
202 inline void set_slope_vec_simd(F
* dest
, F f
, F slope
, unsigned int n
)
204 vec
<F
> vbase
, vslope
;
205 vbase
.set_slope(f
, slope
);
206 vslope
.set_vec(vec
<F
>::size
* slope
);
208 unsigned int unroll
= n
/ vec
<F
>::objects_per_cacheline
;
211 detail::set_ramp
<F
, vec
<F
>::objects_per_cacheline
>::slope_mp_iteration(dest
, vbase
, vslope
);
212 dest
+= vec
<F
>::objects_per_cacheline
;
/// Write an exponential ramp: dest[i] = f * curve^i for i in [0, n).
/// NOTE(review): loop statement reconstructed from a mangled extract; the
/// per-element work matches the visible line.
template <typename F>
inline void set_exp_vec(F * dest, F f, F curve, unsigned int n)
{
    for (unsigned int i = 0; i != n; ++i) {
        *dest++ = f; f *= curve;
    }
}
/// Compile-time integer power, fully unrolled via template recursion.
/// In the original file these live in namespace detail (see the
/// detail::ipow call site below).
template <int Exponent, typename Type>
struct pow_i
{
    static Type run(Type const & base)
    {
        return base * pow_i<Exponent - 1, Type>::run(base);
    }
};

/// Recursion terminator: base^1 == base.
/// NOTE(review): the specialization argument was missing from this extract;
/// `1` is implied by the body returning base and by callers using
/// exponents >= 1.
template <typename Type>
struct pow_i<1, Type>
{
    static Type run(Type const & base)
    {
        return base;
    }
};

/// ipow<E>(base) == base^E for compile-time E >= 1.
template <size_t Exponent, typename Type>
Type ipow(Type const & base)
{
    return pow_i<Exponent, Type>::run(base);
}
255 template <typename F
>
256 inline void set_exp_vec_simd(F
* dest
, F f
, F curve
, unsigned int n
)
258 vec
<F
> vbase
, vcurve(detail::ipow
<vec
<F
>::size
, F
>(curve
));
259 vbase
.set_exp(f
, curve
);
261 unsigned int unroll
= n
/ vec
<F
>::objects_per_cacheline
;
264 detail::set_ramp
<F
, vec
<F
>::objects_per_cacheline
>::exp_mp_iteration(dest
, vbase
, vcurve
);
265 dest
+= vec
<F
>::objects_per_cacheline
;
/// Copy n elements from src to dest (scalar fallback; plain memcpy).
/// The regions must not overlap.
template <typename F>
inline void copyvec(F * dest, const F * src, unsigned int n)
{
    std::memcpy(dest, src, n*sizeof(F));
}
// detail::copyvec — compile-time-unrolled vector copy. Copies n elements
// from src to dst one vec<F> per recursion step, software-pipelined: each
// step stores the value loaded by the previous step while issuing the next
// load. Terminated by the <..., 0> partial specialization below.
// NOTE(review): this extract is line-mangled and interior lines (braces,
// local declarations, the unaligned-load branch selected by src_aligned)
// are missing; only comments are added here, visible tokens untouched.
278 template <typename F
, bool src_aligned
, bool dst_aligned
, unsigned int n
>
// number of scalar elements per SIMD vector
281 static const int offset
= vec
<F
>::size
;
// entry point: load the first vector, then hand off to the pipelined
// three-argument overload
283 static always_inline
void mp_iteration(F
* dst
, const F
* src
)
// presumably guarded by src_aligned — the unaligned branch is missing here
287 val
.load_aligned(src
);
291 mp_iteration(dst
, src
+ offset
, val
);
// pipelined step: load the next vector, store the previously loaded one
294 static always_inline
void mp_iteration(F
* dst
, const F
* src
, vec
<F
> const & loaded_value
)
// presumably guarded by src_aligned, as above
299 val
.load_aligned(src
);
303 store_aligned
<dst_aligned
>(loaded_value
, dst
);
// recurse with n-offset elements remaining, forwarding the fresh load
304 copyvec
<F
, src_aligned
, dst_aligned
, n
-offset
>::mp_iteration(dst
+offset
, src
+offset
, val
);
// Recursion terminator for detail::copyvec: zero elements remain. Takes
// loaded_value by value; the method body is missing from this extract —
// presumably empty, discarding the final speculative load. NOTE(review):
// line-mangled extract; comments only, visible tokens untouched.
308 template <typename F
, bool src_aligned
, bool dst_aligned
>
309 struct copyvec
<F
, src_aligned
, dst_aligned
, 0>
311 static always_inline
void mp_iteration(F
* dst
, const F
* src
, vec
<F
> loaded_value
)
// Generates one pair of public copy routines: name##_simd(dest, src, n)
// copies whole cachelines via the compile-time-unrolled detail::copyvec,
// and name##_simd<n>(dest, src) is the fixed-length variant.
// NOTE(review): the loop lines were missing from this mangled extract and
// were reconstructed; n is expected to be a multiple of
// vec<F>::objects_per_cacheline.
#define COPYVEC_FUNCTION(name, src_aligned, dst_aligned)            \
template <typename F>                                               \
inline void name##_simd(F * dest, const F * src, unsigned int n)    \
{                                                                   \
    const int per_loop = vec<F>::objects_per_cacheline;             \
    const unsigned int loops = n / per_loop;                        \
    for (unsigned int i = 0; i != loops; ++i) {                     \
        detail::copyvec<F, src_aligned, dst_aligned, per_loop>::mp_iteration(dest, src); \
        dest += per_loop; src += per_loop;                          \
    }                                                               \
}                                                                   \
                                                                    \
template <unsigned int n, typename F>                               \
inline void name##_simd(F * dest, const F * src)                    \
{                                                                   \
    detail::copyvec<F, src_aligned, dst_aligned, n>::mp_iteration(dest, src); \
}
337 COPYVEC_FUNCTION(copyvec_aa
, true, true)
338 COPYVEC_FUNCTION(copyvec_na
, false, true)
339 COPYVEC_FUNCTION(copyvec_an
, true, false)
340 COPYVEC_FUNCTION(copyvec_nn
, false, false)
/// Default SIMD copy: both pointers cacheline-aligned (forwards to
/// copyvec_aa_simd).
template <typename F>
inline void copyvec_simd(F * dest, const F * src, unsigned int n)
{
    copyvec_aa_simd(dest, src, n);
}
348 template <unsigned int n
, typename F
>
349 inline void copyvec_simd(F
* dest
, const F
* src
)
351 copyvec_aa_simd
<n
, F
>(dest
, src
);
/// out[i] += in[i] for i in [0, n) (scalar fallback).
/// NOTE(review): loop body was missing from this extract; reconstructed.
template <typename F>
inline void addvec(F * out, const F * in, unsigned int n)
{
    for (unsigned int i = 0; i != n; ++i)
        *out++ += *in++;
}
/// out[i] += in (constant) for i in [0, n) (scalar fallback).
/// NOTE(review): loop body was missing from this extract; reconstructed.
template <typename F>
inline void addvec(F * out, const F in, unsigned int n)
{
    for (unsigned int i = 0; i != n; ++i)
        *out++ += in;
}
/// out[i] += in + i*slope for i in [0, n) (scalar ramp add).
/// Fix: the visible fragment incremented the `const F in` parameter
/// (`in += slope`), which cannot compile; a mutable local accumulator is
/// used instead. Signature is unchanged for callers (top-level const on a
/// by-value parameter does not affect the interface).
template <typename F>
inline void addvec(F * out, const F in, const F slope, unsigned int n)
{
    F value = in;  // mutable ramp accumulator
    for (unsigned int i = 0; i != n; ++i) {
        *out++ += value; value += slope;
    }
}
381 template <typename F
, unsigned int n
>
384 static const int offset
= vec
<F
>::size
;
386 static always_inline
void mp_iteration(F
* dst
, const F
* src
)
389 v1
.load_aligned(dst
);
390 v2
.load_aligned(src
);
392 v1
.store_aligned(dst
);
393 addvec
<F
, n
-offset
>::mp_iteration(dst
+offset
, src
+offset
);
396 static always_inline
void mp_iteration(F
* dst
, vec
<F
> const & in
)
399 v1
.load_aligned(dst
);
401 v1
.store_aligned(dst
);
402 addvec
<F
, n
-offset
>::mp_iteration(dst
+offset
, in
);
405 static always_inline
void mp_iteration(F
* dst
, vec
<F
> & in
, vec
<F
> const & vslope
)
408 v1
.load_aligned(dst
);
410 v1
.store_aligned(dst
);
412 addvec
<F
, n
-offset
>::mp_iteration(dst
+offset
, in
, vslope
);
416 template <typename F
>
419 static always_inline
void mp_iteration(F
* dst
, const F
* src
)
422 static always_inline
void mp_iteration(F
* dst
, vec
<F
> const & in
)
425 static always_inline
void mp_iteration(F
* dst
, vec
<F
> & in
, vec
<F
> const & vslope
)
431 template <typename F
>
432 inline void addvec_simd(F
* out
, const F
* in
, unsigned int n
)
434 const int per_loop
= vec
<F
>::objects_per_cacheline
;
438 detail::addvec
<F
, per_loop
>::mp_iteration(out
, in
);
439 out
+= per_loop
; in
+= per_loop
;
444 template <typename F
>
445 inline void addvec_simd(F
* out
, const F in
, unsigned int n
)
447 const int per_loop
= vec
<F
>::objects_per_cacheline
;
452 detail::addvec
<F
, per_loop
>::mp_iteration(out
, vin
);
458 template <typename F
>
459 inline void addvec_simd(F
* out
, const F in
, const F slope
, unsigned int n
)
461 const int per_loop
= vec
<F
>::objects_per_cacheline
;
462 vec
<F
> vin
; vin
.set_slope(in
, slope
);
463 vec
<F
> vslope
; vslope
.set(slope
* vec
<F
>::size
);
467 detail::addvec
<F
, per_loop
>::mp_iteration(out
, vin
, vslope
);
473 template <unsigned int n
, typename F
>
474 inline void addvec_simd(F
* out
, const F
* in
)
476 detail::addvec
<F
, n
>::mp_iteration(out
, in
);
479 template <unsigned int n
, typename F
>
480 inline void addvec_simd(F
* out
, const F in
)
483 detail::addvec
<F
, n
>::mp_iteration(out
, vin
);
486 template <unsigned int n
, typename F
>
487 inline void addvec_simd(F
* out
, const F in
, const F slope
)
489 vec
<F
> vin
; vin
.set_slope(in
, slope
);
490 vec
<F
> vslope
; vslope
.set(slope
* vec
<F
>::size
);
491 detail::addvec
<F
, n
>::mp_iteration(out
, vin
, vslope
);
495 } /* namespace nova */
499 #endif /* SIMD_MEMORY_HPP */