2 // Copyright (C) 2010 Tim Blechmann
4 // This program is free software; you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation; either version 2 of the License, or
7 // (at your option) any later version.
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with this program; see the file COPYING. If not, write to
16 // the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 // Boston, MA 02111-1307, USA.
19 #ifndef NOVA_SIMD_DETAIL_UNROLL_HELPERS_HPP
20 #define NOVA_SIMD_DETAIL_UNROLL_HELPERS_HPP
// Inlining helper macro used by every function in this header.
// With a GNU-compatible compiler in an NDEBUG build, `always_inline` expands to
// `inline __attribute__((always_inline))`; the second definition is plain `inline`.
// NOTE(review): the #else / #endif lines that separate the two definitions are
// not visible in this view -- confirm against the full file.
24 #if defined(__GNUC__) && defined(NDEBUG)
25 #define always_inline inline __attribute__((always_inline))
27 #define always_inline inline
// compile_time_unroller<FloatType, N>: recursive template whose mp_iteration_K
// members each evaluate the functor on one set of loaded vectors, store the
// result, and delegate to compile_time_unroller<FloatType, N - offset>.
// NOTE(review): the declaration of the second template parameter (used as `N`
// throughout the members) is not visible in this view; presumably `int N`.
33 template <typename FloatType
,
36 struct compile_time_unroller
// SIMD vector wrapper for FloatType; `vec` is declared outside this view.
38 typedef vec
<FloatType
> vec_type
;
// Step by which both the output pointer and the remaining count N change per
// iteration. Presumably the number of FloatType elements per vector -- confirm
// against the definition of vec<>::size.
40 static const int offset
= vec_type::size
;
// Entry point for a one-input functor.
//   out : destination; written via vec_type::store_aligned in mp_iteration_1.
//   in1 : input wrapper, taken by non-const reference because consume() is
//         called on it here and again in each iteration.
//   f   : functor, invoked as f(vec_type) by mp_iteration_1.
// Consumes the first vector from in1 and hands it to mp_iteration_1 of this
// same instantiation, which continues the chain.
// NOTE(review): the template parameter line declaring `Functor` is not visible
// in this view.
42 template <typename arg1_type
,
45 static always_inline
void run(FloatType
* out
, arg1_type
& in1
, Functor
const & f
)
47 compile_time_unroller
<FloatType
, N
>::mp_iteration_1(out
, in1
.consume(), in1
, f
);
// Entry point for a two-input functor.
//   out      : destination; written via vec_type::store_aligned in mp_iteration_2.
//   in1, in2 : input wrappers, taken by non-const reference because consume()
//              is called on them here and again in each iteration.
//   f        : functor, invoked as f(vec_type, vec_type) by mp_iteration_2.
// Consumes the first vector from each input and starts the mp_iteration_2 chain
// of this same instantiation.
// NOTE(review): the template parameter lines declaring `arg2_type` and
// `Functor` are not visible in this view.
50 template <typename arg1_type
,
54 static always_inline
void run(FloatType
* out
, arg1_type
& in1
, arg2_type
& in2
, Functor
const & f
)
56 compile_time_unroller
<FloatType
, N
>::mp_iteration_2(out
, in1
.consume(), in1
, in2
.consume(), in2
, f
);
// Entry point for a three-input functor.
//   out           : destination; written via vec_type::store_aligned in
//                   mp_iteration_3.
//   in1, in2, in3 : input wrappers, taken by non-const reference because
//                   consume() is called on them here and in each iteration.
//   f             : functor, invoked with three vec_type arguments.
// Consumes the first vector from each input and starts the mp_iteration_3 chain
// of this same instantiation.
// NOTE(review): the template parameter lines declaring `arg2_type`,
// `arg3_type` and `Functor` are not visible in this view.
59 template <typename arg1_type
,
64 static always_inline
void run(FloatType
* out
, arg1_type
& in1
, arg2_type
& in2
,
65 arg3_type
& in3
, Functor
const & f
)
67 compile_time_unroller
<FloatType
, N
>::mp_iteration_3(out
, in1
.consume(), in1
, in2
.consume(), in2
, in3
.consume(), in3
, f
);
// Entry point for a four-input functor.
//   out                : destination; written via vec_type::store_aligned in
//                        mp_iteration_4.
//   in1, in2, in3, in4 : input wrappers, taken by non-const reference because
//                        consume() is called on them here and in each iteration.
//   f                  : functor, invoked with four vec_type arguments.
// Consumes the first vector from each input and starts the mp_iteration_4 chain
// of this same instantiation.
// NOTE(review): the template parameter lines declaring `arg2_type` ..
// `arg4_type` and `Functor` are not visible in this view.
70 template <typename arg1_type
,
76 static always_inline
void run(FloatType
* out
, arg1_type
& in1
, arg2_type
& in2
,
77 arg3_type
& in3
, arg4_type
& in4
, Functor
const & f
)
79 compile_time_unroller
<FloatType
, N
>::mp_iteration_4(out
, in1
.consume(), in1
, in2
.consume(), in2
,
80 in3
.consume(), in3
, in4
.consume(), in4
, f
);
// Befriend the instantiation whose count is larger by one vector step: that is
// the instantiation whose mp_iteration_K members call into this one (they
// delegate to <FloatType, N - offset>).
// NOTE(review): presumably needed because the mp_iteration_K members are
// private; the access specifier is not visible in this view.
84 friend struct compile_time_unroller
<FloatType
, vec_type::size
+ N
>;
// One unrolled step for a one-input functor.
//   out        : where this step's result is stored (store_aligned).
//   loaded_in1 : vector already consumed from in1 by the caller.
//   in1        : input wrapper the next vector is consumed from.
//   f          : functor applied to loaded_in1.
// The next input vector is fetched before the functor result is computed and
// stored; the step then delegates to <FloatType, N - offset> with the output
// pointer advanced by `offset`.
86 template <typename arg1_type
,
89 static always_inline
void mp_iteration_1(FloatType
* out
, vec_type loaded_in1
, arg1_type
& in1
, Functor
const & f
)
91 vec_type loaded_next_in1
;
// NOTE(review): a source line between the declaration above and this assignment
// is missing from this view; the consume() may be conditional -- confirm.
93 loaded_next_in1
= in1
.consume();
95 vec_type result
= f(loaded_in1
);
96 result
.store_aligned(out
);
// Continue with the remaining count; the chain ends at the <FloatType, 0>
// specialization.
97 compile_time_unroller
<FloatType
, N
-offset
>::mp_iteration_1(out
+offset
, loaded_next_in1
, in1
, f
);
// One unrolled step for a two-input functor.
//   out                    : where this step's result is stored (store_aligned).
//   loaded_in1, loaded_in2 : vectors already consumed by the caller.
//   in1, in2               : input wrappers the next vectors are consumed from.
//   f                      : functor applied to (loaded_in1, loaded_in2).
// The next input vectors are fetched before the functor result is computed and
// stored; the step then delegates to <FloatType, N - offset> with the output
// pointer advanced by `offset`.
100 template <typename arg1_type
,
104 static always_inline
void mp_iteration_2(FloatType
* out
, vec_type loaded_in1
, arg1_type
& in1
,
105 vec_type loaded_in2
, arg2_type
& in2
, Functor
const & f
)
107 vec_type loaded_next_in1
;
// NOTE(review): a source line before each consume() assignment is missing from
// this view; the fetches may be conditional -- confirm.
109 loaded_next_in1
= in1
.consume();
111 vec_type loaded_next_in2
;
113 loaded_next_in2
= in2
.consume();
115 vec_type result
= f(loaded_in1
, loaded_in2
);
116 result
.store_aligned(out
);
// Continue with the remaining count; the chain ends at the <FloatType, 0>
// specialization.
117 compile_time_unroller
<FloatType
, N
-offset
>::mp_iteration_2(out
+offset
, loaded_next_in1
, in1
, loaded_next_in2
, in2
, f
);
// One unrolled step for a three-input functor.
//   out            : where this step's result is stored (store_aligned).
//   loaded_in1..3  : vectors already consumed by the caller.
//   in1..in3       : input wrappers the next vectors are consumed from.
//   f              : functor applied to (loaded_in1, loaded_in2, loaded_in3).
// The next input vectors are fetched before the functor result is computed and
// stored; the step then delegates to <FloatType, N - offset> with the output
// pointer advanced by `offset`.
120 template <typename arg1_type
,
125 static always_inline
void mp_iteration_3(FloatType
* out
, vec_type loaded_in1
, arg1_type
& in1
,
126 vec_type loaded_in2
, arg2_type
& in2
,
127 vec_type loaded_in3
, arg3_type
& in3
, Functor
const & f
)
129 vec_type loaded_next_in1
;
// NOTE(review): a source line before each consume() assignment is missing from
// this view; the fetches may be conditional -- confirm.
131 loaded_next_in1
= in1
.consume();
133 vec_type loaded_next_in2
;
135 loaded_next_in2
= in2
.consume();
137 vec_type loaded_next_in3
;
139 loaded_next_in3
= in3
.consume();
141 vec_type result
= f(loaded_in1
, loaded_in2
, loaded_in3
);
142 result
.store_aligned(out
);
// Continue with the remaining count; the chain ends at the <FloatType, 0>
// specialization.
143 compile_time_unroller
<FloatType
, N
-offset
>::mp_iteration_3(out
+offset
, loaded_next_in1
, in1
, loaded_next_in2
, in2
,
144 loaded_next_in3
, in3
, f
);
// One unrolled step for a four-input functor.
//   out            : where this step's result is stored (store_aligned).
//   loaded_in1..4  : vectors already consumed by the caller.
//   in1..in4       : input wrappers the next vectors are consumed from.
//   f              : functor applied to the four loaded vectors.
// The next input vectors are fetched before the functor result is computed and
// stored; the step then delegates to <FloatType, N - offset> with the output
// pointer advanced by `offset`.
147 template <typename arg1_type
,
153 static always_inline
void mp_iteration_4(FloatType
* out
, vec_type loaded_in1
, arg1_type
& in1
, vec_type loaded_in2
, arg2_type
& in2
,
154 vec_type loaded_in3
, arg3_type
& in3
, vec_type loaded_in4
, arg4_type
& in4
, Functor
const & f
)
156 vec_type loaded_next_in1
;
// NOTE(review): a source line before each consume() assignment is missing from
// this view; the fetches may be conditional -- confirm.
158 loaded_next_in1
= in1
.consume();
160 vec_type loaded_next_in2
;
162 loaded_next_in2
= in2
.consume();
164 vec_type loaded_next_in3
;
166 loaded_next_in3
= in3
.consume();
168 vec_type loaded_next_in4
;
170 loaded_next_in4
= in4
.consume();
172 vec_type result
= f(loaded_in1
, loaded_in2
, loaded_in3
, loaded_in4
);
173 result
.store_aligned(out
);
// Continue with the remaining count; the chain ends at the <FloatType, 0>
// specialization.
175 compile_time_unroller
<FloatType
, N
-offset
>::mp_iteration_4(out
+offset
, loaded_next_in1
, in1
, loaded_next_in2
, in2
,
176 loaded_next_in3
, in3
, loaded_next_in4
, in4
, f
);
// Partial specialization for a remaining count of 0: the end of the
// compile-time recursion started by the primary template's mp_iteration_K.
180 template <typename FloatType
>
181 struct compile_time_unroller
<FloatType
, 0>
// Befriend the instantiation whose count is exactly one vector step, i.e. the
// last non-terminal step that calls into this specialization.
// NOTE(review): presumably required because the members below are private; the
// access specifier is not visible in this view.
183 friend struct compile_time_unroller
<FloatType
, vec
<FloatType
>::size
>;
// Terminal counterpart of the primary template's mp_iteration_1. The loaded
// vector and the input wrapper are accepted as unnamed const references, so
// they cannot be used by the body.
// NOTE(review): the body and the `Functor` template parameter line are not
// visible in this view; presumably the body is empty -- confirm.
186 template <typename LoadedArg1
, typename Arg1
,
189 static always_inline
void mp_iteration_1(FloatType
* out
, LoadedArg1
const &, Arg1
const &, Functor
const & f
)
// Terminal counterpart of the primary template's mp_iteration_2. The loaded
// vectors and the input wrappers are accepted as unnamed const references, so
// they cannot be used by the body.
// NOTE(review): the body and the `Functor` template parameter line are not
// visible in this view; presumably the body is empty -- confirm.
192 template <typename LoadedArg1
, typename Arg1
,
193 typename LoadedArg2
, typename Arg2
,
196 static always_inline
void mp_iteration_2(FloatType
* out
, LoadedArg1
const &, Arg1
const &,
197 LoadedArg2
const &, Arg2
const &, Functor
const & f
)
// Terminal counterpart of the primary template's mp_iteration_3. The loaded
// vectors and the input wrappers are accepted as unnamed const references, so
// they cannot be used by the body.
// NOTE(review): this signature is cut off in this view after the third input
// pair -- the trailing functor parameter, the closing parenthesis and the body
// are not visible. Presumably it mirrors the other terminal overloads; confirm.
200 template <typename LoadedArg1
, typename Arg1
,
201 typename LoadedArg2
, typename Arg2
,
202 typename LoadedArg3
, typename Arg3
,
205 static always_inline
void mp_iteration_3(FloatType
* out
, LoadedArg1
const &, Arg1
const &,
206 LoadedArg2
const &, Arg2
const &, LoadedArg3
const &, Arg3
const &,
// Terminal counterpart of the primary template's mp_iteration_4. The loaded
// vectors and the input wrappers are accepted as unnamed const references, so
// they cannot be used by the body.
// NOTE(review): the body and the `Functor` template parameter line are not
// visible in this view; presumably the body is empty -- confirm.
210 template <typename LoadedArg1
, typename Arg1
,
211 typename LoadedArg2
, typename Arg2
,
212 typename LoadedArg3
, typename Arg3
,
213 typename LoadedArg4
, typename Arg4
,
216 static always_inline
void mp_iteration_4(FloatType
* out
, LoadedArg1
const &, Arg1
const &,
217 LoadedArg2
const &, Arg2
const &, LoadedArg3
const &, Arg3
const &,
218 LoadedArg4
const &, Arg4
const &, Functor
const & f
)
// Applies a one-input functor through the compile-time unroller.
//   out  : destination buffer; the unroller writes it with store_aligned.
//   arg1 : input wrapper, taken by value -- the consume() calls made by the
//          unroller act on this local copy, not on the caller's object.
//   n    : element count.
//   f    : functor forwarded to the unroller.
// Each run() call is instantiated for vec<float_type>::objects_per_cacheline
// elements.
// NOTE(review): the statements that use `n` and advance `out` (the surrounding
// loop) are not visible in this view, nor are the `Arg1` / `Functor` template
// parameter lines -- confirm against the full file.
223 template <typename float_type
,
227 always_inline
void generate_simd_loop(float_type
* out
, Arg1 arg1
, unsigned int n
, Functor
const & f
)
229 const unsigned int per_loop
= vec
<float_type
>::objects_per_cacheline
;
232 detail::compile_time_unroller
<float_type
, per_loop
>::run(out
, arg1
, f
);
// Applies a two-input functor through the compile-time unroller.
//   out        : destination buffer; the unroller writes it with store_aligned.
//   arg1, arg2 : input wrappers, taken by value -- the consume() calls made by
//                the unroller act on these local copies.
//   n          : element count.
//   f          : functor forwarded to the unroller.
// Each run() call is instantiated for vec<float_type>::objects_per_cacheline
// elements.
// NOTE(review): the statements that use `n` and advance `out` (the surrounding
// loop) are not visible in this view, nor are the `Arg1` / `Arg2` / `Functor`
// template parameter lines -- confirm against the full file.
237 template <typename float_type
,
242 always_inline
void generate_simd_loop(float_type
* out
, Arg1 arg1
, Arg2 arg2
, unsigned int n
, Functor
const & f
)
244 const unsigned int per_loop
= vec
<float_type
>::objects_per_cacheline
;
247 detail::compile_time_unroller
<float_type
, per_loop
>::run(out
, arg1
, arg2
, f
);
// Applies a three-input functor through the compile-time unroller.
//   out              : destination buffer; the unroller writes it with
//                      store_aligned.
//   arg1, arg2, arg3 : input wrappers, taken by value -- the consume() calls
//                      made by the unroller act on these local copies.
//   n                : element count.
//   f                : functor forwarded to the unroller.
// Each run() call is instantiated for vec<float_type>::objects_per_cacheline
// elements.
// NOTE(review): the statements that use `n` and advance `out` (the surrounding
// loop) are not visible in this view, nor are the `Arg1` .. `Arg3` / `Functor`
// template parameter lines -- confirm against the full file.
252 template <typename float_type
,
258 always_inline
void generate_simd_loop(float_type
* out
, Arg1 arg1
, Arg2 arg2
, Arg3 arg3
, unsigned int n
, Functor
const & f
)
260 const unsigned int per_loop
= vec
<float_type
>::objects_per_cacheline
;
263 detail::compile_time_unroller
<float_type
, per_loop
>::run(out
, arg1
, arg2
, arg3
, f
);
// Applies a four-input functor through the compile-time unroller.
//   out                    : destination buffer; the unroller writes it with
//                            store_aligned.
//   arg1, arg2, arg3, arg4 : input wrappers, taken by value -- the consume()
//                            calls made by the unroller act on these local
//                            copies.
//   n                      : element count.
//   f                      : functor forwarded to the unroller.
// Each run() call is instantiated for vec<float_type>::objects_per_cacheline
// elements.
// NOTE(review): the statements that use `n` and advance `out` (the surrounding
// loop) are not visible in this view, nor are the `Arg1` .. `Arg4` / `Functor`
// template parameter lines -- confirm against the full file.
268 template <typename float_type
,
275 always_inline
void generate_simd_loop(float_type
* out
, Arg1 arg1
, Arg2 arg2
, Arg3 arg3
, Arg4 arg4
, unsigned int n
, Functor
const & f
)
277 const unsigned int per_loop
= vec
<float_type
>::objects_per_cacheline
;
280 detail::compile_time_unroller
<float_type
, per_loop
>::run(out
, arg1
, arg2
, arg3
, arg4
, f
);
290 #endif /* NOVA_SIMD_DETAIL_UNROLL_HELPERS_HPP */