detail/unroll_helpers.hpp

   1 //  unroll helpers
   2 //  Copyright (C) 2010 Tim Blechmann
   3 //
   4 //  This program is free software; you can redistribute it and/or modify
   5 //  it under the terms of the GNU General Public License as published by
   6 //  the Free Software Foundation; either version 2 of the License, or
   7 //  (at your option) any later version.
   8 //
   9 //  This program is distributed in the hope that it will be useful,
  10 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 //  GNU General Public License for more details.
  13 //
  14 //  You should have received a copy of the GNU General Public License
  15 //  along with this program; see the file COPYING.  If not, write to
  16 //  the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  17 //  Boston, MA 02111-1307, USA.
  18
  19 #ifndef NOVA_SIMD_DETAIL_UNROLL_HELPERS_HPP
  20 #define NOVA_SIMD_DETAIL_UNROLL_HELPERS_HPP
  21
  22 #include "../vec.hpp"
  23
  24 #if defined(__GNUC__) && defined(NDEBUG)
  25 #define always_inline inline  __attribute__((always_inline))
  26 #else
  27 #define always_inline inline
  28 #endif
  29
  30 namespace nova {
  31 namespace detail {
  32
  33 template <typename FloatType,
  34           int N
  35          >
  36 struct compile_time_unroller
  37 {
  38     typedef vec<FloatType> vec_type;
  39
  40     static const int offset = vec_type::size;
  41
  42     template <typename arg1_type,
  43               typename Functor
  44              >
  45     static always_inline void run(FloatType * out, arg1_type & in1, Functor const & f)
  46     {
  47         compile_time_unroller<FloatType, N>::mp_iteration_1(out, in1.consume(), in1, f);
  48     }
  49
  50     template <typename arg1_type,
  51               typename arg2_type,
  52               typename Functor
  53              >
  54     static always_inline void run(FloatType * out, arg1_type & in1, arg2_type & in2, Functor const & f)
  55     {
  56         compile_time_unroller<FloatType, N>::mp_iteration_2(out, in1.consume(), in1, in2.consume(), in2, f);
  57     }
  58
  59     template <typename arg1_type,
  60               typename arg2_type,
  61               typename arg3_type,
  62               typename Functor
  63              >
  64     static always_inline void run(FloatType * out, arg1_type & in1, arg2_type & in2,
  65                                   arg3_type & in3, Functor const & f)
  66     {
  67         compile_time_unroller<FloatType, N>::mp_iteration_3(out, in1.consume(), in1, in2.consume(), in2, in3.consume(), in3, f);
  68     }
  69
  70     template <typename arg1_type,
  71               typename arg2_type,
  72               typename arg3_type,
  73               typename arg4_type,
  74               typename Functor
  75              >
  76     static always_inline void run(FloatType * out, arg1_type & in1, arg2_type & in2,
  77                                   arg3_type & in3, arg4_type & in4, Functor const & f)
  78     {
  79         compile_time_unroller<FloatType, N>::mp_iteration_4(out, in1.consume(), in1, in2.consume(), in2,
  80                                                             in3.consume(), in3, in4.consume(), in4, f);
  81     }
  82
  83 private:
  84     friend struct compile_time_unroller<FloatType, vec_type::size + N>;
  85
  86     template <typename arg1_type,
  87               typename Functor
  88              >
  89     static always_inline void mp_iteration_1(FloatType * out, vec_type loaded_in1, arg1_type & in1, Functor const & f)
  90     {
  91         vec_type loaded_next_in1;
  92         if (N != offset)
  93             loaded_next_in1 = in1.consume();
  94
  95         vec_type result = f(loaded_in1);
  96         result.store_aligned(out);
  97         compile_time_unroller<FloatType, N-offset>::mp_iteration_1(out+offset, loaded_next_in1, in1, f);
  98     }
  99
 100     template <typename arg1_type,
 101               typename arg2_type,
 102               typename Functor
 103              >
 104     static always_inline void mp_iteration_2(FloatType * out, vec_type loaded_in1, arg1_type & in1,
 105                                              vec_type loaded_in2, arg2_type & in2, Functor const & f)
 106     {
 107         vec_type loaded_next_in1;
 108         if (N != offset)
 109             loaded_next_in1 = in1.consume();
 110
 111         vec_type loaded_next_in2;
 112         if (N != offset)
 113             loaded_next_in2 = in2.consume();
 114
 115         vec_type result = f(loaded_in1, loaded_in2);
 116         result.store_aligned(out);
 117         compile_time_unroller<FloatType, N-offset>::mp_iteration_2(out+offset, loaded_next_in1, in1, loaded_next_in2, in2, f);
 118     }
 119
 120     template <typename arg1_type,
 121               typename arg2_type,
 122               typename arg3_type,
 123               typename Functor
 124              >
 125     static always_inline void mp_iteration_3(FloatType * out, vec_type loaded_in1, arg1_type & in1,
 126                                              vec_type loaded_in2, arg2_type & in2,
 127                                              vec_type loaded_in3, arg3_type & in3, Functor const & f)
 128     {
 129         vec_type loaded_next_in1;
 130         if (N != offset)
 131             loaded_next_in1 = in1.consume();
 132
 133         vec_type loaded_next_in2;
 134         if (N != offset)
 135             loaded_next_in2 = in2.consume();
 136
 137         vec_type loaded_next_in3;
 138         if (N != offset)
 139             loaded_next_in3 = in3.consume();
 140
 141         vec_type result = f(loaded_in1, loaded_in2, loaded_in3);
 142         result.store_aligned(out);
 143         compile_time_unroller<FloatType, N-offset>::mp_iteration_3(out+offset, loaded_next_in1, in1, loaded_next_in2, in2,
 144                                                                    loaded_next_in3, in3, f);
 145     }
 146
 147     template <typename arg1_type,
 148               typename arg2_type,
 149               typename arg3_type,
 150               typename arg4_type,
 151               typename Functor
 152              >
 153     static always_inline void mp_iteration_4(FloatType * out, vec_type loaded_in1, arg1_type & in1, vec_type loaded_in2, arg2_type & in2,
 154                                              vec_type loaded_in3, arg3_type & in3, vec_type loaded_in4, arg4_type & in4, Functor const & f)
 155     {
 156         vec_type loaded_next_in1;
 157         if (N != offset)
 158             loaded_next_in1 = in1.consume();
 159
 160         vec_type loaded_next_in2;
 161         if (N != offset)
 162             loaded_next_in2 = in2.consume();
 163
 164         vec_type loaded_next_in3;
 165         if (N != offset)
 166             loaded_next_in3 = in3.consume();
 167
 168         vec_type loaded_next_in4;
 169         if (N != offset)
 170             loaded_next_in4 = in4.consume();
 171
 172         vec_type result = f(loaded_in1, loaded_in2, loaded_in3, loaded_in4);
 173         result.store_aligned(out);
 174
 175         compile_time_unroller<FloatType, N-offset>::mp_iteration_4(out+offset, loaded_next_in1, in1, loaded_next_in2, in2,
 176                                                                    loaded_next_in3, in3, loaded_next_in4, in4, f);
 177     }
 178 };
 179
 180 template <typename FloatType>
 181 struct compile_time_unroller<FloatType, 0>
 182 {
 183     friend struct compile_time_unroller<FloatType, vec<FloatType>::size>;
 184
 185 private:
 186     template <typename LoadedArg1, typename Arg1,
 187               typename Functor
 188              >
 189     static always_inline void mp_iteration_1(FloatType * out, LoadedArg1 const &, Arg1 const &, Functor const & f)
 190     {}
 191
 192     template <typename LoadedArg1, typename Arg1,
 193               typename LoadedArg2, typename Arg2,
 194               typename Functor
 195              >
 196     static always_inline void mp_iteration_2(FloatType * out, LoadedArg1 const &, Arg1 const &,
 197                                              LoadedArg2 const &, Arg2 const &, Functor const & f)
 198     {}
 199
 200     template <typename LoadedArg1, typename Arg1,
 201               typename LoadedArg2, typename Arg2,
 202               typename LoadedArg3, typename Arg3,
 203               typename Functor
 204              >
 205     static always_inline void mp_iteration_3(FloatType * out, LoadedArg1 const &, Arg1 const &,
 206                                              LoadedArg2 const &, Arg2 const &, LoadedArg3 const &, Arg3 const &,
 207                                              Functor const & f)
 208     {}
 209
 210     template <typename LoadedArg1, typename Arg1,
 211               typename LoadedArg2, typename Arg2,
 212               typename LoadedArg3, typename Arg3,
 213               typename LoadedArg4, typename Arg4,
 214               typename Functor
 215              >
 216     static always_inline void mp_iteration_4(FloatType * out, LoadedArg1 const &, Arg1 const &,
 217                                              LoadedArg2 const &, Arg2 const &, LoadedArg3 const &, Arg3 const &,
 218                                              LoadedArg4 const &, Arg4 const &, Functor const & f)
 219     {}
 220 };
 221
 222
 223 template <typename float_type,
 224           typename Arg1,
 225           typename Functor
 226          >
 227 always_inline void generate_simd_loop(float_type * out, Arg1 arg1, unsigned int n, Functor const & f)
 228 {
 229     const unsigned int per_loop = vec<float_type>::objects_per_cacheline;
 230     n /= per_loop;
 231     do {
 232         detail::compile_time_unroller<float_type, per_loop>::run(out, arg1, f);
 233         out += per_loop;
 234     } while (--n);
 235 }
 236
 237 template <typename float_type,
 238           typename Arg1,
 239           typename Arg2,
 240           typename Functor
 241          >
 242 always_inline void generate_simd_loop(float_type * out, Arg1 arg1, Arg2 arg2, unsigned int n, Functor const & f)
 243 {
 244     const unsigned int per_loop = vec<float_type>::objects_per_cacheline;
 245     n /= per_loop;
 246     do {
 247         detail::compile_time_unroller<float_type, per_loop>::run(out, arg1, arg2, f);
 248         out += per_loop;
 249     } while (--n);
 250 }
 251
 252 template <typename float_type,
 253           typename Arg1,
 254           typename Arg2,
 255           typename Arg3,
 256           typename Functor
 257          >
 258 always_inline void generate_simd_loop(float_type * out, Arg1 arg1, Arg2 arg2, Arg3 arg3, unsigned int n, Functor const & f)
 259 {
 260     const unsigned int per_loop = vec<float_type>::objects_per_cacheline;
 261     n /= per_loop;
 262     do {
 263         detail::compile_time_unroller<float_type, per_loop>::run(out, arg1, arg2, arg3, f);
 264         out += per_loop;
 265     } while (--n);
 266 }
 267
 268 template <typename float_type,
 269           typename Arg1,
 270           typename Arg2,
 271           typename Arg3,
 272           typename Arg4,
 273           typename Functor
 274          >
 275 always_inline void generate_simd_loop(float_type * out, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, unsigned int n, Functor const & f)
 276 {
 277     const unsigned int per_loop = vec<float_type>::objects_per_cacheline;
 278     n /= per_loop;
 279     do {
 280         detail::compile_time_unroller<float_type, per_loop>::run(out, arg1, arg2, arg3, arg4, f);
 281         out += per_loop;
 282     } while (--n);
 283 }
 284
 285 }
 286 }
 287
 288 #undef always_inline
 289
 290 #endif /* NOVA_SIMD_DETAIL_UNROLL_HELPERS_HPP */