/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H
#define GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H

#include "config.h"

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vsx_definitions.h"
#include "impl_ibm_vsx_simd_float.h"
namespace gmx
{

class Simd4Float
{
public:
    Simd4Float() {}

    // gcc-4.9 does not recognize that we use the parameter
    Simd4Float(float gmx_unused f) : simdInternal_(vec_splats(f)) {}

    // Internal utility constructor to simplify return statements
    Simd4Float(__vector float simd) : simdInternal_(simd) {}

    __vector float simdInternal_;
};
class Simd4FBool
{
public:
    Simd4FBool() {}

    //! \brief Construct from scalar bool
    Simd4FBool(bool b) :
        simdInternal_(reinterpret_cast<__vector vsxBool int>(vec_splats(b ? 0xFFFFFFFF : 0)))
    {
    }

    // Internal utility constructor to simplify return statements
    Simd4FBool(__vector vsxBool int simd) : simdInternal_(simd) {}

    __vector vsxBool int simdInternal_;
};
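// Usage sketch (illustrative, not part of the GROMACS sources): a Simd4FBool
// element is "true" when all of its bits are set, which is what lets the
// selection helpers further down implement branchless logic with plain
// bitwise AND operations. A hypothetical caller might write
//
//     Simd4Float x    = Simd4Float(1.5F);        // splat 1.5 into all elements
//     Simd4FBool mask = (x < Simd4Float(2.0F));  // all-ones in every element here
//     Simd4Float sel  = selectByMask(x, mask);   // keeps x where mask is true
//
// where x, mask, and sel are illustrative names only.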
// The VSX load & store operations are a bit of a mess. The interface is different
// for xlc version 12, xlc version 13, and gcc. Long-term IBM recommends
// simply using pointer dereferencing both for aligned and unaligned loads.
// That's nice, but unfortunately xlc still bugs out when the pointer is
// not aligned. Sticking to vec_xl/vec_xst isn't a solution either, since
// that appears to be buggy for some _aligned_ loads :-)
//
// For now, we use pointer dereferencing for all aligned load/stores, and
// for unaligned ones with gcc. On xlc we use vec_xlw4/vec_xstw4 for
// unaligned memory operations. The latest docs recommend using the overloaded
// vec_xl/vec_xst, but that is not supported on xlc version 12. We'll
// revisit things once xlc is a bit more stable - for now you probably want
// to stick to gcc.
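// A minimal usage sketch of the two flavors (illustrative only; alignedBuf
// and unalignedPtr are hypothetical names, and alignedBuf must be 16-byte
// aligned):
//
//     alignas(16) float alignedBuf[4] = { 1.0F, 2.0F, 3.0F, 4.0F };
//     Simd4Float v = load4(alignedBuf);    // aligned load
//     store4(alignedBuf, v);               // aligned store
//     Simd4Float w = load4U(unalignedPtr); // unaligned load, any float*
//     store4U(unalignedPtr, w);            // unaligned store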
static inline Simd4Float gmx_simdcall load4(const float* m)
{
    return { *reinterpret_cast<const __vector float*>(m) };
}

static inline void gmx_simdcall store4(float* m, Simd4Float a)
{
    *reinterpret_cast<__vector float*>(m) = a.simdInternal_;
}
static inline Simd4Float gmx_simdcall load4U(const float* m)
{
    return {
#if __GNUC__ < 7
        *reinterpret_cast<const __vector float*>(m)
#else
        vec_xl(0, m)
#endif
    };
}

static inline void gmx_simdcall store4U(float* m, Simd4Float a)
{
#if __GNUC__ < 7
    *reinterpret_cast<__vector float*>(m) = a.simdInternal_;
#else
    vec_xst(a.simdInternal_, 0, m);
#endif
}
static inline Simd4Float gmx_simdcall simd4SetZeroF()
{
    return { vec_splats(0.0F) };
}
static inline Simd4Float gmx_simdcall operator&(Simd4Float a, Simd4Float b)
{
    return { vec_and(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall andNot(Simd4Float a, Simd4Float b)
{
    return { vec_andc(b.simdInternal_, a.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator|(Simd4Float a, Simd4Float b)
{
    return { vec_or(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator^(Simd4Float a, Simd4Float b)
{
    return { vec_xor(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator+(Simd4Float a, Simd4Float b)
{
    return { vec_add(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator-(Simd4Float a, Simd4Float b)
{
    return { vec_sub(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator-(Simd4Float x)
{
    return { -x.simdInternal_ };
}

static inline Simd4Float gmx_simdcall operator*(Simd4Float a, Simd4Float b)
{
    return { vec_mul(a.simdInternal_, b.simdInternal_) };
}
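// The four fused multiply-add variants below map one-to-one onto VSX
// intrinsics; elementwise they compute
//
//     fma(a, b, c)  =  a * b + c   (vec_madd)
//     fms(a, b, c)  =  a * b - c   (vec_msub)
//     fnma(a, b, c) = -a * b + c   (vec_nmsub)
//     fnms(a, b, c) = -a * b - c   (vec_nmadd)
//
// with a single rounding step for each element.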
static inline Simd4Float gmx_simdcall fma(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { vec_madd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}

static inline Simd4Float gmx_simdcall fms(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { vec_msub(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}

static inline Simd4Float gmx_simdcall fnma(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { vec_nmsub(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}

static inline Simd4Float gmx_simdcall fnms(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { vec_nmadd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}
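// Note that vec_rsqrte only provides a hardware estimate of 1/sqrt(x);
// callers that need full single precision are expected to refine it, e.g.
// with the Newton-Raphson iterations in the higher-level GROMACS SIMD math
// routines.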
static inline Simd4Float gmx_simdcall rsqrt(Simd4Float x)
{
    return { vec_rsqrte(x.simdInternal_) };
}

static inline Simd4Float gmx_simdcall abs(Simd4Float x)
{
    return { vec_abs(x.simdInternal_) };
}

static inline Simd4Float gmx_simdcall max(Simd4Float a, Simd4Float b)
{
    return { vec_max(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall min(Simd4Float a, Simd4Float b)
{
    return { vec_min(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall round(Simd4Float x)
{
    return { vec_round(x.simdInternal_) };
}

static inline Simd4Float gmx_simdcall trunc(Simd4Float x)
{
    return { vec_trunc(x.simdInternal_) };
}
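// dotProduct() returns the dot product over the first three elements only,
// matching the common case of 3D coordinates held in 4-wide registers.
// perm1 swaps the two 64-bit halves of the elementwise product c, so after
// the first add element 0 holds c0 + c2; perm2 then moves product element
// c1 into position 0, so the final element 0 is c0 + c1 + c2 (c3 is never
// added in).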
static inline float gmx_simdcall dotProduct(Simd4Float a, Simd4Float b)
{
    const __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
    const __vector unsigned char perm2 = { 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 };
    __vector float               c     = vec_mul(a.simdInternal_, b.simdInternal_);
    __vector float               sum;
    sum = vec_add(c, vec_perm(c, c, perm1));
    sum = vec_add(sum, vec_perm(c, c, perm2));
    return vec_extract(sum, 0);
}
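// transpose() performs an in-place 4x4 transpose with merge operations:
// vec_mergeh/vec_mergel interleave the upper/lower halves of two registers.
// Viewing the four inputs as matrix rows,
//
//     v0 = (a0 a1 a2 a3)      t0 = mergeh(v0, v2) = (a0 c0 a1 c1)
//     v1 = (b0 b1 b2 b3)      t2 = mergeh(v1, v3) = (b0 d0 b1 d1)
//     v2 = (c0 c1 c2 c3)      mergeh(t0, t2)      = (a0 b0 c0 d0)
//     v3 = (d0 d1 d2 d3)
//
// which is the first output row; the other three rows follow analogously
// from the remaining merge combinations.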
static inline void gmx_simdcall transpose(Simd4Float* v0, Simd4Float* v1, Simd4Float* v2, Simd4Float* v3)
{
    __vector float t0 = vec_mergeh(v0->simdInternal_, v2->simdInternal_);
    __vector float t1 = vec_mergel(v0->simdInternal_, v2->simdInternal_);
    __vector float t2 = vec_mergeh(v1->simdInternal_, v3->simdInternal_);
    __vector float t3 = vec_mergel(v1->simdInternal_, v3->simdInternal_);
    v0->simdInternal_ = vec_mergeh(t0, t2);
    v1->simdInternal_ = vec_mergel(t0, t2);
    v2->simdInternal_ = vec_mergeh(t1, t3);
    v3->simdInternal_ = vec_mergel(t1, t3);
}
static inline Simd4FBool gmx_simdcall operator==(Simd4Float a, Simd4Float b)
{
    return { vec_cmpeq(a.simdInternal_, b.simdInternal_) };
}
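// VSX has no direct "not equal" comparison, so operator!= below is composed
// from greater-than and less-than. One consequence: if either operand is
// NaN, both sub-comparisons are false, so NaN elements yield false ("not
// different"), unlike the IEEE != relation, which is true for NaN.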
static inline Simd4FBool gmx_simdcall operator!=(Simd4Float a, Simd4Float b)
{
    return { vec_or(vec_cmpgt(a.simdInternal_, b.simdInternal_),
                    vec_cmplt(a.simdInternal_, b.simdInternal_)) };
}

static inline Simd4FBool gmx_simdcall operator<(Simd4Float a, Simd4Float b)
{
    return { vec_cmplt(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4FBool gmx_simdcall operator<=(Simd4Float a, Simd4Float b)
{
    return { vec_cmple(a.simdInternal_, b.simdInternal_) };
}
static inline Simd4FBool gmx_simdcall operator&&(Simd4FBool a, Simd4FBool b)
{
    return { vec_and(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4FBool gmx_simdcall operator||(Simd4FBool a, Simd4FBool b)
{
    return { vec_or(a.simdInternal_, b.simdInternal_) };
}

static inline bool gmx_simdcall anyTrue(Simd4FBool a)
{
    return vec_any_ne(a.simdInternal_, reinterpret_cast<__vector vsxBool int>(vec_splats(0)));
}

static inline Simd4Float gmx_simdcall selectByMask(Simd4Float a, Simd4FBool m)
{
    return { vec_and(a.simdInternal_, reinterpret_cast<__vector float>(m.simdInternal_)) };
}

static inline Simd4Float gmx_simdcall selectByNotMask(Simd4Float a, Simd4FBool m)
{
    return { vec_andc(a.simdInternal_, reinterpret_cast<__vector float>(m.simdInternal_)) };
}

static inline Simd4Float gmx_simdcall blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
{
    return { vec_sel(a.simdInternal_, b.simdInternal_, sel.simdInternal_) };
}
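// reduce() uses the same two permutations as dotProduct() above, but the
// second add folds the already-summed vector rather than the raw input:
// after the first add element 0 holds x0 + x2 and element 1 holds x1 + x3,
// and the second add brings element 1 into position 0, giving the full sum
// x0 + x1 + x2 + x3 in element 0.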
static inline float gmx_simdcall reduce(Simd4Float x)
{
    const __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
    const __vector unsigned char perm2 = { 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 };

    x.simdInternal_ = vec_add(x.simdInternal_, vec_perm(x.simdInternal_, x.simdInternal_, perm1));
    x.simdInternal_ = vec_add(x.simdInternal_, vec_perm(x.simdInternal_, x.simdInternal_, perm2));
    return vec_extract(x.simdInternal_, 0);
}
} // namespace gmx

#endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H