src/gromacs/simd/impl_ibm_vsx/impl_ibm_vsx_simd4_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H
  37 #define GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H
  38
  39 #include "config.h"
  40
  41 #include "gromacs/utility/basedefinitions.h"
  42
  43 #include "impl_ibm_vsx_definitions.h"
  44 #include "impl_ibm_vsx_simd_float.h"
  45
  46 namespace gmx
  47 {
  48
  49 class Simd4Float
  50 {
  51     public:
  52         Simd4Float() {}
  53
  54         // gcc-4.9 does not recognize that we use the parameter
  55         Simd4Float(float gmx_unused f) : simdInternal_(vec_splats(f)) {}
  56
  57         // Internal utility constructor to simplify return statements
  58         Simd4Float(__vector float simd) : simdInternal_(simd) {}
  59
  60         __vector float  simdInternal_;
  61 };
  62
  63 class Simd4FBool
  64 {
  65     public:
  66         Simd4FBool() {}
  67
  68         //! \brief Construct from scalar bool
  69         Simd4FBool(bool b) : simdInternal_(reinterpret_cast<__vector vsxBool int>(vec_splats( b ? 0xFFFFFFFF : 0))) {}
  70
  71         // Internal utility constructor to simplify return statements
  72         Simd4FBool(__vector vsxBool int simd) : simdInternal_(simd) {}
  73
  74         __vector vsxBool int  simdInternal_;
  75 };
  76
// The VSX load & store operations are a bit of a mess. The interface is different
// for xlc version 12, xlc version 13, and gcc. Long-term IBM recommends
// simply using pointer dereferencing both for aligned and unaligned loads.
// That's nice, but unfortunately xlc still bugs out when the pointer is
// not aligned. Sticking to vec_xl/vec_xst isn't a solution either, since
// that appears to be buggy for some _aligned_ loads :-)
//
// For now, we use pointer dereferencing for all aligned loads/stores. For
// unaligned ones we also dereference the pointer when __GNUC__ is below 7,
// and use the overloaded vec_xl/vec_xst otherwise. Note that the overloaded
// vec_xl/vec_xst are not supported on xlc version 12. We'll revisit things
// once xlc is a bit more stable - for now you probably want to stick to gcc...
  90
  91 static inline Simd4Float gmx_simdcall
  92 load4(const float *m)
  93 {
  94     return {
  95                *reinterpret_cast<const __vector float *>(m)
  96     };
  97 }
  98
  99 static inline void gmx_simdcall
 100 store4(float *m, Simd4Float a)
 101 {
 102     *reinterpret_cast<__vector float *>(m) = a.simdInternal_;
 103 }
 104
 105 static inline Simd4Float gmx_simdcall
 106 load4U(const float *m)
 107 {
 108     return {
 109 #if __GNUC__ < 7
 110                *reinterpret_cast<const __vector float *>(m)
 111 #else
 112                vec_xl(0, m)
 113 #endif
 114     };
 115 }
 116
 117 static inline void gmx_simdcall
 118 store4U(float *m, Simd4Float a)
 119 {
 120 #if __GNUC__ < 7
 121     *reinterpret_cast<__vector float *>(m) = a.simdInternal_;
 122 #else
 123     vec_xst(a.simdInternal_, 0, m);
 124 #endif
 125 }
 126
 127 static inline Simd4Float gmx_simdcall
 128 simd4SetZeroF()
 129 {
 130     return {
 131                vec_splats(0.0f)
 132     };
 133 }
 134
 135 static inline Simd4Float gmx_simdcall
 136 operator&(Simd4Float a, Simd4Float b)
 137 {
 138     return {
 139                vec_and(a.simdInternal_, b.simdInternal_)
 140     };
 141 }
 142
 143 static inline Simd4Float gmx_simdcall
 144 andNot(Simd4Float a, Simd4Float b)
 145 {
 146     return {
 147                vec_andc(b.simdInternal_, a.simdInternal_)
 148     };
 149 }
 150
 151 static inline Simd4Float gmx_simdcall
 152 operator|(Simd4Float a, Simd4Float b)
 153 {
 154     return {
 155                vec_or(a.simdInternal_, b.simdInternal_)
 156     };
 157 }
 158
 159 static inline Simd4Float gmx_simdcall
 160 operator^(Simd4Float a, Simd4Float b)
 161 {
 162     return {
 163                vec_xor(a.simdInternal_, b.simdInternal_)
 164     };
 165 }
 166
 167 static inline Simd4Float gmx_simdcall
 168 operator+(Simd4Float a, Simd4Float b)
 169 {
 170     return {
 171                vec_add(a.simdInternal_, b.simdInternal_)
 172     };
 173 }
 174
 175 static inline Simd4Float gmx_simdcall
 176 operator-(Simd4Float a, Simd4Float b)
 177 {
 178     return {
 179                vec_sub(a.simdInternal_, b.simdInternal_)
 180     };
 181 }
 182
 183 static inline Simd4Float gmx_simdcall
 184 operator-(Simd4Float x)
 185 {
 186     return {
 187                -x.simdInternal_
 188     };
 189 }
 190
 191 static inline Simd4Float gmx_simdcall
 192 operator*(Simd4Float a, Simd4Float b)
 193 {
 194     return {
 195                vec_mul(a.simdInternal_, b.simdInternal_)
 196     };
 197 }
 198
 199 static inline Simd4Float gmx_simdcall
 200 fma(Simd4Float a, Simd4Float b, Simd4Float c)
 201 {
 202     return {
 203                vec_madd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
 204     };
 205 }
 206
 207 static inline Simd4Float gmx_simdcall
 208 fms(Simd4Float a, Simd4Float b, Simd4Float c)
 209 {
 210     return {
 211                vec_msub(a.simdInternal_, b.simdInternal_, c.simdInternal_)
 212     };
 213 }
 214
 215 static inline Simd4Float gmx_simdcall
 216 fnma(Simd4Float a, Simd4Float b, Simd4Float c)
 217 {
 218     return {
 219                vec_nmsub(a.simdInternal_, b.simdInternal_, c.simdInternal_)
 220     };
 221 }
 222
 223 static inline Simd4Float gmx_simdcall
 224 fnms(Simd4Float a, Simd4Float b, Simd4Float c)
 225 {
 226     return {
 227                vec_nmadd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
 228     };
 229 }
 230
 231 static inline Simd4Float gmx_simdcall
 232 rsqrt(Simd4Float x)
 233 {
 234     return {
 235                vec_rsqrte(x.simdInternal_)
 236     };
 237 }
 238
 239 static inline Simd4Float gmx_simdcall
 240 abs(Simd4Float x)
 241 {
 242     return {
 243                vec_abs( x.simdInternal_ )
 244     };
 245 }
 246
 247 static inline Simd4Float gmx_simdcall
 248 max(Simd4Float a, Simd4Float b)
 249 {
 250     return {
 251                vec_max(a.simdInternal_, b.simdInternal_)
 252     };
 253 }
 254
 255 static inline Simd4Float gmx_simdcall
 256 min(Simd4Float a, Simd4Float b)
 257 {
 258     return {
 259                vec_min(a.simdInternal_, b.simdInternal_)
 260     };
 261 }
 262
 263 static inline Simd4Float gmx_simdcall
 264 round(Simd4Float x)
 265 {
 266     return {
 267                vec_round( x.simdInternal_ )
 268     };
 269 }
 270
 271 static inline Simd4Float gmx_simdcall
 272 trunc(Simd4Float x)
 273 {
 274     return {
 275                vec_trunc( x.simdInternal_ )
 276     };
 277 }
 278
 279 static inline float gmx_simdcall
 280 dotProduct(Simd4Float a, Simd4Float b)
 281 {
 282     const __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
 283     const __vector unsigned char perm2 = { 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 };
 284     __vector float               c     = vec_mul(a.simdInternal_, b.simdInternal_);
 285     __vector float               sum;
 286     sum = vec_add(c, vec_perm(c, c, perm1));
 287     sum = vec_add(sum, vec_perm(c, c, perm2));
 288     return vec_extract(sum, 0);
 289 }
 290
 291 static inline void gmx_simdcall
 292 transpose(Simd4Float * v0, Simd4Float * v1,
 293           Simd4Float * v2, Simd4Float * v3)
 294 {
 295     __vector float t0 = vec_mergeh(v0->simdInternal_, v2->simdInternal_);
 296     __vector float t1 = vec_mergel(v0->simdInternal_, v2->simdInternal_);
 297     __vector float t2 = vec_mergeh(v1->simdInternal_, v3->simdInternal_);
 298     __vector float t3 = vec_mergel(v1->simdInternal_, v3->simdInternal_);
 299     v0->simdInternal_ = vec_mergeh(t0, t2);
 300     v1->simdInternal_ = vec_mergel(t0, t2);
 301     v2->simdInternal_ = vec_mergeh(t1, t3);
 302     v3->simdInternal_ = vec_mergel(t1, t3);
 303 }
 304
 305 static inline Simd4FBool gmx_simdcall
 306 operator==(Simd4Float a, Simd4Float b)
 307 {
 308     return {
 309                vec_cmpeq(a.simdInternal_, b.simdInternal_)
 310     };
 311 }
 312
 313 static inline Simd4FBool gmx_simdcall
 314 operator!=(Simd4Float a, Simd4Float b)
 315 {
 316     return {
 317                vec_or(vec_cmpgt(a.simdInternal_, b.simdInternal_),
 318                       vec_cmplt(a.simdInternal_, b.simdInternal_))
 319     };
 320 }
 321
 322 static inline Simd4FBool gmx_simdcall
 323 operator<(Simd4Float a, Simd4Float b)
 324 {
 325     return {
 326                vec_cmplt(a.simdInternal_, b.simdInternal_)
 327     };
 328 }
 329
 330 static inline Simd4FBool gmx_simdcall
 331 operator<=(Simd4Float a, Simd4Float b)
 332 {
 333     return {
 334                vec_cmple(a.simdInternal_, b.simdInternal_)
 335     };
 336 }
 337
 338 static inline Simd4FBool gmx_simdcall
 339 operator&&(Simd4FBool a, Simd4FBool b)
 340 {
 341     return {
 342                vec_and(a.simdInternal_, b.simdInternal_)
 343     };
 344 }
 345
 346 static inline Simd4FBool gmx_simdcall
 347 operator||(Simd4FBool a, Simd4FBool b)
 348 {
 349     return {
 350                vec_or(a.simdInternal_, b.simdInternal_)
 351     };
 352 }
 353
 354 static inline bool gmx_simdcall
 355 anyTrue(Simd4FBool a)
 356 {
 357     return vec_any_ne(a.simdInternal_, reinterpret_cast<__vector vsxBool int>(vec_splats(0)));
 358 }
 359
 360 static inline Simd4Float gmx_simdcall
 361 selectByMask(Simd4Float a, Simd4FBool m)
 362 {
 363     return {
 364                vec_and(a.simdInternal_, reinterpret_cast<__vector float>(m.simdInternal_))
 365     };
 366 }
 367
 368 static inline Simd4Float gmx_simdcall
 369 selectByNotMask(Simd4Float a, Simd4FBool m)
 370 {
 371     return {
 372                vec_andc(a.simdInternal_, reinterpret_cast<__vector float>(m.simdInternal_))
 373     };
 374 }
 375
 376 static inline Simd4Float gmx_simdcall
 377 blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
 378 {
 379     return {
 380                vec_sel(a.simdInternal_, b.simdInternal_, sel.simdInternal_)
 381     };
 382 }
 383
 384 static inline float gmx_simdcall
 385 reduce(Simd4Float x)
 386 {
 387     const __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
 388     const __vector unsigned char perm2 = { 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 };
 389
 390     x.simdInternal_ = vec_add(x.simdInternal_, vec_perm(x.simdInternal_, x.simdInternal_, perm1));
 391     x.simdInternal_ = vec_add(x.simdInternal_, vec_perm(x.simdInternal_, x.simdInternal_, perm2));
 392     return vec_extract(x.simdInternal_, 0);
 393 }
 394
 395 }      // namespace gmx
 396
 397 #endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H