/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H
#define GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H

#include "config.h"

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vsx_definitions.h"
#include "impl_ibm_vsx_simd_float.h"
namespace gmx
{
class Simd4Float
{
public:
    Simd4Float() {}

    // gcc-4.9 does not recognize that we use the parameter
    Simd4Float(float gmx_unused f) : simdInternal_(vec_splats(f)) {}

    // Internal utility constructor to simplify return statements
    Simd4Float(__vector float simd) : simdInternal_(simd) {}

    __vector float simdInternal_;
};
class Simd4FBool
{
public:
    Simd4FBool() {}

    //! \brief Construct from scalar bool
    Simd4FBool(bool b) :
        simdInternal_(reinterpret_cast<__vector vsxBool int>(vec_splats(b ? 0xFFFFFFFF : 0)))
    {
    }

    // Internal utility constructor to simplify return statements
    Simd4FBool(__vector vsxBool int simd) : simdInternal_(simd) {}

    __vector vsxBool int simdInternal_;
};
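
// Note: a "true" lane in Simd4FBool is represented as all bits set
// (0xFFFFFFFF), which is what lets selectByMask()/selectByNotMask() further
// down be implemented as plain bitwise AND/ANDC operations.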
// The VSX load & store operations are a bit of a mess. The interface is different
// for xlc version 12, xlc version 13, and gcc. Long-term IBM recommends
// simply using pointer dereferencing both for aligned and unaligned loads.
// That's nice, but unfortunately xlc still bugs out when the pointer is
// not aligned. Sticking to vec_xl/vec_xst isn't a solution either, since
// that appears to be buggy for some _aligned_ loads :-)
//
// For now, we use pointer dereferencing for all aligned load/stores, and
// for unaligned ones with gcc. On xlc we use vec_xlw4/vec_xstw4 for
// unaligned memory operations. The latest docs recommend using the overloaded
// vec_xl/vec_xst, but that is not supported on xlc version 12. We'll
// revisit things once xlc is a bit more stable - for now you probably want
// to stick to gcc...
static inline Simd4Float gmx_simdcall load4(const float* m)
{
    return { *reinterpret_cast<const __vector float*>(m) };
}

static inline void gmx_simdcall store4(float* m, Simd4Float a)
{
    *reinterpret_cast<__vector float*>(m) = a.simdInternal_;
}

static inline Simd4Float gmx_simdcall load4U(const float* m)
{
    return {
#if __GNUC__ < 7
        *reinterpret_cast<const __vector float*>(m)
#else
        vec_xl(0, m)
#endif
    };
}

static inline void gmx_simdcall store4U(float* m, Simd4Float a)
{
#if __GNUC__ < 7
    *reinterpret_cast<__vector float*>(m) = a.simdInternal_;
#else
    vec_xst(a.simdInternal_, 0, m);
#endif
}
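
// A minimal usage sketch of the four load/store variants above. The buffer
// names are hypothetical, and alignas(16) stands in for whatever aligned
// allocation the caller actually uses:
//
//     alignas(16) float aligned[4] = { 1.0F, 2.0F, 3.0F, 4.0F };
//     float             raw[5]     = { 0.0F };
//
//     Simd4Float v = load4(aligned);  // pointer must be 16-byte aligned
//     store4(aligned, v);             // likewise
//     v = load4U(raw + 1);            // any alignment is fine
//     store4U(raw + 1, v);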
static inline Simd4Float gmx_simdcall simd4SetZeroF()
{
    return { vec_splats(0.0F) };
}

static inline Simd4Float gmx_simdcall operator&(Simd4Float a, Simd4Float b)
{
    return { vec_and(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall andNot(Simd4Float a, Simd4Float b)
{
    return { vec_andc(b.simdInternal_, a.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator|(Simd4Float a, Simd4Float b)
{
    return { vec_or(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator^(Simd4Float a, Simd4Float b)
{
    return { vec_xor(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator+(Simd4Float a, Simd4Float b)
{
    return { vec_add(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator-(Simd4Float a, Simd4Float b)
{
    return { vec_sub(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator-(Simd4Float x)
{
    return { -x.simdInternal_ };
}

static inline Simd4Float gmx_simdcall operator*(Simd4Float a, Simd4Float b)
{
    return { vec_mul(a.simdInternal_, b.simdInternal_) };
}
static inline Simd4Float gmx_simdcall fma(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { vec_madd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}

static inline Simd4Float gmx_simdcall fms(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { vec_msub(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}

static inline Simd4Float gmx_simdcall fnma(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { vec_nmsub(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}

static inline Simd4Float gmx_simdcall fnms(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { vec_nmadd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}
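
// Per-element scalar semantics of the fused multiply-add family above:
//     fma(a, b, c)  =  a * b + c
//     fms(a, b, c)  =  a * b - c
//     fnma(a, b, c) = -a * b + c
//     fnms(a, b, c) = -a * b - c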
static inline Simd4Float gmx_simdcall rsqrt(Simd4Float x)
{
    return { vec_rsqrte(x.simdInternal_) };
}

static inline Simd4Float gmx_simdcall abs(Simd4Float x)
{
    return { vec_abs(x.simdInternal_) };
}

static inline Simd4Float gmx_simdcall max(Simd4Float a, Simd4Float b)
{
    return { vec_max(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall min(Simd4Float a, Simd4Float b)
{
    return { vec_min(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall round(Simd4Float x)
{
    return { vec_round(x.simdInternal_) };
}

static inline Simd4Float gmx_simdcall trunc(Simd4Float x)
{
    return { vec_trunc(x.simdInternal_) };
}
static inline float gmx_simdcall dotProduct(Simd4Float a, Simd4Float b)
{
    const __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
    const __vector unsigned char perm2 = { 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 };
    __vector float               c     = vec_mul(a.simdInternal_, b.simdInternal_);
    __vector float               sum;
    sum = vec_add(c, vec_perm(c, c, perm1));
    sum = vec_add(sum, vec_perm(c, c, perm2));
    return vec_extract(sum, 0);
}
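
// Note that dotProduct() sums only the first three element products, as is
// conventional for 3D-vector work: perm1 rotates the halves so lane 0 picks
// up c2, and perm2 (applied to the original product c, not the partial sum)
// adds c1, leaving c0 + c1 + c2 in lane 0.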
static inline void gmx_simdcall transpose(Simd4Float* v0, Simd4Float* v1, Simd4Float* v2, Simd4Float* v3)
{
    __vector float t0 = vec_mergeh(v0->simdInternal_, v2->simdInternal_);
    __vector float t1 = vec_mergel(v0->simdInternal_, v2->simdInternal_);
    __vector float t2 = vec_mergeh(v1->simdInternal_, v3->simdInternal_);
    __vector float t3 = vec_mergel(v1->simdInternal_, v3->simdInternal_);
    v0->simdInternal_ = vec_mergeh(t0, t2);
    v1->simdInternal_ = vec_mergel(t0, t2);
    v2->simdInternal_ = vec_mergeh(t1, t3);
    v3->simdInternal_ = vec_mergel(t1, t3);
}
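
// The two merge rounds above implement the standard 4x4 in-register
// transpose: treating v0..v3 as the rows of a matrix m, the net effect is
// m[i][j] <-> m[j][i].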
static inline Simd4FBool gmx_simdcall operator==(Simd4Float a, Simd4Float b)
{
    return { vec_cmpeq(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4FBool gmx_simdcall operator!=(Simd4Float a, Simd4Float b)
{
    return { vec_or(vec_cmpgt(a.simdInternal_, b.simdInternal_),
                    vec_cmplt(a.simdInternal_, b.simdInternal_)) };
}

static inline Simd4FBool gmx_simdcall operator<(Simd4Float a, Simd4Float b)
{
    return { vec_cmplt(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4FBool gmx_simdcall operator<=(Simd4Float a, Simd4Float b)
{
    return { vec_cmple(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4FBool gmx_simdcall operator&&(Simd4FBool a, Simd4FBool b)
{
    return { vec_and(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4FBool gmx_simdcall operator||(Simd4FBool a, Simd4FBool b)
{
    return { vec_or(a.simdInternal_, b.simdInternal_) };
}
static inline bool gmx_simdcall anyTrue(Simd4FBool a)
{
    return vec_any_ne(a.simdInternal_, reinterpret_cast<__vector vsxBool int>(vec_splats(0)));
}

static inline Simd4Float gmx_simdcall selectByMask(Simd4Float a, Simd4FBool m)
{
    return { vec_and(a.simdInternal_, reinterpret_cast<__vector float>(m.simdInternal_)) };
}

static inline Simd4Float gmx_simdcall selectByNotMask(Simd4Float a, Simd4FBool m)
{
    return { vec_andc(a.simdInternal_, reinterpret_cast<__vector float>(m.simdInternal_)) };
}

static inline Simd4Float gmx_simdcall blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
{
    return { vec_sel(a.simdInternal_, b.simdInternal_, sel.simdInternal_) };
}
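
// Per-lane scalar semantics sketch for the selection helpers above:
//     selectByMask(a, m)[i]    == m[i] ? a[i] : 0.0F
//     selectByNotMask(a, m)[i] == m[i] ? 0.0F : a[i]
//     blend(a, b, sel)[i]      == sel[i] ? b[i] : a[i]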
static inline float gmx_simdcall reduce(Simd4Float x)
{
    const __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
    const __vector unsigned char perm2 = { 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 };

    x.simdInternal_ = vec_add(x.simdInternal_, vec_perm(x.simdInternal_, x.simdInternal_, perm1));
    x.simdInternal_ = vec_add(x.simdInternal_, vec_perm(x.simdInternal_, x.simdInternal_, perm2));
    return vec_extract(x.simdInternal_, 0);
}
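
// Unlike dotProduct(), reduce() permutes the running sum in its second step,
// so lane 0 ends up with the full four-element sum x0 + x1 + x2 + x3.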
} // namespace gmx

#endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H