/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H
#define GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H

#include "config.h"

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vsx_definitions.h"
#include "impl_ibm_vsx_simd_float.h"
namespace gmx
{

class Simd4Float
{
public:
    Simd4Float() {}

    // gcc-4.9 does not recognize that we use the parameter
    Simd4Float(float gmx_unused f) : simdInternal_(vec_splats(f)) {}

    // Internal utility constructor to simplify return statements
    Simd4Float(__vector float simd) : simdInternal_(simd) {}

    __vector float simdInternal_;
};
class Simd4FBool
{
public:
    Simd4FBool() {}

    //! \brief Construct from scalar bool
    Simd4FBool(bool b) :
        simdInternal_(reinterpret_cast<__vector vsxBool int>(vec_splats(b ? 0xFFFFFFFF : 0)))
    {
    }

    // Internal utility constructor to simplify return statements
    Simd4FBool(__vector vsxBool int simd) : simdInternal_(simd) {}

    __vector vsxBool int simdInternal_;
};
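// Usage sketch (illustrative, not part of the GROMACS sources): a Simd4FBool
// element is "true" when all of its bits are set, which is what lets the
// selection helpers further down implement branchless logic with plain
// bitwise AND operations. A hypothetical caller might write
//
//     Simd4Float x    = Simd4Float(1.5F);        // splat 1.5 into all elements
//     Simd4FBool mask = (x < Simd4Float(2.0F));  // all-ones in every element here
//     Simd4Float sel  = selectByMask(x, mask);   // keeps x where mask is true
//
// where x, mask, and sel are illustrative names only.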
// The VSX load & store operations are a bit of a mess. The interface is different
// for xlc version 12, xlc version 13, and gcc. Long-term IBM recommends
// simply using pointer dereferencing both for aligned and unaligned loads.
// That's nice, but unfortunately xlc still bugs out when the pointer is
// not aligned. Sticking to vec_xl/vec_xst isn't a solution either, since
// that appears to be buggy for some _aligned_ loads :-)
//
// For now, we use pointer dereferencing for all aligned load/stores, and
// for unaligned ones with gcc. On xlc we use vec_xlw4/vec_xstw4 for
// unaligned memory operations. The latest docs recommend using the overloaded
// vec_xl/vec_xst, but that is not supported on xlc version 12. We'll
// revisit things once xlc is a bit more stable - for now you probably want
// to stick to gcc.
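// A minimal usage sketch of the two flavors (illustrative only; alignedBuf
// and unalignedPtr are hypothetical names, and alignedBuf must be 16-byte
// aligned):
//
//     alignas(16) float alignedBuf[4] = { 1.0F, 2.0F, 3.0F, 4.0F };
//     Simd4Float v = load4(alignedBuf);    // aligned load
//     store4(alignedBuf, v);               // aligned store
//     Simd4Float w = load4U(unalignedPtr); // unaligned load, any float*
//     store4U(unalignedPtr, w);            // unaligned store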
static inline Simd4Float gmx_simdcall load4(const float* m)
{
    return { *reinterpret_cast<const __vector float*>(m) };
}

static inline void gmx_simdcall store4(float* m, Simd4Float a)
{
    *reinterpret_cast<__vector float*>(m) = a.simdInternal_;
}
static inline Simd4Float gmx_simdcall load4U(const float* m)
{
    return {
#if __GNUC__ < 7
        *reinterpret_cast<const __vector float*>(m)
#else
        vec_xl(0, m)
#endif
    };
}

static inline void gmx_simdcall store4U(float* m, Simd4Float a)
{
#if __GNUC__ < 7
    *reinterpret_cast<__vector float*>(m) = a.simdInternal_;
#else
    vec_xst(a.simdInternal_, 0, m);
#endif
}
static inline Simd4Float gmx_simdcall simd4SetZeroF()
{
    return { vec_splats(0.0F) };
}
static inline Simd4Float gmx_simdcall operator&(Simd4Float a, Simd4Float b)
{
    return { vec_and(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall andNot(Simd4Float a, Simd4Float b)
{
    return { vec_andc(b.simdInternal_, a.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator|(Simd4Float a, Simd4Float b)
{
    return { vec_or(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator^(Simd4Float a, Simd4Float b)
{
    return { vec_xor(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator+(Simd4Float a, Simd4Float b)
{
    return { vec_add(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator-(Simd4Float a, Simd4Float b)
{
    return { vec_sub(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall operator-(Simd4Float x)
{
    return { -x.simdInternal_ };
}

static inline Simd4Float gmx_simdcall operator*(Simd4Float a, Simd4Float b)
{
    return { vec_mul(a.simdInternal_, b.simdInternal_) };
}
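// The four fused multiply-add variants below map one-to-one onto VSX
// intrinsics; elementwise they compute
//
//     fma(a, b, c)  =  a * b + c   (vec_madd)
//     fms(a, b, c)  =  a * b - c   (vec_msub)
//     fnma(a, b, c) = -a * b + c   (vec_nmsub)
//     fnms(a, b, c) = -a * b - c   (vec_nmadd)
//
// with a single rounding step for each element.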
static inline Simd4Float gmx_simdcall fma(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { vec_madd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}

static inline Simd4Float gmx_simdcall fms(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { vec_msub(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}

static inline Simd4Float gmx_simdcall fnma(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { vec_nmsub(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}

static inline Simd4Float gmx_simdcall fnms(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { vec_nmadd(a.simdInternal_, b.simdInternal_, c.simdInternal_) };
}
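// Note that vec_rsqrte only provides a hardware estimate of 1/sqrt(x);
// callers that need full single precision are expected to refine it, e.g.
// with the Newton-Raphson iterations in the higher-level GROMACS SIMD math
// routines.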
static inline Simd4Float gmx_simdcall rsqrt(Simd4Float x)
{
    return { vec_rsqrte(x.simdInternal_) };
}

static inline Simd4Float gmx_simdcall abs(Simd4Float x)
{
    return { vec_abs(x.simdInternal_) };
}

static inline Simd4Float gmx_simdcall max(Simd4Float a, Simd4Float b)
{
    return { vec_max(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall min(Simd4Float a, Simd4Float b)
{
    return { vec_min(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall round(Simd4Float x)
{
    return { vec_round(x.simdInternal_) };
}

static inline Simd4Float gmx_simdcall trunc(Simd4Float x)
{
    return { vec_trunc(x.simdInternal_) };
}
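// dotProduct() returns the dot product over the first three elements only,
// matching the common case of 3D coordinates held in 4-wide registers.
// perm1 swaps the two 64-bit halves of the elementwise product c, so after
// the first add element 0 holds c0 + c2; perm2 then moves product element
// c1 into position 0, so the final element 0 is c0 + c1 + c2 (c3 is never
// added in).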
static inline float gmx_simdcall dotProduct(Simd4Float a, Simd4Float b)
{
    const __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
    const __vector unsigned char perm2 = { 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 };
    __vector float               c     = vec_mul(a.simdInternal_, b.simdInternal_);
    __vector float               sum;
    sum = vec_add(c, vec_perm(c, c, perm1));
    sum = vec_add(sum, vec_perm(c, c, perm2));
    return vec_extract(sum, 0);
}
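// transpose() performs an in-place 4x4 transpose with merge operations:
// vec_mergeh/vec_mergel interleave the upper/lower halves of two registers.
// Viewing the four inputs as matrix rows,
//
//     v0 = (a0 a1 a2 a3)      t0 = mergeh(v0, v2) = (a0 c0 a1 c1)
//     v1 = (b0 b1 b2 b3)      t2 = mergeh(v1, v3) = (b0 d0 b1 d1)
//     v2 = (c0 c1 c2 c3)      mergeh(t0, t2)      = (a0 b0 c0 d0)
//     v3 = (d0 d1 d2 d3)
//
// which is the first output row; the other three rows follow analogously
// from the remaining merge combinations.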
static inline void gmx_simdcall transpose(Simd4Float* v0, Simd4Float* v1, Simd4Float* v2, Simd4Float* v3)
{
    __vector float t0 = vec_mergeh(v0->simdInternal_, v2->simdInternal_);
    __vector float t1 = vec_mergel(v0->simdInternal_, v2->simdInternal_);
    __vector float t2 = vec_mergeh(v1->simdInternal_, v3->simdInternal_);
    __vector float t3 = vec_mergel(v1->simdInternal_, v3->simdInternal_);
    v0->simdInternal_ = vec_mergeh(t0, t2);
    v1->simdInternal_ = vec_mergel(t0, t2);
    v2->simdInternal_ = vec_mergeh(t1, t3);
    v3->simdInternal_ = vec_mergel(t1, t3);
}
static inline Simd4FBool gmx_simdcall operator==(Simd4Float a, Simd4Float b)
{
    return { vec_cmpeq(a.simdInternal_, b.simdInternal_) };
}
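// VSX has no direct "not equal" comparison, so operator!= below is composed
// from greater-than and less-than. One consequence: if either operand is
// NaN, both sub-comparisons are false, so NaN elements yield false ("not
// different"), unlike the IEEE != relation, which is true for NaN.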
static inline Simd4FBool gmx_simdcall operator!=(Simd4Float a, Simd4Float b)
{
    return { vec_or(vec_cmpgt(a.simdInternal_, b.simdInternal_),
                    vec_cmplt(a.simdInternal_, b.simdInternal_)) };
}

static inline Simd4FBool gmx_simdcall operator<(Simd4Float a, Simd4Float b)
{
    return { vec_cmplt(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4FBool gmx_simdcall operator<=(Simd4Float a, Simd4Float b)
{
    return { vec_cmple(a.simdInternal_, b.simdInternal_) };
}
static inline Simd4FBool gmx_simdcall operator&&(Simd4FBool a, Simd4FBool b)
{
    return { vec_and(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4FBool gmx_simdcall operator||(Simd4FBool a, Simd4FBool b)
{
    return { vec_or(a.simdInternal_, b.simdInternal_) };
}

static inline bool gmx_simdcall anyTrue(Simd4FBool a)
{
    return vec_any_ne(a.simdInternal_, reinterpret_cast<__vector vsxBool int>(vec_splats(0)));
}

static inline Simd4Float gmx_simdcall selectByMask(Simd4Float a, Simd4FBool m)
{
    return { vec_and(a.simdInternal_, reinterpret_cast<__vector float>(m.simdInternal_)) };
}

static inline Simd4Float gmx_simdcall selectByNotMask(Simd4Float a, Simd4FBool m)
{
    return { vec_andc(a.simdInternal_, reinterpret_cast<__vector float>(m.simdInternal_)) };
}

static inline Simd4Float gmx_simdcall blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
{
    return { vec_sel(a.simdInternal_, b.simdInternal_, sel.simdInternal_) };
}
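// reduce() uses the same two permutations as dotProduct() above, but the
// second add folds the already-summed vector rather than the raw input:
// after the first add element 0 holds x0 + x2 and element 1 holds x1 + x3,
// and the second add brings element 1 into position 0, giving the full sum
// x0 + x1 + x2 + x3 in element 0.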
static inline float gmx_simdcall reduce(Simd4Float x)
{
    const __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
    const __vector unsigned char perm2 = { 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 };

    x.simdInternal_ = vec_add(x.simdInternal_, vec_perm(x.simdInternal_, x.simdInternal_, perm1));
    x.simdInternal_ = vec_add(x.simdInternal_, vec_perm(x.simdInternal_, x.simdInternal_, perm2));
    return vec_extract(x.simdInternal_, 0);
}
} // namespace gmx

#endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H