/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H
#define GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H

#include "config.h"

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vsx_definitions.h"
#include "impl_ibm_vsx_simd_float.h"

namespace gmx
{

class Simd4Float
{
    public:
        Simd4Float() {}

        // gcc-4.9 does not recognize that we use the parameter
        Simd4Float(float gmx_unused f) : simdInternal_(vec_splats(f)) {}

        // Internal utility constructor to simplify return statements
        Simd4Float(__vector float simd) : simdInternal_(simd) {}

        __vector float simdInternal_;
};

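// Simd4FBool stores one 32-bit element per float lane, with all bits set
// (0xFFFFFFFF) for true and all bits cleared for false, matching the mask
// layout produced by the VSX comparison intrinsics used further below.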
class Simd4FBool
{
    public:
        Simd4FBool() {}

        //! \brief Construct from scalar bool
        Simd4FBool(bool b) : simdInternal_(reinterpret_cast<__vector vsxBool int>(vec_splats( b ? 0xFFFFFFFF : 0))) {}

        // Internal utility constructor to simplify return statements
        Simd4FBool(__vector vsxBool int simd) : simdInternal_(simd) {}

        __vector vsxBool int simdInternal_;
};

// The VSX load & store operations are a bit of a mess. The interface is different
// for xlc version 12, xlc version 13, and gcc. Long-term, IBM recommends
// simply using pointer dereferencing both for aligned and unaligned loads.
// That's nice, but unfortunately xlc still bugs out when the pointer is
// not aligned. Sticking to vec_xl/vec_xst isn't a solution either, since
// that appears to be buggy for some _aligned_ loads :-)
//
// For now, we use pointer dereferencing for all aligned load/stores, and
// for unaligned ones with gcc. On xlc we use vec_xl/vec_xst for
// unaligned memory operations, as the latest docs recommend; note that
// these overloads are not supported on xlc version 12. We'll
// revisit things once xlc is a bit more stable - for now you probably want
// to use gcc.

static inline Simd4Float gmx_simdcall
load4(const float *m)
{
    return {
        *reinterpret_cast<const __vector float *>(m)
    };
}

static inline void gmx_simdcall
store4(float *m, Simd4Float a)
{
    *reinterpret_cast<__vector float *>(m) = a.simdInternal_;
}

static inline Simd4Float gmx_simdcall
load4U(const float *m)
{
    return {
#if defined(__GNUC__) && !defined(__ibmxl__) && !defined(__xlC__)
        *reinterpret_cast<const __vector float *>(m)
#else
        vec_xl(0, m)
#endif
    };
}

static inline void gmx_simdcall
store4U(float *m, Simd4Float a)
{
#if defined(__GNUC__) && !defined(__ibmxl__) && !defined(__xlC__)
    *reinterpret_cast<__vector float *>(m) = a.simdInternal_;
#else
    vec_xst(a.simdInternal_, 0, m);
#endif
}

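// A minimal usage sketch of the load/store flavors above (illustrative
// only; the buffers here are hypothetical, not part of this header):
//
//     alignas(16) float a4[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
//     float             u4[4];
//     Simd4Float        v = load4(a4); // pointer must be 16-byte aligned
//     store4U(u4, v + v);              // unaligned store of element-wise 2*v
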
static inline Simd4Float gmx_simdcall
simd4SetZeroF()
{
    return {
        vec_splats(0.0f)
    };
}

static inline Simd4Float gmx_simdcall
operator&(Simd4Float a, Simd4Float b)
{
    return {
        vec_and(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
andNot(Simd4Float a, Simd4Float b)
{
    return {
        vec_andc(b.simdInternal_, a.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
operator|(Simd4Float a, Simd4Float b)
{
    return {
        vec_or(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
operator^(Simd4Float a, Simd4Float b)
{
    return {
        vec_xor(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
operator+(Simd4Float a, Simd4Float b)
{
    return {
        vec_add(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
operator-(Simd4Float a, Simd4Float b)
{
    return {
        vec_sub(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
operator-(Simd4Float x)
{
    return {
        -x.simdInternal_
    };
}

static inline Simd4Float gmx_simdcall
operator*(Simd4Float a, Simd4Float b)
{
    return {
        vec_mul(a.simdInternal_, b.simdInternal_)
    };
}

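// The four fused-multiply variants below each map onto a single VSX
// intrinsic; in scalar terms they compute
//   fma(a, b, c)  =  a * b + c   (vec_madd)
//   fms(a, b, c)  =  a * b - c   (vec_msub)
//   fnma(a, b, c) = -a * b + c   (vec_nmsub)
//   fnms(a, b, c) = -a * b - c   (vec_nmadd)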
static inline Simd4Float gmx_simdcall
fma(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return {
        vec_madd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
fms(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return {
        vec_msub(a.simdInternal_, b.simdInternal_, c.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
fnma(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return {
        vec_nmsub(a.simdInternal_, b.simdInternal_, c.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
fnms(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return {
        vec_nmadd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
    };
}

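// Note that rsqrt() only returns the hardware 1/sqrt(x) estimate from
// vec_rsqrte; callers needing full single precision are expected to apply
// Newton-Raphson refinement on top of this result.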
static inline Simd4Float gmx_simdcall
rsqrt(Simd4Float x)
{
    return {
        vec_rsqrte(x.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
abs(Simd4Float x)
{
    return {
        vec_abs(x.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
max(Simd4Float a, Simd4Float b)
{
    return {
        vec_max(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
min(Simd4Float a, Simd4Float b)
{
    return {
        vec_min(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
round(Simd4Float x)
{
    return {
        vec_round(x.simdInternal_)
    };
}

static inline Simd4Float gmx_simdcall
trunc(Simd4Float x)
{
    return {
        vec_trunc(x.simdInternal_)
    };
}

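// dotProduct() returns the dot product of the first three elements only
// (the fourth is ignored), which is what the SIMD4 layer needs for 3D
// coordinates. In scalar terms, with c_i = a_i * b_i: perm1 swaps the two
// 64-bit halves, so the first add leaves c0+c2 in lane 0; perm2 then adds
// c1, leaving c0+c1+c2 in lane 0 for vec_extract.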
static inline float gmx_simdcall
dotProduct(Simd4Float a, Simd4Float b)
{
    const __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
    const __vector unsigned char perm2 = { 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 };
    __vector float               c     = vec_mul(a.simdInternal_, b.simdInternal_);
    __vector float               sum;

    sum = vec_add(c, vec_perm(c, c, perm1));
    sum = vec_add(sum, vec_perm(c, c, perm2));
    return vec_extract(sum, 0);
}

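// transpose() is the usual two-pass 4x4 transpose: the first pass of
// vec_mergeh/vec_mergel interleaves rows 0/2 and 1/3, e.g.
//   t0 = [ v0_0, v2_0, v0_1, v2_1 ]
// and the second pass interleaves the temporaries, so that afterwards
//   v0 = [ v0_0, v1_0, v2_0, v3_0 ], and so on for v1..v3.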
static inline void gmx_simdcall
transpose(Simd4Float * v0, Simd4Float * v1,
          Simd4Float * v2, Simd4Float * v3)
{
    __vector float t0 = vec_mergeh(v0->simdInternal_, v2->simdInternal_);
    __vector float t1 = vec_mergel(v0->simdInternal_, v2->simdInternal_);
    __vector float t2 = vec_mergeh(v1->simdInternal_, v3->simdInternal_);
    __vector float t3 = vec_mergel(v1->simdInternal_, v3->simdInternal_);
    v0->simdInternal_ = vec_mergeh(t0, t2);
    v1->simdInternal_ = vec_mergel(t0, t2);
    v2->simdInternal_ = vec_mergeh(t1, t3);
    v3->simdInternal_ = vec_mergel(t1, t3);
}

static inline Simd4FBool gmx_simdcall
operator==(Simd4Float a, Simd4Float b)
{
    return {
        vec_cmpeq(a.simdInternal_, b.simdInternal_)
    };
}

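// VSX has no single not-equal comparison, so operator!= is composed from
// vec_cmpgt and vec_cmplt. Both ordered comparisons return false when
// either operand is NaN, so NaN lanes yield false here (whereas a strict
// IEEE != would yield true).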
static inline Simd4FBool gmx_simdcall
operator!=(Simd4Float a, Simd4Float b)
{
    return {
        vec_or(vec_cmpgt(a.simdInternal_, b.simdInternal_),
               vec_cmplt(a.simdInternal_, b.simdInternal_))
    };
}

static inline Simd4FBool gmx_simdcall
operator<(Simd4Float a, Simd4Float b)
{
    return {
        vec_cmplt(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4FBool gmx_simdcall
operator<=(Simd4Float a, Simd4Float b)
{
    return {
        vec_cmple(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4FBool gmx_simdcall
operator&&(Simd4FBool a, Simd4FBool b)
{
    return {
        vec_and(a.simdInternal_, b.simdInternal_)
    };
}

static inline Simd4FBool gmx_simdcall
operator||(Simd4FBool a, Simd4FBool b)
{
    return {
        vec_or(a.simdInternal_, b.simdInternal_)
    };
}

static inline bool gmx_simdcall
anyTrue(Simd4FBool a)
{
    return vec_any_ne(a.simdInternal_, reinterpret_cast<__vector vsxBool int>(vec_splats(0)));
}

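// The selection helpers below are plain bitwise operations on the mask:
// selectByMask(a, m) keeps elements of a where m is true and zeroes the
// rest, selectByNotMask(a, m) does the opposite, and blend(a, b, sel)
// picks b where sel is true and a elsewhere via vec_sel.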
static inline Simd4Float gmx_simdcall
selectByMask(Simd4Float a, Simd4FBool m)
{
    return {
        vec_and(a.simdInternal_, reinterpret_cast<__vector float>(m.simdInternal_))
    };
}

static inline Simd4Float gmx_simdcall
selectByNotMask(Simd4Float a, Simd4FBool m)
{
    return {
        vec_andc(a.simdInternal_, reinterpret_cast<__vector float>(m.simdInternal_))
    };
}

static inline Simd4Float gmx_simdcall
blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
{
    return {
        vec_sel(a.simdInternal_, b.simdInternal_, sel.simdInternal_)
    };
}

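// Unlike dotProduct() above, reduce() sums all four elements: after the
// first add each lane holds x0+x2 or x1+x3, and because the second
// permutation is applied to those partial sums, lane 0 ends up with
// x0+x1+x2+x3 before the extract.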
static inline float gmx_simdcall
reduce(Simd4Float x)
{
    const __vector unsigned char perm1 = { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
    const __vector unsigned char perm2 = { 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3 };

    x.simdInternal_ = vec_add(x.simdInternal_, vec_perm(x.simdInternal_, x.simdInternal_, perm1));
    x.simdInternal_ = vec_add(x.simdInternal_, vec_perm(x.simdInternal_, x.simdInternal_, perm2));
    return vec_extract(x.simdInternal_, 0);
}

}      // namespace gmx

#endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_SIMD4_FLOAT_H