/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_SSE2_SIMD4_FLOAT_H
#define GMX_SIMD_IMPL_X86_SSE2_SIMD4_FLOAT_H

#include "config.h"

#include <cassert>
#include <cstddef>

#include <emmintrin.h>

namespace gmx
{

class Simd4Float
{
    public:
        Simd4Float() {}

        //! \brief Construct from scalar
        Simd4Float(float f) : simdInternal_(_mm_set1_ps(f)) {}

        // Internal utility constructor to simplify return statements
        Simd4Float(__m128 simd) : simdInternal_(simd) {}

        __m128  simdInternal_;
};

class Simd4FBool
{
    public:
        Simd4FBool() {}

        //! \brief Construct from scalar bool
        Simd4FBool(bool b) : simdInternal_(_mm_castsi128_ps(_mm_set1_epi32(b ? 0xFFFFFFFF : 0))) {}

        // Internal utility constructor to simplify return statements
        Simd4FBool(__m128 simd) : simdInternal_(simd) {}

        __m128  simdInternal_;
};

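/* Note: SSE2 has no dedicated boolean vector type, so Simd4FBool stores a full
 * 128-bit mask in which each 32-bit lane is either all ones (true) or all
 * zeros (false). Illustrative sketch, not part of the original header:
 *
 *     Simd4FBool allTrue(true);    // every lane is 0xFFFFFFFF
 *     Simd4FBool allFalse(false);  // every lane is 0x00000000
 *
 * Such masks are consumed by selectByMask()/selectByNotMask()/blend() below.
 */
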
static inline Simd4Float gmx_simdcall
load4(const float *m)
{
    assert(size_t(m) % 16 == 0);
    return { _mm_load_ps(m) };
}

static inline void gmx_simdcall
store4(float *m, Simd4Float a)
{
    assert(size_t(m) % 16 == 0);
    _mm_store_ps(m, a.simdInternal_);
}

static inline Simd4Float gmx_simdcall
load4U(const float *m)
{
    return { _mm_loadu_ps(m) };
}

static inline void gmx_simdcall
store4U(float *m, Simd4Float a)
{
    _mm_storeu_ps(m, a.simdInternal_);
}

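/* Usage sketch (illustration only): load4()/store4() require 16-byte aligned
 * pointers, which the asserts above check in debug builds, while
 * load4U()/store4U() accept any alignment at a potential performance cost.
 *
 *     alignas(16) float buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
 *     Simd4Float        v      = load4(buf);   // aligned load
 *     store4U(buf, v);                         // unaligned store also works here
 */
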
static inline Simd4Float gmx_simdcall
simd4SetZeroF()
{
    return { _mm_setzero_ps() };
}

static inline Simd4Float gmx_simdcall
operator&(Simd4Float a, Simd4Float b)
{
    return { _mm_and_ps(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall
andNot(Simd4Float a, Simd4Float b)
{
    return { _mm_andnot_ps(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall
operator|(Simd4Float a, Simd4Float b)
{
    return { _mm_or_ps(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall
operator^(Simd4Float a, Simd4Float b)
{
    return { _mm_xor_ps(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall
operator+(Simd4Float a, Simd4Float b)
{
    return { _mm_add_ps(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall
operator-(Simd4Float a, Simd4Float b)
{
    return { _mm_sub_ps(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall
operator-(Simd4Float x)
{
    return { _mm_xor_ps(x.simdInternal_, _mm_set1_ps(GMX_FLOAT_NEGZERO)) };
}

static inline Simd4Float gmx_simdcall
operator*(Simd4Float a, Simd4Float b)
{
    return { _mm_mul_ps(a.simdInternal_, b.simdInternal_) };
}

// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline Simd4Float gmx_simdcall
fma(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { _mm_add_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), c.simdInternal_) };
}

static inline Simd4Float gmx_simdcall
fms(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { _mm_sub_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), c.simdInternal_) };
}

static inline Simd4Float gmx_simdcall
fnma(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { _mm_sub_ps(c.simdInternal_, _mm_mul_ps(a.simdInternal_, b.simdInternal_)) };
}

static inline Simd4Float gmx_simdcall
fnms(Simd4Float a, Simd4Float b, Simd4Float c)
{
    return { _mm_sub_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), c.simdInternal_)) };
}
#endif

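/* SSE2 and SSE4.1 provide no fused multiply-add instruction, so the four
 * functions above emulate the fused forms with a separate multiply and
 * add/subtract; the product is rounded before the addition, unlike a true FMA.
 * Sketch of the correspondence (illustration only):
 *
 *     fma(a, b, c)  ==  a*b + c
 *     fms(a, b, c)  ==  a*b - c
 *     fnma(a, b, c) == -a*b + c
 *     fnms(a, b, c) == -a*b - c
 */
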
static inline Simd4Float gmx_simdcall
rsqrt(Simd4Float x)
{
    return { _mm_rsqrt_ps(x.simdInternal_) };
}

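/* _mm_rsqrt_ps only returns an approximation (roughly 12 bits of precision),
 * so callers that need full single precision are expected to refine the
 * result, e.g. with a Newton-Raphson step. Sketch under that assumption,
 * not part of this header:
 *
 *     // lu' = lu * (1.5 - 0.5 * x * lu * lu)
 *     Simd4Float lu = rsqrt(x);
 *     lu = lu * (Simd4Float(1.5f) - Simd4Float(0.5f) * x * lu * lu);
 */
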
static inline Simd4Float gmx_simdcall
abs(Simd4Float x)
{
    return { _mm_andnot_ps(_mm_set1_ps(GMX_FLOAT_NEGZERO), x.simdInternal_) };
}

static inline Simd4Float gmx_simdcall
max(Simd4Float a, Simd4Float b)
{
    return { _mm_max_ps(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4Float gmx_simdcall
min(Simd4Float a, Simd4Float b)
{
    return { _mm_min_ps(a.simdInternal_, b.simdInternal_) };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline Simd4Float gmx_simdcall
round(Simd4Float x)
{
    return { _mm_cvtepi32_ps(_mm_cvtps_epi32(x.simdInternal_)) };
}

static inline Simd4Float gmx_simdcall
trunc(Simd4Float x)
{
    return { _mm_cvtepi32_ps(_mm_cvttps_epi32(x.simdInternal_)) };
}
#endif

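/* Without SSE4.1's _mm_round_ps, round() and trunc() above go through a
 * conversion to 32-bit integers and back: _mm_cvtps_epi32 rounds to nearest
 * and _mm_cvttps_epi32 truncates toward zero. This round trip is only exact
 * for values whose magnitude fits in a 32-bit integer.
 */
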
static inline float gmx_simdcall
dotProduct(Simd4Float a, Simd4Float b)
{
    __m128 c, d;
    c = _mm_mul_ps(a.simdInternal_, b.simdInternal_);
    d = _mm_add_ps(c, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 1, 2, 1)));
    d = _mm_add_ps(d, _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 2, 3, 2)));
    return *reinterpret_cast<float *>(&d);
}

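/* The two shuffle/add steps above accumulate c0 + c1 + c2 into the lowest
 * lane, so dotProduct() returns the dot product of the first three elements
 * only; the fourth lane is ignored, as is common when a SIMD4 register holds
 * a 3D vector. Illustration:
 *
 *     Simd4Float a(1.0f), b(2.0f);
 *     float      d = dotProduct(a, b);   // 2 + 2 + 2 = 6
 */
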
static inline void gmx_simdcall
transpose(Simd4Float * v0, Simd4Float * v1,
          Simd4Float * v2, Simd4Float * v3)
{
    _MM_TRANSPOSE4_PS(v0->simdInternal_, v1->simdInternal_, v2->simdInternal_, v3->simdInternal_);
}

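/* _MM_TRANSPOSE4_PS transposes the 4x4 matrix formed by the four registers in
 * place: after the call v0 holds the former first elements of v0..v3, v1 the
 * former second elements, and so on. Usage sketch (illustration only):
 *
 *     Simd4Float r0, r1, r2, r3;        // four rows
 *     transpose(&r0, &r1, &r2, &r3);    // rows become columns
 */
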
static inline Simd4FBool gmx_simdcall
operator==(Simd4Float a, Simd4Float b)
{
    return { _mm_cmpeq_ps(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4FBool gmx_simdcall
operator!=(Simd4Float a, Simd4Float b)
{
    return { _mm_cmpneq_ps(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4FBool gmx_simdcall
operator<(Simd4Float a, Simd4Float b)
{
    return { _mm_cmplt_ps(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4FBool gmx_simdcall
operator<=(Simd4Float a, Simd4Float b)
{
    return { _mm_cmple_ps(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4FBool gmx_simdcall
operator&&(Simd4FBool a, Simd4FBool b)
{
    return { _mm_and_ps(a.simdInternal_, b.simdInternal_) };
}

static inline Simd4FBool gmx_simdcall
operator||(Simd4FBool a, Simd4FBool b)
{
    return { _mm_or_ps(a.simdInternal_, b.simdInternal_) };
}

static inline bool gmx_simdcall
anyTrue(Simd4FBool a) { return _mm_movemask_ps(a.simdInternal_) != 0; }

static inline Simd4Float gmx_simdcall
selectByMask(Simd4Float a, Simd4FBool mask)
{
    return { _mm_and_ps(a.simdInternal_, mask.simdInternal_) };
}

static inline Simd4Float gmx_simdcall
selectByNotMask(Simd4Float a, Simd4FBool mask)
{
    return { _mm_andnot_ps(mask.simdInternal_, a.simdInternal_) };
}

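/* selectByMask() keeps the lanes of a where the mask is true and zeroes the
 * rest; selectByNotMask() does the opposite. Sketch (illustration only):
 *
 *     Simd4Float x(1.0f), y(2.0f);
 *     Simd4FBool m   = (x < y);               // all lanes true here
 *     Simd4Float sel = selectByMask(x, m);    // == x
 *     Simd4Float rej = selectByNotMask(x, m); // == 0
 */
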
// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline Simd4Float gmx_simdcall
blend(Simd4Float a, Simd4Float b, Simd4FBool sel)
{
    return { _mm_or_ps(_mm_andnot_ps(sel.simdInternal_, a.simdInternal_), _mm_and_ps(sel.simdInternal_, b.simdInternal_)) };
}
#endif

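/* Plain SSE2 lacks the variable blend instruction added in SSE4.1
 * (_mm_blendv_ps), so blend() above builds the result bitwise:
 * (a AND NOT sel) OR (b AND sel), i.e. lanes where sel is true come from b and
 * the remaining lanes from a. This relies on Simd4FBool lanes being either all
 * ones or all zeros, as guaranteed by the constructors and comparisons above.
 */
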
// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline float gmx_simdcall
reduce(Simd4Float a)
{
    __m128 b;
    b = _mm_add_ps(a.simdInternal_, _mm_shuffle_ps(a.simdInternal_, a.simdInternal_, _MM_SHUFFLE(1, 0, 3, 2)));
    b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 3, 2, 1)));
    return *reinterpret_cast<float *>(&b);
}
#endif

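/* reduce() performs a horizontal sum: the first shuffle/add folds the upper
 * half onto the lower half (giving a0+a2 and a1+a3), and the _mm_add_ss step
 * adds the two partial sums in the lowest lane, which is returned as a scalar.
 * Usage sketch (illustration only):
 *
 *     Simd4Float v(1.0f);
 *     float      s = reduce(v);   // 1 + 1 + 1 + 1 = 4
 */
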
}      // namespace gmx

#endif // GMX_SIMD_IMPL_X86_SSE2_SIMD4_FLOAT_H