/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_SSE2_SIMD_FLOAT_H
#define GMX_SIMD_IMPL_X86_SSE2_SIMD_FLOAT_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <emmintrin.h>

#include "gromacs/math/utilities.h"

namespace gmx
{
class SimdFloat
{
    public:
        SimdFloat() {}

        SimdFloat(float f) : simdInternal_(_mm_set1_ps(f)) {}

        // Internal utility constructor to simplify return statements
        SimdFloat(__m128 simd) : simdInternal_(simd) {}

        __m128  simdInternal_;
};
class SimdFInt32
{
    public:
        SimdFInt32() {}

        SimdFInt32(std::int32_t i) : simdInternal_(_mm_set1_epi32(i)) {}

        // Internal utility constructor to simplify return statements
        SimdFInt32(__m128i simd) : simdInternal_(simd) {}

        __m128i  simdInternal_;
};
class SimdFBool
{
    public:
        SimdFBool() {}

        SimdFBool(bool b) : simdInternal_(_mm_castsi128_ps(_mm_set1_epi32( b ? 0xFFFFFFFF : 0))) {}

        // Internal utility constructor to simplify return statements
        SimdFBool(__m128 simd) : simdInternal_(simd) {}

        __m128  simdInternal_;
};
class SimdFIBool
{
    public:
        SimdFIBool() {}

        SimdFIBool(bool b) : simdInternal_(_mm_set1_epi32( b ? 0xFFFFFFFF : 0)) {}

        // Internal utility constructor to simplify return statements
        SimdFIBool(__m128i simd) : simdInternal_(simd) {}

        __m128i  simdInternal_;
};
static inline SimdFloat gmx_simdcall
simdLoad(const float *m, SimdFloatTag = {})
{
    assert(std::size_t(m) % 16 == 0);
    return {
               _mm_load_ps(m)
    };
}
static inline void gmx_simdcall
store(float *m, SimdFloat a)
{
    assert(std::size_t(m) % 16 == 0);
    _mm_store_ps(m, a.simdInternal_);
}
static inline SimdFloat gmx_simdcall
simdLoadU(const float *m, SimdFloatTag = {})
{
    return {
               _mm_loadu_ps(m)
    };
}
static inline void gmx_simdcall
storeU(float *m, SimdFloat a) { _mm_storeu_ps(m, a.simdInternal_); }
static inline SimdFloat gmx_simdcall
setZeroF()
{
    return {
               _mm_setzero_ps()
    };
}
static inline SimdFInt32 gmx_simdcall
simdLoad(const std::int32_t * m, SimdFInt32Tag)
{
    assert(std::size_t(m) % 16 == 0);
    return {
               _mm_load_si128(reinterpret_cast<const __m128i *>(m))
    };
}
static inline void gmx_simdcall
store(std::int32_t * m, SimdFInt32 a)
{
    assert(std::size_t(m) % 16 == 0);
    _mm_store_si128(reinterpret_cast<__m128i *>(m), a.simdInternal_);
}
static inline SimdFInt32 gmx_simdcall
simdLoadU(const std::int32_t *m, SimdFInt32Tag)
{
    return {
               _mm_loadu_si128(reinterpret_cast<const __m128i *>(m))
    };
}
static inline void gmx_simdcall
storeU(std::int32_t * m, SimdFInt32 a)
{
    _mm_storeu_si128(reinterpret_cast<__m128i *>(m), a.simdInternal_);
}
static inline SimdFInt32 gmx_simdcall
setZeroFI()
{
    return {
               _mm_setzero_si128()
    };
}
// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
template<int index>
static inline std::int32_t gmx_simdcall
extract(SimdFInt32 a)
{
    return _mm_cvtsi128_si32( _mm_srli_si128(a.simdInternal_, 4 * index) );
}
#endif
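// Usage sketch (illustrative): extract<2>(v) returns lane 2 of v. The byte
// shift moves the requested 32-bit element down to lane 0, where
// _mm_cvtsi128_si32 can read it; index must be a compile-time constant
// because _mm_srli_si128 takes an immediate operand.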
static inline SimdFloat gmx_simdcall
operator&(SimdFloat a, SimdFloat b)
{
    return {
               _mm_and_ps(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFloat gmx_simdcall
andNot(SimdFloat a, SimdFloat b)
{
    return {
               _mm_andnot_ps(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFloat gmx_simdcall
operator|(SimdFloat a, SimdFloat b)
{
    return {
               _mm_or_ps(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFloat gmx_simdcall
operator^(SimdFloat a, SimdFloat b)
{
    return {
               _mm_xor_ps(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFloat gmx_simdcall
operator+(SimdFloat a, SimdFloat b)
{
    return {
               _mm_add_ps(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFloat gmx_simdcall
operator-(SimdFloat a, SimdFloat b)
{
    return {
               _mm_sub_ps(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFloat gmx_simdcall
operator-(SimdFloat x)
{
    return {
               _mm_xor_ps(x.simdInternal_, _mm_set1_ps(GMX_FLOAT_NEGZERO))
    };
}

static inline SimdFloat gmx_simdcall
operator*(SimdFloat a, SimdFloat b)
{
    return {
               _mm_mul_ps(a.simdInternal_, b.simdInternal_)
    };
}
// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline SimdFloat gmx_simdcall
fma(SimdFloat a, SimdFloat b, SimdFloat c)
{
    return {
               _mm_add_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), c.simdInternal_)
    };
}

static inline SimdFloat gmx_simdcall
fms(SimdFloat a, SimdFloat b, SimdFloat c)
{
    return {
               _mm_sub_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), c.simdInternal_)
    };
}

static inline SimdFloat gmx_simdcall
fnma(SimdFloat a, SimdFloat b, SimdFloat c)
{
    return {
               _mm_sub_ps(c.simdInternal_, _mm_mul_ps(a.simdInternal_, b.simdInternal_))
    };
}

static inline SimdFloat gmx_simdcall
fnms(SimdFloat a, SimdFloat b, SimdFloat c)
{
    return {
               _mm_sub_ps(_mm_setzero_ps(), _mm_add_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), c.simdInternal_))
    };
}
#endif
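// Semantics of the four variants above, lane by lane:
//   fma(a, b, c)  =  a*b + c
//   fms(a, b, c)  =  a*b - c
//   fnma(a, b, c) = -a*b + c
//   fnms(a, b, c) = -a*b - c
// SSE2 has no fused instruction, so each is emulated with a separate
// multiply and add/subtract (two roundings instead of one).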
static inline SimdFloat gmx_simdcall
rsqrt(SimdFloat x)
{
    return {
               _mm_rsqrt_ps(x.simdInternal_)
    };
}
static inline SimdFloat gmx_simdcall
rcp(SimdFloat x)
{
    return {
               _mm_rcp_ps(x.simdInternal_)
    };
}
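// Note: _mm_rsqrt_ps and _mm_rcp_ps are fast approximations (documented
// maximum relative error of 1.5*2^-12 on Intel hardware), not full-precision
// results; callers are expected to add Newton-Raphson iterations when more
// accuracy is needed.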
static inline SimdFloat gmx_simdcall
maskAdd(SimdFloat a, SimdFloat b, SimdFBool m)
{
    return {
               _mm_add_ps(a.simdInternal_, _mm_and_ps(b.simdInternal_, m.simdInternal_))
    };
}
static inline SimdFloat gmx_simdcall
maskzMul(SimdFloat a, SimdFloat b, SimdFBool m)
{
    return {
               _mm_and_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), m.simdInternal_)
    };
}

static inline SimdFloat gmx_simdcall
maskzFma(SimdFloat a, SimdFloat b, SimdFloat c, SimdFBool m)
{
    return {
               _mm_and_ps(_mm_add_ps(_mm_mul_ps(a.simdInternal_, b.simdInternal_), c.simdInternal_), m.simdInternal_)
    };
}
// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdFloat gmx_simdcall
maskzRsqrt(SimdFloat x, SimdFBool m)
{
    // Set masked-out elements to 1.0f so the approximation gets a valid input
    x.simdInternal_ = _mm_or_ps(_mm_andnot_ps(m.simdInternal_, _mm_set1_ps(1.0f)), _mm_and_ps(m.simdInternal_, x.simdInternal_));
    return {
               _mm_and_ps(_mm_rsqrt_ps(x.simdInternal_), m.simdInternal_)
    };
}

static inline SimdFloat gmx_simdcall
maskzRcp(SimdFloat x, SimdFBool m)
{
    // Set masked-out elements to 1.0f so the approximation gets a valid input
    x.simdInternal_ = _mm_or_ps(_mm_andnot_ps(m.simdInternal_, _mm_set1_ps(1.0f)), _mm_and_ps(m.simdInternal_, x.simdInternal_));
    return {
               _mm_and_ps(_mm_rcp_ps(x.simdInternal_), m.simdInternal_)
    };
}
#endif
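// The pre-blend in maskzRsqrt/maskzRcp substitutes 1.0f into the inactive
// lanes before the approximation runs, so masked-out entries (which may be
// zero or negative) never produce inf/NaN intermediates; the final
// _mm_and_ps then zeroes those lanes in the result.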
static inline SimdFloat gmx_simdcall
abs(SimdFloat x)
{
    return {
               _mm_andnot_ps( _mm_set1_ps(GMX_FLOAT_NEGZERO), x.simdInternal_ )
    };
}
static inline SimdFloat gmx_simdcall
max(SimdFloat a, SimdFloat b)
{
    return {
               _mm_max_ps(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFloat gmx_simdcall
min(SimdFloat a, SimdFloat b)
{
    return {
               _mm_min_ps(a.simdInternal_, b.simdInternal_)
    };
}
// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdFloat gmx_simdcall
round(SimdFloat x)
{
    return {
               _mm_cvtepi32_ps( _mm_cvtps_epi32(x.simdInternal_) )
    };
}

static inline SimdFloat gmx_simdcall
trunc(SimdFloat x)
{
    return {
               _mm_cvtepi32_ps( _mm_cvttps_epi32(x.simdInternal_) )
    };
}
#endif
static inline SimdFloat gmx_simdcall
frexp(SimdFloat value, SimdFInt32 * exponent)
{
    const __m128  exponentMask = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
    const __m128  mantissaMask = _mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF));
    const __m128i exponentBias = _mm_set1_epi32(126); // add 1 to make our definition identical to frexp()
    const __m128  half         = _mm_set1_ps(0.5f);
    __m128i       iExponent;

    iExponent               = _mm_castps_si128(_mm_and_ps(value.simdInternal_, exponentMask));
    iExponent               = _mm_sub_epi32(_mm_srli_epi32(iExponent, 23), exponentBias);
    exponent->simdInternal_ = iExponent;

    return {
               _mm_or_ps( _mm_and_ps(value.simdInternal_, mantissaMask), half)
    };
}
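// Worked example, assuming one lane holds 8.0f (bits 0x41000000): the raw
// biased exponent is 130, so 130 - 126 = 4 is stored, and masking the
// exponent away while OR-ing in 0.5 yields the fraction 0.5f.
// Check: 0.5 * 2^4 = 8.0, matching std::frexp(8.0f, &e).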
// Override for SSE4.1
#if GMX_SIMD_X86_SSE2
template <MathOptimization opt = MathOptimization::Safe>
static inline SimdFloat gmx_simdcall
ldexp(SimdFloat value, SimdFInt32 exponent)
{
    const __m128i exponentBias = _mm_set1_epi32(127);
    __m128i       iExponent;

    iExponent = _mm_add_epi32(exponent.simdInternal_, exponentBias);

    if (opt == MathOptimization::Safe)
    {
        // Make sure biased argument is not negative
        iExponent = _mm_and_si128(iExponent, _mm_cmpgt_epi32(iExponent, _mm_setzero_si128()));
    }

    iExponent = _mm_slli_epi32( iExponent, 23);

    return {
               _mm_mul_ps(value.simdInternal_, _mm_castsi128_ps(iExponent))
    };
}
#endif
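// The shift above builds the float 2^exponent directly in the exponent bit
// field (bits 23-30), so the final multiply computes value * 2^exponent. In
// Safe mode a biased exponent that would be negative or zero is clamped to an
// all-zero pattern, so the product flushes to 0.0 instead of producing a
// garbage sign/mantissa.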
// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline float gmx_simdcall
reduce(SimdFloat a)
{
    // Shuffle has latency 1/throughput 1, followed by add with latency 3, t-put 1.
    // This is likely faster than using _mm_hadd_ps, which has latency 5, t-put 2.
    a.simdInternal_ = _mm_add_ps(a.simdInternal_, _mm_shuffle_ps(a.simdInternal_, a.simdInternal_, _MM_SHUFFLE(1, 0, 3, 2)));
    a.simdInternal_ = _mm_add_ss(a.simdInternal_, _mm_shuffle_ps(a.simdInternal_, a.simdInternal_, _MM_SHUFFLE(0, 3, 2, 1)));
    return *reinterpret_cast<float *>(&a);
}
#endif
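// Lane walkthrough: the first shuffle swaps the 64-bit halves, so the add
// produces [a0+a2, a1+a3, a2+a0, a3+a1]; the second shuffle rotates lanes by
// one and _mm_add_ss adds only lane 0, leaving (a0+a2)+(a1+a3) there for the
// scalar read-out.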
static inline SimdFBool gmx_simdcall
operator==(SimdFloat a, SimdFloat b)
{
    return {
               _mm_cmpeq_ps(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFBool gmx_simdcall
operator!=(SimdFloat a, SimdFloat b)
{
    return {
               _mm_cmpneq_ps(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFBool gmx_simdcall
operator<(SimdFloat a, SimdFloat b)
{
    return {
               _mm_cmplt_ps(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFBool gmx_simdcall
operator<=(SimdFloat a, SimdFloat b)
{
    return {
               _mm_cmple_ps(a.simdInternal_, b.simdInternal_)
    };
}
static inline SimdFBool gmx_simdcall
testBits(SimdFloat a)
{
    __m128i ia  = _mm_castps_si128(a.simdInternal_);
    __m128i res = _mm_andnot_si128( _mm_cmpeq_epi32(ia, _mm_setzero_si128()), _mm_cmpeq_epi32(ia, ia));

    return {
               _mm_castsi128_ps(res)
    };
}
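// Trick above: _mm_cmpeq_epi32(ia, ia) is all-ones, and andnot clears those
// lanes where ia == 0, so each lane ends up all-ones exactly when any bit of
// it is set. An integer compare is used because a floating-point compare
// would treat -0.0f (only the sign bit set) as equal to zero.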
static inline SimdFBool gmx_simdcall
operator&&(SimdFBool a, SimdFBool b)
{
    return {
               _mm_and_ps(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFBool gmx_simdcall
operator||(SimdFBool a, SimdFBool b)
{
    return {
               _mm_or_ps(a.simdInternal_, b.simdInternal_)
    };
}

static inline bool gmx_simdcall
anyTrue(SimdFBool a) { return _mm_movemask_ps(a.simdInternal_) != 0; }

static inline SimdFloat gmx_simdcall
selectByMask(SimdFloat a, SimdFBool mask)
{
    return {
               _mm_and_ps(a.simdInternal_, mask.simdInternal_)
    };
}

static inline SimdFloat gmx_simdcall
selectByNotMask(SimdFloat a, SimdFBool mask)
{
    return {
               _mm_andnot_ps(mask.simdInternal_, a.simdInternal_)
    };
}
// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdFloat gmx_simdcall
blend(SimdFloat a, SimdFloat b, SimdFBool sel)
{
    return {
               _mm_or_ps(_mm_andnot_ps(sel.simdInternal_, a.simdInternal_), _mm_and_ps(sel.simdInternal_, b.simdInternal_))
    };
}
#endif
static inline SimdFInt32 gmx_simdcall
operator&(SimdFInt32 a, SimdFInt32 b)
{
    return {
               _mm_and_si128(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFInt32 gmx_simdcall
andNot(SimdFInt32 a, SimdFInt32 b)
{
    return {
               _mm_andnot_si128(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFInt32 gmx_simdcall
operator|(SimdFInt32 a, SimdFInt32 b)
{
    return {
               _mm_or_si128(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFInt32 gmx_simdcall
operator^(SimdFInt32 a, SimdFInt32 b)
{
    return {
               _mm_xor_si128(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFInt32 gmx_simdcall
operator+(SimdFInt32 a, SimdFInt32 b)
{
    return {
               _mm_add_epi32(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFInt32 gmx_simdcall
operator-(SimdFInt32 a, SimdFInt32 b)
{
    return {
               _mm_sub_epi32(a.simdInternal_, b.simdInternal_)
    };
}
// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdFInt32 gmx_simdcall
operator*(SimdFInt32 a, SimdFInt32 b)
{
    __m128i a1 = _mm_srli_si128(a.simdInternal_, 4); // - a[3] a[2] a[1]
    __m128i b1 = _mm_srli_si128(b.simdInternal_, 4); // - b[3] b[2] b[1]
    __m128i c  = _mm_mul_epu32(a.simdInternal_, b.simdInternal_);
    __m128i c1 = _mm_mul_epu32(a1, b1);

    c  = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 1, 2, 0));  // - - a[2]*b[2] a[0]*b[0]
    c1 = _mm_shuffle_epi32(c1, _MM_SHUFFLE(3, 1, 2, 0)); // - - a[3]*b[3] a[1]*b[1]

    return {
               _mm_unpacklo_epi32(c, c1)
    };
}
#endif
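// SSE2 lacks a packed 32-bit multiply (_mm_mullo_epi32 is SSE4.1), so the
// code above emulates the scalar loop
//   for (int i = 0; i < 4; i++) { r[i] = a[i] * b[i]; }
// with two _mm_mul_epu32 calls, which multiply the even lanes of their
// inputs into 64-bit products; byte-shifting a and b right by one lane first
// exposes the odd lanes, and the shuffles/unpack keep only the low 32 bits
// of each product, which is also the correct wrap-around result for signed
// multiplication.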
static inline SimdFIBool gmx_simdcall
operator==(SimdFInt32 a, SimdFInt32 b)
{
    return {
               _mm_cmpeq_epi32(a.simdInternal_, b.simdInternal_)
    };
}
static inline SimdFIBool gmx_simdcall
testBits(SimdFInt32 a)
{
    __m128i x   = a.simdInternal_;
    __m128i res = _mm_andnot_si128( _mm_cmpeq_epi32(x, _mm_setzero_si128()), _mm_cmpeq_epi32(x, x));

    return {
               res
    };
}
static inline SimdFIBool gmx_simdcall
operator<(SimdFInt32 a, SimdFInt32 b)
{
    return {
               _mm_cmplt_epi32(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFIBool gmx_simdcall
operator&&(SimdFIBool a, SimdFIBool b)
{
    return {
               _mm_and_si128(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdFIBool gmx_simdcall
operator||(SimdFIBool a, SimdFIBool b)
{
    return {
               _mm_or_si128(a.simdInternal_, b.simdInternal_)
    };
}

static inline bool gmx_simdcall
anyTrue(SimdFIBool a) { return _mm_movemask_epi8(a.simdInternal_) != 0; }

static inline SimdFInt32 gmx_simdcall
selectByMask(SimdFInt32 a, SimdFIBool mask)
{
    return {
               _mm_and_si128(a.simdInternal_, mask.simdInternal_)
    };
}

static inline SimdFInt32 gmx_simdcall
selectByNotMask(SimdFInt32 a, SimdFIBool mask)
{
    return {
               _mm_andnot_si128(mask.simdInternal_, a.simdInternal_)
    };
}
// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdFInt32 gmx_simdcall
blend(SimdFInt32 a, SimdFInt32 b, SimdFIBool sel)
{
    return {
               _mm_or_si128(_mm_andnot_si128(sel.simdInternal_, a.simdInternal_), _mm_and_si128(sel.simdInternal_, b.simdInternal_))
    };
}
#endif
static inline SimdFInt32 gmx_simdcall
cvtR2I(SimdFloat a)
{
    return {
               _mm_cvtps_epi32(a.simdInternal_)
    };
}

static inline SimdFInt32 gmx_simdcall
cvttR2I(SimdFloat a)
{
    return {
               _mm_cvttps_epi32(a.simdInternal_)
    };
}

static inline SimdFloat gmx_simdcall
cvtI2R(SimdFInt32 a)
{
    return {
               _mm_cvtepi32_ps(a.simdInternal_)
    };
}

static inline SimdFIBool gmx_simdcall
cvtB2IB(SimdFBool a)
{
    return {
               _mm_castps_si128(a.simdInternal_)
    };
}
static inline SimdFBool gmx_simdcall
cvtIB2B(SimdFIBool a)
{
    return {
               _mm_castsi128_ps(a.simdInternal_)
    };
}
}      // namespace gmx

#endif // GMX_SIMD_IMPL_X86_SSE2_SIMD_FLOAT_H