/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_SSE2_SIMD_DOUBLE_H
#define GMX_SIMD_IMPL_X86_SSE2_SIMD_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <emmintrin.h>

#include "gromacs/math/utilities.h"

#include "impl_x86_sse2_simd_float.h"

namespace gmx
{

class SimdDouble
{
    public:
        SimdDouble() {}

        SimdDouble(double d) : simdInternal_(_mm_set1_pd(d)) {}

        // Internal utility constructor to simplify return statements
        SimdDouble(__m128d simd) : simdInternal_(simd) {}

        __m128d simdInternal_;
};

class SimdDInt32
{
    public:
        SimdDInt32() {}

        SimdDInt32(std::int32_t i) : simdInternal_(_mm_set1_epi32(i)) {}

        // Internal utility constructor to simplify return statements
        SimdDInt32(__m128i simd) : simdInternal_(simd) {}

        __m128i simdInternal_;
};

class SimdDBool
{
    public:
        SimdDBool() {}

        SimdDBool(bool b) : simdInternal_(_mm_castsi128_pd(_mm_set1_epi32( b ? 0xFFFFFFFF : 0))) {}

        // Internal utility constructor to simplify return statements
        SimdDBool(__m128d simd) : simdInternal_(simd) {}

        __m128d simdInternal_;
};

class SimdDIBool
{
    public:
        SimdDIBool() {}

        SimdDIBool(bool b) : simdInternal_(_mm_set1_epi32( b ? 0xFFFFFFFF : 0)) {}

        // Internal utility constructor to simplify return statements
        SimdDIBool(__m128i simd) : simdInternal_(simd) {}

        __m128i simdInternal_;
};
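
// With SSE2, SimdDouble and SimdDBool each wrap one __m128d holding two
// doubles, while SimdDInt32 and SimdDIBool keep the two corresponding 32-bit
// integers in the low half of an __m128i. A minimal usage sketch (assuming
// this header is reached through the generic gromacs/simd/simd.h dispatch
// rather than included directly):
//
//     alignas(16) double mem[2] = { 1.0, 2.0 };
//     SimdDouble         a      = simdLoad(mem);
//     SimdDouble         b      = a + SimdDouble(3.0);
//     store(mem, b);            // mem now holds { 4.0, 5.0 }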

static inline SimdDouble gmx_simdcall
simdLoad(const double *m, SimdDoubleTag = {})
{
    assert(std::size_t(m) % 16 == 0);
    return {
               _mm_load_pd(m)
    };
}

static inline void gmx_simdcall
store(double *m, SimdDouble a)
{
    assert(std::size_t(m) % 16 == 0);
    _mm_store_pd(m, a.simdInternal_);
}

static inline SimdDouble gmx_simdcall
simdLoadU(const double *m, SimdDoubleTag = {})
{
    return {
               _mm_loadu_pd(m)
    };
}

static inline void gmx_simdcall
storeU(double *m, SimdDouble a) { _mm_storeu_pd(m, a.simdInternal_); }

static inline SimdDouble gmx_simdcall
setZeroD()
{
    return {
               _mm_setzero_pd()
    };
}

static inline SimdDInt32 gmx_simdcall
simdLoad(const std::int32_t * m, SimdDInt32Tag)
{
    assert(std::size_t(m) % 8 == 0);
    return {
               _mm_loadl_epi64(reinterpret_cast<const __m128i *>(m))
    };
}

static inline void gmx_simdcall
store(std::int32_t * m, SimdDInt32 a)
{
    assert(std::size_t(m) % 8 == 0);
    _mm_storel_epi64(reinterpret_cast<__m128i *>(m), a.simdInternal_);
}

static inline SimdDInt32 gmx_simdcall
simdLoadU(const std::int32_t *m, SimdDInt32Tag)
{
    return {
               _mm_loadl_epi64(reinterpret_cast<const __m128i *>(m))
    };
}

static inline void gmx_simdcall
storeU(std::int32_t * m, SimdDInt32 a)
{
    _mm_storel_epi64(reinterpret_cast<__m128i *>(m), a.simdInternal_);
}

static inline SimdDInt32 gmx_simdcall
setZeroDI()
{
    return {
               _mm_setzero_si128()
    };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
template<int index>
static inline std::int32_t gmx_simdcall
extract(SimdDInt32 a)
{
    return _mm_cvtsi128_si32( _mm_srli_si128(a.simdInternal_, 4 * index) );
}
#endif
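
// Without SSE4.1's _mm_extract_epi32(), extract() shifts the register right
// by 4*index bytes so the requested 32-bit element lands in position 0, where
// _mm_cvtsi128_si32() can read it; extract<0>(a) is then simply a[0].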

static inline SimdDouble gmx_simdcall
operator&(SimdDouble a, SimdDouble b)
{
    return {
               _mm_and_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDouble gmx_simdcall
andNot(SimdDouble a, SimdDouble b)
{
    return {
               _mm_andnot_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDouble gmx_simdcall
operator|(SimdDouble a, SimdDouble b)
{
    return {
               _mm_or_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDouble gmx_simdcall
operator^(SimdDouble a, SimdDouble b)
{
    return {
               _mm_xor_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDouble gmx_simdcall
operator+(SimdDouble a, SimdDouble b)
{
    return {
               _mm_add_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDouble gmx_simdcall
operator-(SimdDouble a, SimdDouble b)
{
    return {
               _mm_sub_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDouble gmx_simdcall
operator-(SimdDouble x)
{
    return {
               _mm_xor_pd(x.simdInternal_, _mm_set1_pd(GMX_DOUBLE_NEGZERO))
    };
}

static inline SimdDouble gmx_simdcall
operator*(SimdDouble a, SimdDouble b)
{
    return {
               _mm_mul_pd(a.simdInternal_, b.simdInternal_)
    };
}

// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline SimdDouble gmx_simdcall
fma(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return {
               _mm_add_pd(_mm_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_)
    };
}

static inline SimdDouble gmx_simdcall
fms(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return {
               _mm_sub_pd(_mm_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_)
    };
}

static inline SimdDouble gmx_simdcall
fnma(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return {
               _mm_sub_pd(c.simdInternal_, _mm_mul_pd(a.simdInternal_, b.simdInternal_))
    };
}

static inline SimdDouble gmx_simdcall
fnms(SimdDouble a, SimdDouble b, SimdDouble c)
{
    return {
               _mm_sub_pd(_mm_setzero_pd(), _mm_add_pd(_mm_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_))
    };
}
#endif
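
// On plain SSE2/SSE4.1 these fused operations are emulated with a separate
// multiply and add/subtract, so the intermediate product is rounded:
// fma(a,b,c) = a*b + c, fms(a,b,c) = a*b - c, fnma(a,b,c) = -a*b + c and
// fnms(a,b,c) = -a*b - c. Implementations with hardware FMA override them.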

static inline SimdDouble gmx_simdcall
rsqrt(SimdDouble x)
{
    return {
               _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(x.simdInternal_)))
    };
}

static inline SimdDouble gmx_simdcall
rcp(SimdDouble x)
{
    return {
               _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(x.simdInternal_)))
    };
}
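
// SSE2 has no double-precision approximation instructions, so rsqrt() and
// rcp() round down to single precision, use the roughly 11-bit
// _mm_rsqrt_ps()/_mm_rcp_ps() approximations, and convert back. Callers are
// expected to refine these seeds (e.g. with Newton-Raphson iterations in the
// generic SIMD math layer) before relying on full double precision.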

static inline SimdDouble gmx_simdcall
maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
{
    return {
               _mm_add_pd(a.simdInternal_, _mm_and_pd(b.simdInternal_, m.simdInternal_))
    };
}

static inline SimdDouble gmx_simdcall
maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
{
    return {
               _mm_and_pd(_mm_mul_pd(a.simdInternal_, b.simdInternal_), m.simdInternal_)
    };
}

static inline SimdDouble gmx_simdcall
maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
{
    return {
               _mm_and_pd(_mm_add_pd(_mm_mul_pd(a.simdInternal_, b.simdInternal_), c.simdInternal_), m.simdInternal_)
    };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdDouble gmx_simdcall
maskzRsqrt(SimdDouble x, SimdDBool m)
{
    // The result will always be correct since we mask the result with m, but
    // for debug builds we also want to make sure not to generate FP exceptions
#ifndef NDEBUG
    x.simdInternal_ = _mm_or_pd(_mm_andnot_pd(m.simdInternal_, _mm_set1_pd(1.0)), _mm_and_pd(m.simdInternal_, x.simdInternal_));
#endif
    return {
               _mm_and_pd(_mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(x.simdInternal_))), m.simdInternal_)
    };
}

static inline SimdDouble gmx_simdcall
maskzRcp(SimdDouble x, SimdDBool m)
{
    // The result will always be correct since we mask the result with m, but
    // for debug builds we also want to make sure not to generate FP exceptions
#ifndef NDEBUG
    x.simdInternal_ = _mm_or_pd(_mm_andnot_pd(m.simdInternal_, _mm_set1_pd(1.0)), _mm_and_pd(m.simdInternal_, x.simdInternal_));
#endif
    return {
               _mm_and_pd(_mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(x.simdInternal_))), m.simdInternal_)
    };
}
#endif

static inline SimdDouble gmx_simdcall
abs(SimdDouble x)
{
    return {
               _mm_andnot_pd( _mm_set1_pd(GMX_DOUBLE_NEGZERO), x.simdInternal_ )
    };
}

static inline SimdDouble gmx_simdcall
max(SimdDouble a, SimdDouble b)
{
    return {
               _mm_max_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDouble gmx_simdcall
min(SimdDouble a, SimdDouble b)
{
    return {
               _mm_min_pd(a.simdInternal_, b.simdInternal_)
    };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdDouble gmx_simdcall
round(SimdDouble x)
{
    return {
               _mm_cvtepi32_pd( _mm_cvtpd_epi32(x.simdInternal_) )
    };
}

static inline SimdDouble gmx_simdcall
trunc(SimdDouble x)
{
    return {
               _mm_cvtepi32_pd( _mm_cvttpd_epi32(x.simdInternal_) )
    };
}
#endif
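
// Without SSE4.1's _mm_round_pd(), round() and trunc() go through 32-bit
// integers: _mm_cvtpd_epi32() rounds to nearest under the default rounding
// mode and _mm_cvttpd_epi32() truncates toward zero, after which the integers
// are converted back to double. This is only exact while the values fit in a
// 32-bit integer; SSE4.1 and higher override both with native rounding.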

static inline SimdDouble
frexp(SimdDouble value, SimdDInt32 * exponent)
{
    // Don't use _mm_set1_epi64x() - on MSVC it is only supported for 64-bit builds
    const __m128d exponentMask = _mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
    const __m128d mantissaMask = _mm_castsi128_pd( _mm_set_epi32(0x800FFFFF, 0xFFFFFFFF, 0x800FFFFF, 0xFFFFFFFF) );
    const __m128i exponentBias = _mm_set1_epi32(1022); // add 1 to make our definition identical to frexp()
    const __m128d half         = _mm_set1_pd(0.5);
    __m128i       iExponent;

    iExponent               = _mm_castpd_si128(_mm_and_pd(value.simdInternal_, exponentMask));
    iExponent               = _mm_sub_epi32(_mm_srli_epi64(iExponent, 52), exponentBias);
    iExponent               = _mm_shuffle_epi32(iExponent, _MM_SHUFFLE(3, 1, 2, 0) );
    exponent->simdInternal_ = iExponent;

    return {
               _mm_or_pd(_mm_and_pd(value.simdInternal_, mantissaMask), half)
    };
}
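
// frexp() works directly on the IEEE-754 bit pattern: the 11 exponent bits
// (bits 52-62) are masked out, shifted down and rebiased with 1022 so that the
// returned fraction lies in [0.5, 1), while the original mantissa is kept and
// its exponent field is overwritten with that of 0.5. The shuffle packs the
// two 32-bit exponents into the low half of the integer register, matching the
// SimdDInt32 layout. For example, frexp(8.0) yields a fraction of 0.5 and an
// exponent of 4, since 8.0 == 0.5 * 2^4.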

// Override for SSE4.1
#if GMX_SIMD_X86_SSE2
template <MathOptimization opt = MathOptimization::Safe>
static inline SimdDouble
ldexp(SimdDouble value, SimdDInt32 exponent)
{
    const __m128i exponentBias = _mm_set1_epi32(1023);
    __m128i       iExponent    = _mm_add_epi32(exponent.simdInternal_, exponentBias);

    if (opt == MathOptimization::Safe)
    {
        // Make sure biased argument is not negative
        iExponent = _mm_and_si128(iExponent, _mm_cmpgt_epi32(iExponent, _mm_setzero_si128()));
    }

    // After conversion integers will be in slot 0,1. Move them to 0,2 so
    // we can do a 64-bit shift and get them to the dp exponents.
    iExponent = _mm_shuffle_epi32(iExponent, _MM_SHUFFLE(3, 1, 2, 0));
    iExponent = _mm_slli_epi64(iExponent, 52);

    return {
               _mm_mul_pd(value.simdInternal_, _mm_castsi128_pd(iExponent))
    };
}
#endif
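
// ldexp() builds the scaling factor 2^exponent bit-wise: the biased exponent
// is moved into the two 64-bit lanes and shifted into the double exponent
// field (bits 52-62) above a zero mantissa, and the input is then multiplied
// by that power of two. With MathOptimization::Safe a non-positive biased
// exponent is clamped to zero, so the factor (and hence the result) becomes
// +0.0 instead of a garbage bit pattern. Mirroring the frexp() example above,
// ldexp(0.5, 4) gives back 8.0.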

// Override for AVX-128-FMA and higher
#if GMX_SIMD_X86_SSE2 || GMX_SIMD_X86_SSE4_1
static inline double gmx_simdcall
reduce(SimdDouble a)
{
    __m128d b = _mm_add_sd(a.simdInternal_, _mm_shuffle_pd(a.simdInternal_, a.simdInternal_, _MM_SHUFFLE2(1, 1)));
    return *reinterpret_cast<double *>(&b);
}
#endif
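
// reduce() is a horizontal add: the shuffle copies the upper double into the
// lower lane so _mm_add_sd() forms a[0] + a[1] in element 0, which is then
// read back as a scalar. For example, reducing the register loaded from
// { 1.0, 2.0 } returns 3.0.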

static inline SimdDBool gmx_simdcall
operator==(SimdDouble a, SimdDouble b)
{
    return {
               _mm_cmpeq_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDBool gmx_simdcall
operator!=(SimdDouble a, SimdDouble b)
{
    return {
               _mm_cmpneq_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDBool gmx_simdcall
operator<(SimdDouble a, SimdDouble b)
{
    return {
               _mm_cmplt_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDBool gmx_simdcall
operator<=(SimdDouble a, SimdDouble b)
{
    return {
               _mm_cmple_pd(a.simdInternal_, b.simdInternal_)
    };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdDBool gmx_simdcall
testBits(SimdDouble a)
{
    __m128i ia  = _mm_castpd_si128(a.simdInternal_);
    __m128i res = _mm_andnot_si128( _mm_cmpeq_epi32(ia, _mm_setzero_si128()), _mm_cmpeq_epi32(ia, ia));

    // set each 64-bit element if low or high 32-bit part is set
    res = _mm_or_si128(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(2, 3, 0, 1)));

    return {
               _mm_castsi128_pd(res)
    };
}
#endif
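
// SSE2 lacks a 64-bit integer compare, so testBits() builds the "any bit set"
// mask from 32-bit compares: _mm_cmpeq_epi32(ia, ia) is all-ones, the andnot
// keeps all-ones only for the 32-bit halves that are nonzero, and the
// shuffle/or merges each pair of halves so a 64-bit lane is all-ones whenever
// either half was nonzero. SSE4.1 and higher override this, where a native
// 64-bit compare is available.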

static inline SimdDBool gmx_simdcall
operator&&(SimdDBool a, SimdDBool b)
{
    return {
               _mm_and_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDBool gmx_simdcall
operator||(SimdDBool a, SimdDBool b)
{
    return {
               _mm_or_pd(a.simdInternal_, b.simdInternal_)
    };
}

static inline bool gmx_simdcall
anyTrue(SimdDBool a) { return _mm_movemask_pd(a.simdInternal_) != 0; }

static inline SimdDouble gmx_simdcall
selectByMask(SimdDouble a, SimdDBool mask)
{
    return {
               _mm_and_pd(a.simdInternal_, mask.simdInternal_)
    };
}

static inline SimdDouble gmx_simdcall
selectByNotMask(SimdDouble a, SimdDBool mask)
{
    return {
               _mm_andnot_pd(mask.simdInternal_, a.simdInternal_)
    };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdDouble gmx_simdcall
blend(SimdDouble a, SimdDouble b, SimdDBool sel)
{
    return {
               _mm_or_pd(_mm_andnot_pd(sel.simdInternal_, a.simdInternal_), _mm_and_pd(sel.simdInternal_, b.simdInternal_))
    };
}
#endif

static inline SimdDInt32 gmx_simdcall
operator&(SimdDInt32 a, SimdDInt32 b)
{
    return {
               _mm_and_si128(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDInt32 gmx_simdcall
andNot(SimdDInt32 a, SimdDInt32 b)
{
    return {
               _mm_andnot_si128(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDInt32 gmx_simdcall
operator|(SimdDInt32 a, SimdDInt32 b)
{
    return {
               _mm_or_si128(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDInt32 gmx_simdcall
operator^(SimdDInt32 a, SimdDInt32 b)
{
    return {
               _mm_xor_si128(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDInt32 gmx_simdcall
operator+(SimdDInt32 a, SimdDInt32 b)
{
    return {
               _mm_add_epi32(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDInt32 gmx_simdcall
operator-(SimdDInt32 a, SimdDInt32 b)
{
    return {
               _mm_sub_epi32(a.simdInternal_, b.simdInternal_)
    };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdDInt32 gmx_simdcall
operator*(SimdDInt32 a, SimdDInt32 b)
{
    __m128i tmpA = _mm_unpacklo_epi32(a.simdInternal_, _mm_setzero_si128()); // 0 a[1] 0 a[0]
    __m128i tmpB = _mm_unpacklo_epi32(b.simdInternal_, _mm_setzero_si128()); // 0 b[1] 0 b[0]

    __m128i tmpC = _mm_mul_epu32(tmpA, tmpB);                                // 0 a[1]*b[1] 0 a[0]*b[0]

    return {
               _mm_shuffle_epi32(tmpC, _MM_SHUFFLE(3, 1, 2, 0))
    };
}
#endif
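
// SSE2 has no 32-bit packed multiply with 32-bit results, so operator* uses
// _mm_mul_epu32(), which multiplies the even-indexed 32-bit elements into
// 64-bit products. Interleaving a and b with zeros places a[0], a[1] (and
// b[0], b[1]) in those even slots, and the final shuffle collects the low
// 32 bits of each product, which equal the wrapped 32-bit result regardless
// of sign. SSE4.1 overrides this with _mm_mullo_epi32().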

static inline SimdDIBool gmx_simdcall
operator==(SimdDInt32 a, SimdDInt32 b)
{
    return {
               _mm_cmpeq_epi32(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDIBool gmx_simdcall
testBits(SimdDInt32 a)
{
    __m128i x   = a.simdInternal_;
    __m128i res = _mm_andnot_si128( _mm_cmpeq_epi32(x, _mm_setzero_si128()), _mm_cmpeq_epi32(x, x));

    return {
               res
    };
}

static inline SimdDIBool gmx_simdcall
operator<(SimdDInt32 a, SimdDInt32 b)
{
    return {
               _mm_cmplt_epi32(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDIBool gmx_simdcall
operator&&(SimdDIBool a, SimdDIBool b)
{
    return {
               _mm_and_si128(a.simdInternal_, b.simdInternal_)
    };
}

static inline SimdDIBool gmx_simdcall
operator||(SimdDIBool a, SimdDIBool b)
{
    return {
               _mm_or_si128(a.simdInternal_, b.simdInternal_)
    };
}

static inline bool gmx_simdcall
anyTrue(SimdDIBool a)
{
    return _mm_movemask_epi8(_mm_shuffle_epi32(a.simdInternal_, _MM_SHUFFLE(1, 0, 1, 0))) != 0;
}

static inline SimdDInt32 gmx_simdcall
selectByMask(SimdDInt32 a, SimdDIBool mask)
{
    return {
               _mm_and_si128(a.simdInternal_, mask.simdInternal_)
    };
}

static inline SimdDInt32 gmx_simdcall
selectByNotMask(SimdDInt32 a, SimdDIBool mask)
{
    return {
               _mm_andnot_si128(mask.simdInternal_, a.simdInternal_)
    };
}

// Override for SSE4.1 and higher
#if GMX_SIMD_X86_SSE2
static inline SimdDInt32 gmx_simdcall
blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
{
    return {
               _mm_or_si128(_mm_andnot_si128(sel.simdInternal_, a.simdInternal_), _mm_and_si128(sel.simdInternal_, b.simdInternal_))
    };
}
#endif

static inline SimdDInt32 gmx_simdcall
cvtR2I(SimdDouble a)
{
    return {
               _mm_cvtpd_epi32(a.simdInternal_)
    };
}

static inline SimdDInt32 gmx_simdcall
cvttR2I(SimdDouble a)
{
    return {
               _mm_cvttpd_epi32(a.simdInternal_)
    };
}

static inline SimdDouble gmx_simdcall
cvtI2R(SimdDInt32 a)
{
    return {
               _mm_cvtepi32_pd(a.simdInternal_)
    };
}

static inline SimdDIBool gmx_simdcall
cvtB2IB(SimdDBool a)
{
    return {
               _mm_shuffle_epi32(_mm_castpd_si128(a.simdInternal_), _MM_SHUFFLE(2, 0, 2, 0))
    };
}

static inline SimdDBool gmx_simdcall
cvtIB2B(SimdDIBool a)
{
    return {
               _mm_castsi128_pd(_mm_shuffle_epi32(a.simdInternal_, _MM_SHUFFLE(1, 1, 0, 0)))
    };
}
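
// The boolean conversions only rearrange mask bits: cvtB2IB() compresses the
// two 64-bit double masks into the two low 32-bit integer slots, and
// cvtIB2B() duplicates each 32-bit integer mask so it again fills a whole
// 64-bit lane.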

static inline void gmx_simdcall
cvtF2DD(SimdFloat f, SimdDouble *d0, SimdDouble *d1)
{
    d0->simdInternal_ = _mm_cvtps_pd(f.simdInternal_);
    d1->simdInternal_ = _mm_cvtps_pd(_mm_movehl_ps(f.simdInternal_, f.simdInternal_));
}

static inline SimdFloat gmx_simdcall
cvtDD2F(SimdDouble d0, SimdDouble d1)
{
    return {
               _mm_movelh_ps(_mm_cvtpd_ps(d0.simdInternal_), _mm_cvtpd_ps(d1.simdInternal_))
    };
}
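
// A SimdFloat holds four floats but a SimdDouble only two doubles, so the
// float/double conversions work on pairs: cvtF2DD() widens the low and high
// float pairs into two double registers, and cvtDD2F() narrows two double
// registers back into one float register.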

}      // namespace gmx

#endif // GMX_SIMD_IMPL_X86_SSE2_SIMD_DOUBLE_H