Extended SIMD, implementation for Intel MIC
[gromacs.git] / src / gromacs / simd / impl_x86_mic / impl_x86_mic_simd_double.h
blobc0868eed85c0bbb8211e207f56bd0fbe84e12885
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
37 #define GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H
39 #include "config.h"
41 #include <cassert>
42 #include <cstdint>
44 #include <immintrin.h>
46 #include "gromacs/utility/basedefinitions.h"
48 #include "impl_x86_mic_simd_float.h"
50 namespace gmx
53 class SimdDouble
55 public:
56 SimdDouble() {}
58 SimdDouble(double d) : simdInternal_(_mm512_set1_pd(d)) {}
60 // Internal utility constructor to simplify return statements
61 SimdDouble(__m512d simd) : simdInternal_(simd) {}
63 __m512d simdInternal_;
66 class SimdDInt32
68 public:
69 SimdDInt32() {}
71 SimdDInt32(std::int32_t i) : simdInternal_(_mm512_set1_epi32(i)) {}
73 // Internal utility constructor to simplify return statements
74 SimdDInt32(__m512i simd) : simdInternal_(simd) {}
76 __m512i simdInternal_;
79 class SimdDBool
81 public:
82 SimdDBool() {}
84 // Internal utility constructor to simplify return statements
85 SimdDBool(__mmask8 simd) : simdInternal_(simd) {}
87 __mmask8 simdInternal_;
90 class SimdDIBool
92 public:
93 SimdDIBool() {}
95 // Internal utility constructor to simplify return statements
96 SimdDIBool(__mmask16 simd) : simdInternal_(simd) {}
98 __mmask16 simdInternal_;
101 static inline SimdDouble gmx_simdcall
102 load(const double *m)
104 assert(std::size_t(m) % 64 == 0);
105 return {
106 _mm512_load_pd(m)
110 static inline void gmx_simdcall
111 store(double *m, SimdDouble a)
113 assert(std::size_t(m) % 64 == 0);
114 _mm512_store_pd(m, a.simdInternal_);
117 static inline SimdDouble gmx_simdcall
118 loadU(const double *m)
120 return {
121 _mm512_loadunpackhi_pd(_mm512_loadunpacklo_pd(_mm512_undefined_pd(), m), m+8)
125 static inline void gmx_simdcall
126 storeU(double *m, SimdDouble a)
128 _mm512_packstorelo_pd(m, a.simdInternal_);
129 _mm512_packstorehi_pd(m+8, a.simdInternal_);
133 static inline SimdDouble gmx_simdcall
134 setZeroD()
136 return {
137 _mm512_setzero_pd()
141 static inline SimdDInt32 gmx_simdcall
142 loadDI(const std::int32_t * m)
144 assert(std::size_t(m) % 32 == 0);
145 return {
146 _mm512_extload_epi64(m, _MM_UPCONV_EPI64_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE)
150 static inline void gmx_simdcall
151 store(std::int32_t * m, SimdDInt32 a)
153 assert(std::size_t(m) % 32 == 0);
154 _mm512_mask_packstorelo_epi32(m, _mm512_int2mask(0x00FF), a.simdInternal_);
157 static inline SimdDInt32 gmx_simdcall
158 loadUDI(const std::int32_t *m)
160 return {
161 _mm512_mask_loadunpackhi_epi32(_mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0x00FF), m),
162 _mm512_int2mask(0x00FF), m+16)
166 static inline void gmx_simdcall
167 storeU(std::int32_t * m, SimdDInt32 a)
169 _mm512_mask_packstorelo_epi32(m, _mm512_int2mask(0x00FF), a.simdInternal_);
170 _mm512_mask_packstorehi_epi32(m+16, _mm512_int2mask(0x00FF), a.simdInternal_);
173 static inline SimdDInt32 gmx_simdcall
174 setZeroDI()
176 return {
177 _mm512_setzero_epi32()
181 template<int index>
182 static inline std::int32_t gmx_simdcall
183 extract(SimdDInt32 a)
185 int r;
186 _mm512_mask_packstorelo_epi32(&r, _mm512_mask2int(1<<index), a.simdInternal_);
187 return r;
190 static inline SimdDouble gmx_simdcall
191 operator&(SimdDouble a, SimdDouble b)
193 return {
194 _mm512_castsi512_pd(_mm512_and_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
198 static inline SimdDouble gmx_simdcall
199 andNot(SimdDouble a, SimdDouble b)
201 return {
202 _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
206 static inline SimdDouble gmx_simdcall
207 operator|(SimdDouble a, SimdDouble b)
209 return {
210 _mm512_castsi512_pd(_mm512_or_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
214 static inline SimdDouble gmx_simdcall
215 operator^(SimdDouble a, SimdDouble b)
217 return {
218 _mm512_castsi512_pd(_mm512_xor_epi32(_mm512_castpd_si512(a.simdInternal_), _mm512_castpd_si512(b.simdInternal_)))
222 static inline SimdDouble gmx_simdcall
223 operator+(SimdDouble a, SimdDouble b)
225 return {
226 _mm512_add_pd(a.simdInternal_, b.simdInternal_)
230 static inline SimdDouble gmx_simdcall
231 operator-(SimdDouble a, SimdDouble b)
233 return {
234 _mm512_sub_pd(a.simdInternal_, b.simdInternal_)
238 static inline SimdDouble gmx_simdcall
239 operator-(SimdDouble x)
241 return {
242 _mm512_addn_pd(x.simdInternal_, _mm512_setzero_pd())
246 static inline SimdDouble gmx_simdcall
247 operator*(SimdDouble a, SimdDouble b)
249 return {
250 _mm512_mul_pd(a.simdInternal_, b.simdInternal_)
254 static inline SimdDouble gmx_simdcall
255 fma(SimdDouble a, SimdDouble b, SimdDouble c)
257 return {
258 _mm512_fmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
262 static inline SimdDouble gmx_simdcall
263 fms(SimdDouble a, SimdDouble b, SimdDouble c)
265 return {
266 _mm512_fmsub_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
270 static inline SimdDouble gmx_simdcall
271 fnma(SimdDouble a, SimdDouble b, SimdDouble c)
273 return {
274 _mm512_fnmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
278 static inline SimdDouble gmx_simdcall
279 fnms(SimdDouble a, SimdDouble b, SimdDouble c)
281 return {
282 _mm512_fnmsub_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_)
286 static inline SimdDouble gmx_simdcall
287 rsqrt(SimdDouble x)
289 return {
290 _mm512_cvtpslo_pd(_mm512_rsqrt23_ps(_mm512_cvtpd_pslo(x.simdInternal_)))
294 static inline SimdDouble gmx_simdcall
295 rcp(SimdDouble x)
297 return {
298 _mm512_cvtpslo_pd(_mm512_rcp23_ps(_mm512_cvtpd_pslo(x.simdInternal_)))
302 static inline SimdDouble gmx_simdcall
303 maskAdd(SimdDouble a, SimdDouble b, SimdDBool m)
305 return {
306 _mm512_mask_add_pd(a.simdInternal_, m.simdInternal_, a.simdInternal_, b.simdInternal_)
310 static inline SimdDouble gmx_simdcall
311 maskzMul(SimdDouble a, SimdDouble b, SimdDBool m)
313 return {
314 _mm512_mask_mul_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_, b.simdInternal_)
318 static inline SimdDouble gmx_simdcall
319 maskzFma(SimdDouble a, SimdDouble b, SimdDouble c, SimdDBool m)
321 return {
322 _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, _mm512_fmadd_pd(a.simdInternal_, b.simdInternal_, c.simdInternal_))
326 static inline SimdDouble gmx_simdcall
327 maskzRsqrt(SimdDouble x, SimdDBool m)
329 return {
330 _mm512_cvtpslo_pd(_mm512_mask_rsqrt23_ps(_mm512_setzero_ps(), m.simdInternal_, _mm512_cvtpd_pslo(x.simdInternal_)))
334 static inline SimdDouble gmx_simdcall
335 maskzRcp(SimdDouble x, SimdDBool m)
337 return {
338 _mm512_cvtpslo_pd(_mm512_mask_rcp23_ps(_mm512_setzero_ps(), m.simdInternal_, _mm512_cvtpd_pslo(x.simdInternal_)))
342 static inline SimdDouble gmx_simdcall
343 abs(SimdDouble x)
345 return {
346 _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(_mm512_set1_pd(GMX_DOUBLE_NEGZERO)), _mm512_castpd_si512(x.simdInternal_)))
350 static inline SimdDouble gmx_simdcall
351 max(SimdDouble a, SimdDouble b)
353 return {
354 _mm512_gmax_pd(a.simdInternal_, b.simdInternal_)
358 static inline SimdDouble gmx_simdcall
359 min(SimdDouble a, SimdDouble b)
361 return {
362 _mm512_gmin_pd(a.simdInternal_, b.simdInternal_)
366 static inline SimdDouble gmx_simdcall
367 round(SimdDouble x)
369 return {
370 _mm512_roundfxpnt_adjust_pd(x.simdInternal_, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
374 static inline SimdDouble gmx_simdcall
375 trunc(SimdDouble x)
377 return {
378 _mm512_roundfxpnt_adjust_pd(x.simdInternal_, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
382 static inline SimdDouble
383 frexp(SimdDouble value, SimdDInt32 * exponent)
385 __m512d rExponent = _mm512_getexp_pd(value.simdInternal_);
386 __m512i iExponent = _mm512_cvtfxpnt_roundpd_epi32lo(rExponent, _MM_FROUND_TO_NEAREST_INT);
388 exponent->simdInternal_ = _mm512_add_epi32(iExponent, _mm512_set1_epi32(1));
390 return {
391 _mm512_getmant_pd(value.simdInternal_, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src)
395 static inline SimdDouble
396 ldexp(SimdDouble value, SimdDInt32 exponent)
398 const __m512i exponentBias = _mm512_set1_epi32(1023);
399 __m512i iExponent;
401 iExponent = _mm512_permutevar_epi32(_mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), exponent.simdInternal_);
402 iExponent = _mm512_mask_slli_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0xAAAA), _mm512_add_epi32(iExponent, exponentBias), 20);
403 return _mm512_mul_pd(_mm512_castsi512_pd(iExponent), value.simdInternal_);
406 static inline double gmx_simdcall
407 reduce(SimdDouble a)
409 return _mm512_reduce_add_pd(a.simdInternal_);
412 // Picky, picky, picky:
413 // icc-16 complains about "Illegal value of immediate argument to intrinsic"
414 // unless we use
415 // 1) Ordered-quiet for ==
416 // 2) Unordered-quiet for !=
417 // 3) Ordered-signaling for < and <=
419 static inline SimdDBool gmx_simdcall
420 operator==(SimdDouble a, SimdDouble b)
422 return {
423 _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_EQ_OQ)
427 static inline SimdDBool gmx_simdcall
428 operator!=(SimdDouble a, SimdDouble b)
430 return {
431 _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_NEQ_UQ)
435 static inline SimdDBool gmx_simdcall
436 operator<(SimdDouble a, SimdDouble b)
438 return {
439 _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_LT_OS)
443 static inline SimdDBool gmx_simdcall
444 operator<=(SimdDouble a, SimdDouble b)
446 return {
447 _mm512_cmp_pd_mask(a.simdInternal_, b.simdInternal_, _CMP_LE_OS)
451 static inline SimdDBool gmx_simdcall
452 testBits(SimdDouble a)
454 // This is a bit problematic since Knight's corner does not have any 64-bit integer comparisons,
455 // and we cannot use floating-point since values with just a single bit set can evaluate to 0.0.
456 // Instead, we do it as
457 // 1) Do a logical or of the high/low 32 bits
458 // 2) Do a permute so we have the low 32 bits of each value in the low 8 32-bit elements
459 // 3) Do an integer comparison, and cast so we just keep the low 8 bits of the mask.
461 // By default we will use integers for the masks in the nonbonded kernels, so this shouldn't
462 // have any significant performance drawbacks.
464 __m512i ia = _mm512_castpd_si512(a.simdInternal_);
466 ia = _mm512_or_epi32(ia, _mm512_swizzle_epi32(ia, _MM_SWIZ_REG_CDAB));
467 ia = _mm512_permutevar_epi32( _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), ia);
469 return {
470 static_cast<__mmask8>(_mm512_cmp_epi32_mask(ia, _mm512_setzero_si512(), _MM_CMPINT_NE))
474 static inline SimdDBool gmx_simdcall
475 operator&&(SimdDBool a, SimdDBool b)
477 return {
478 static_cast<__mmask8>(_mm512_kand(a.simdInternal_, b.simdInternal_))
482 static inline SimdDBool gmx_simdcall
483 operator||(SimdDBool a, SimdDBool b)
485 return {
486 static_cast<__mmask8>(_mm512_kor(a.simdInternal_, b.simdInternal_))
490 static inline bool gmx_simdcall
491 anyTrue(SimdDBool a)
493 return _mm512_mask2int(a.simdInternal_) != 0;
496 static inline SimdDouble gmx_simdcall
497 selectByMask(SimdDouble a, SimdDBool m)
499 return {
500 _mm512_mask_mov_pd(_mm512_setzero_pd(), m.simdInternal_, a.simdInternal_)
504 static inline SimdDouble gmx_simdcall
505 selectByNotMask(SimdDouble a, SimdDBool m)
507 return {
508 _mm512_mask_mov_pd(a.simdInternal_, m.simdInternal_, _mm512_setzero_pd())
512 static inline SimdDouble gmx_simdcall
513 blend(SimdDouble a, SimdDouble b, SimdDBool sel)
515 return {
516 _mm512_mask_blend_pd(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
520 static inline SimdDInt32 gmx_simdcall
521 operator<<(SimdDInt32 a, int n)
523 return {
524 _mm512_slli_epi32(a.simdInternal_, n)
528 static inline SimdDInt32 gmx_simdcall
529 operator>>(SimdDInt32 a, int n)
531 return {
532 _mm512_srli_epi32(a.simdInternal_, n)
536 static inline SimdDInt32 gmx_simdcall
537 operator&(SimdDInt32 a, SimdDInt32 b)
539 return {
540 _mm512_and_epi32(a.simdInternal_, b.simdInternal_)
544 static inline SimdDInt32 gmx_simdcall
545 andNot(SimdDInt32 a, SimdDInt32 b)
547 return {
548 _mm512_andnot_epi32(a.simdInternal_, b.simdInternal_)
552 static inline SimdDInt32 gmx_simdcall
553 operator|(SimdDInt32 a, SimdDInt32 b)
555 return {
556 _mm512_or_epi32(a.simdInternal_, b.simdInternal_)
560 static inline SimdDInt32 gmx_simdcall
561 operator^(SimdDInt32 a, SimdDInt32 b)
563 return {
564 _mm512_xor_epi32(a.simdInternal_, b.simdInternal_)
568 static inline SimdDInt32 gmx_simdcall
569 operator+(SimdDInt32 a, SimdDInt32 b)
571 return {
572 _mm512_add_epi32(a.simdInternal_, b.simdInternal_)
576 static inline SimdDInt32 gmx_simdcall
577 operator-(SimdDInt32 a, SimdDInt32 b)
579 return {
580 _mm512_sub_epi32(a.simdInternal_, b.simdInternal_)
584 static inline SimdDInt32 gmx_simdcall
585 operator*(SimdDInt32 a, SimdDInt32 b)
587 return {
588 _mm512_mullo_epi32(a.simdInternal_, b.simdInternal_)
592 static inline SimdDIBool gmx_simdcall
593 operator==(SimdDInt32 a, SimdDInt32 b)
595 return {
596 _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_EQ)
600 static inline SimdDIBool gmx_simdcall
601 testBits(SimdDInt32 a)
603 return {
604 _mm512_cmp_epi32_mask(a.simdInternal_, _mm512_setzero_si512(), _MM_CMPINT_NE)
608 static inline SimdDIBool gmx_simdcall
609 operator<(SimdDInt32 a, SimdDInt32 b)
611 return {
612 _mm512_cmp_epi32_mask(a.simdInternal_, b.simdInternal_, _MM_CMPINT_LT)
616 static inline SimdDIBool gmx_simdcall
617 operator&&(SimdDIBool a, SimdDIBool b)
619 return {
620 _mm512_kand(a.simdInternal_, b.simdInternal_)
624 static inline SimdDIBool gmx_simdcall
625 operator||(SimdDIBool a, SimdDIBool b)
627 return {
628 _mm512_kor(a.simdInternal_, b.simdInternal_)
632 static inline bool gmx_simdcall
633 anyTrue(SimdDIBool a)
635 return ( _mm512_mask2int(a.simdInternal_) & 0xFF) != 0;
638 static inline SimdDInt32 gmx_simdcall
639 selectByMask(SimdDInt32 a, SimdDIBool m)
641 return {
642 _mm512_mask_mov_epi32(_mm512_setzero_epi32(), m.simdInternal_, a.simdInternal_)
646 static inline SimdDInt32 gmx_simdcall
647 selectByNotMask(SimdDInt32 a, SimdDIBool m)
649 return {
650 _mm512_mask_mov_epi32(a.simdInternal_, m.simdInternal_, _mm512_setzero_epi32())
654 static inline SimdDInt32 gmx_simdcall
655 blend(SimdDInt32 a, SimdDInt32 b, SimdDIBool sel)
657 return {
658 _mm512_mask_blend_epi32(sel.simdInternal_, a.simdInternal_, b.simdInternal_)
662 static inline SimdDInt32 gmx_simdcall
663 cvtR2I(SimdDouble a)
665 return {
666 _mm512_cvtfxpnt_roundpd_epi32lo(a.simdInternal_, _MM_FROUND_TO_NEAREST_INT)
670 static inline SimdDInt32 gmx_simdcall
671 cvttR2I(SimdDouble a)
673 return {
674 _mm512_cvtfxpnt_roundpd_epi32lo(a.simdInternal_, _MM_FROUND_TO_ZERO)
678 static inline SimdDouble gmx_simdcall
679 cvtI2R(SimdDInt32 a)
681 return {
682 _mm512_cvtepi32lo_pd(a.simdInternal_)
686 static inline SimdDIBool gmx_simdcall
687 cvtB2IB(SimdDBool a)
689 return {
690 a.simdInternal_
694 static inline SimdDBool gmx_simdcall
695 cvtIB2B(SimdDIBool a)
697 return {
698 static_cast<__mmask8>(a.simdInternal_)
702 static inline void gmx_simdcall
703 cvtF2DD(SimdFloat f, SimdDouble *d0, SimdDouble *d1)
705 __m512i i1 = _mm512_permute4f128_epi32(_mm512_castps_si512(f.simdInternal_), _MM_PERM_DCDC);
707 *d0 = _mm512_cvtpslo_pd(f.simdInternal_);
708 *d1 = _mm512_cvtpslo_pd(_mm512_castsi512_ps(i1));
711 static inline SimdFloat gmx_simdcall
712 cvtDD2F(SimdDouble d0, SimdDouble d1)
714 __m512 f0 = _mm512_cvtpd_pslo(d0.simdInternal_);
715 __m512 f1 = _mm512_cvtpd_pslo(d1.simdInternal_);
716 return {
717 _mm512_mask_permute4f128_ps(f0, _mm512_int2mask(0xFF00), f1, _MM_PERM_BABA)
721 } // namespace gmx
723 #endif // GMX_SIMD_IMPL_X86_MIC_SIMD_DOUBLE_H