2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_INTEL_MIC_SIMD4_FLOAT_H
37 #define GMX_SIMD_IMPL_INTEL_MIC_SIMD4_FLOAT_H
44 #include <immintrin.h>
46 #include "impl_intel_mic_common.h"
47 #include "impl_intel_mic_simd_float.h"
49 /****************************************************
50 * SINGLE PRECISION SIMD4 IMPLEMENTATION *
51 ****************************************************/
/* Load and store are guaranteed to only access the 4 floats. All arithmetic operations
   only operate on the 4 elements (to avoid floating-point exceptions). But other operations
   are not guaranteed to leave the other 12 elements unmodified. E.g. setzero or blendzero
   set the upper 12 to zero. */
/* A SIMD4 float value is carried in a full 512-bit register. */
#define Simd4Float     __m512

/* Write-mask with bits 0-3 set, i.e. selecting the four lowest lanes. */
#define simd4Mask      _mm512_int2mask(0xF)

/* Masked loads through simd4Mask. The first intrinsic argument is
 * _mm512_undefined_ps(), so no particular value is requested for the
 * remaining lanes. simd4LoadF uses the 4X16 broadcast mode and simd4Load1F
 * the 1X16 broadcast mode of the extload intrinsic. */
#define simd4LoadF(m) \
    _mm512_mask_extload_ps(_mm512_undefined_ps(), simd4Mask, (m), \
                           _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE)
#define simd4Load1F(m) \
    _mm512_mask_extload_ps(_mm512_undefined_ps(), simd4Mask, (m), \
                           _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)

/* Plain aliases: these act on the whole register, not only the low lanes. */
#define simd4Set1F     _mm512_set1_ps
#define simd4SetZeroF  _mm512_setzero_ps

/* Aligned store and unaligned load/store forward to the *_mic helper
 * functions of this header. */
#define simd4StoreF    simd4StoreF_mic
#define simd4LoadUF    simd4LoadUF_mic
#define simd4StoreUF   simd4StoreUF_mic
/* Element-wise arithmetic, applied under simd4Mask (four lowest lanes). */
#define simd4AddF(a, b) \
    _mm512_mask_add_ps(_mm512_undefined_ps(), simd4Mask, (a), (b))
#define simd4SubF(a, b) \
    _mm512_mask_sub_ps(_mm512_undefined_ps(), simd4Mask, (a), (b))
#define simd4MulF(a, b) \
    _mm512_mask_mul_ps(_mm512_undefined_ps(), simd4Mask, (a), (b))

/* Fused multiply-add family. Operand a occupies the first slot of the masked
 * intrinsic, so no separate undefined register is passed here. */
#define simd4FmaddF(a, b, c) \
    _mm512_mask_fmadd_ps((a), simd4Mask, (b), (c))
#define simd4FmsubF(a, b, c) \
    _mm512_mask_fmsub_ps((a), simd4Mask, (b), (c))
#define simd4FnmaddF(a, b, c) \
    _mm512_mask_fnmadd_ps((a), simd4Mask, (b), (c))
#define simd4FnmsubF(a, b, c) \
    _mm512_mask_fnmsub_ps((a), simd4Mask, (b), (c))

/* Bitwise logic: operands are cast to the integer vector type, combined with
 * the masked epi32 intrinsic, and cast back to float. */
#define simd4AndF(a, b) \
    _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_undefined_epi32(), simd4Mask, \
                                              _mm512_castps_si512(a), \
                                              _mm512_castps_si512(b)))
#define simd4AndNotF(a, b) \
    _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), simd4Mask, \
                                                 _mm512_castps_si512(a), \
                                                 _mm512_castps_si512(b)))
#define simd4OrF(a, b) \
    _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(), simd4Mask, \
                                             _mm512_castps_si512(a), \
                                             _mm512_castps_si512(b)))
#define simd4XorF(a, b) \
    _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), simd4Mask, \
                                              _mm512_castps_si512(a), \
                                              _mm512_castps_si512(b)))

/* Reciprocal square root through the rsqrt23 intrinsic. */
#define simd4RsqrtF(a) \
    _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), simd4Mask, (a))

/* Absolute value: and-not of x with a register filled with GMX_FLOAT_NEGZERO
 * (presumably the sign-bit pattern, defined outside this header). */
#define simd4AbsF(x) \
    simd4AndNotF(_mm512_set1_ps(GMX_FLOAT_NEGZERO), x)

/* Negation is expressed with the addn intrinsic and a zero second operand. */
#define simd4NegF(x) \
    _mm512_mask_addn_ps(_mm512_undefined_ps(), simd4Mask, (x), _mm512_setzero_ps())

#define simd4MaxF(a, b) \
    _mm512_mask_gmax_ps(_mm512_undefined_ps(), simd4Mask, (a), (b))
#define simd4MinF(a, b) \
    _mm512_mask_gmin_ps(_mm512_undefined_ps(), simd4Mask, (a), (b))

/* Rounding to nearest integer and truncation toward zero, selected by the
 * _MM_FROUND_* argument. */
#define simd4RoundF(x) \
    _mm512_mask_round_ps(_mm512_undefined_ps(), simd4Mask, (x), \
                         _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
#define simd4TruncF(x) \
    _mm512_mask_round_ps(_mm512_undefined_ps(), simd4Mask, (x), \
                         _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)

/* Dot product. Note the mask value is 7, not 0xF: only lanes 0-2 are
 * multiplied and summed, the fourth lane does not contribute. */
#define simd4DotProductF(a, b) \
    _mm512_mask_reduce_add_ps(_mm512_int2mask(7), \
                              _mm512_mask_mul_ps(_mm512_undefined_ps(), \
                                                 _mm512_int2mask(7), (a), (b)))
/* Boolean results are 16-bit lane masks. */
#define Simd4FBool  __mmask16

/* Comparisons are evaluated under simd4Mask, so only bits 0-3 of the
 * resulting mask can be set. */
#define simd4CmpEqF(a, b) \
    _mm512_mask_cmp_ps_mask(simd4Mask, (a), (b), _CMP_EQ_OQ)
#define simd4CmpLtF(a, b) \
    _mm512_mask_cmp_ps_mask(simd4Mask, (a), (b), _CMP_LT_OS)
#define simd4CmpLeF(a, b) \
    _mm512_mask_cmp_ps_mask(simd4Mask, (a), (b), _CMP_LE_OS)

/* Logic on boolean masks maps directly onto the mask-register intrinsics. */
#define simd4AndFB  _mm512_kand
#define simd4OrFB   _mm512_kor

/* Non-zero when any of the four low mask bits is set. */
#define simd4AnyTrueFB(x) \
    (_mm512_mask2int(x) & 0xF)

/* Select a where sel (respectively the complement of sel) is set; a zeroed
 * register is supplied as the source for all other lanes. */
#define simd4MaskF(a, sel) \
    _mm512_mask_mov_ps(_mm512_setzero_ps(), (sel), (a))
#define simd4MaskNotF(a, sel) \
    _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(sel), (a))

/* Per-lane choice between a and b, controlled by sel. */
#define simd4BlendF(a, b, sel) \
    _mm512_mask_blend_ps((sel), (a), (b))

/* Horizontal sum over the four low lanes. */
#define simd4ReduceF(x) \
    _mm512_mask_reduce_add_ps(simd4Mask, (x))
96 /* Implementation helpers */
98 /* load store simd4 */
99 static inline void gmx_simdcall
100 simd4StoreF_mic(float * m
, __m512 s
)
102 assert((size_t)m
%16 == 0);
103 _mm512_mask_packstorelo_ps(m
, simd4Mask
, s
);
106 static inline __m512 gmx_simdcall
107 simd4LoadUF_mic(const float * m
)
109 return _mm512_mask_loadunpackhi_ps(_mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), simd4Mask
, m
), simd4Mask
, m
+16);
112 static inline void gmx_simdcall
113 simd4StoreUF_mic(float * m
, __m512 s
)
115 _mm512_mask_packstorelo_ps(m
, simd4Mask
, s
);
116 _mm512_mask_packstorehi_ps(m
+16, simd4Mask
, s
);
119 #endif /* GMX_SIMD_IMPL_INTEL_MIC_SIMD4_FLOAT_H */