2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
37 #define GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H
43 #include <immintrin.h>
45 #include "gromacs/utility/basedefinitions.h"
47 #include "impl_x86_mic_simd_float.h"
57 Simd4Float(float f
) : simdInternal_(_mm512_set1_ps(f
)) {}
59 // Internal utility constructor to simplify return statements
60 Simd4Float(__m512 simd
) : simdInternal_(simd
) {}
70 // Internal utility constructor to simplify return statements
71 Simd4FBool(__mmask16 simd
) : simdInternal_(simd
) {}
// Raw 16-bit mask register value backing the boolean.
// Note that anyTrue() only inspects the low four bits (0xF).
__mmask16 simdInternal_;
76 static inline Simd4Float gmx_simdcall
load4(const float* m
)
78 assert(size_t(m
) % 16 == 0);
79 return { _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m
,
80 _MM_UPCONV_PS_NONE
, _MM_BROADCAST_4X16
, _MM_HINT_NONE
) };
83 static inline void gmx_simdcall
store4(float* m
, Simd4Float a
)
85 assert(size_t(m
) % 16 == 0);
86 _mm512_mask_packstorelo_ps(m
, _mm512_int2mask(0xF), a
.simdInternal_
);
89 static inline Simd4Float gmx_simdcall
load4U(const float* m
)
91 return { _mm512_mask_loadunpackhi_ps(
92 _mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m
),
93 _mm512_int2mask(0xF), m
+ 16) };
96 static inline void gmx_simdcall
store4U(float* m
, Simd4Float a
)
98 _mm512_mask_packstorelo_ps(m
, _mm512_int2mask(0xF), a
.simdInternal_
);
99 _mm512_mask_packstorehi_ps(m
+ 16, _mm512_int2mask(0xF), a
.simdInternal_
);
102 static inline Simd4Float gmx_simdcall
simd4SetZeroF()
104 return { _mm512_setzero_ps() };
107 static inline Simd4Float gmx_simdcall
operator&(Simd4Float a
, Simd4Float b
)
109 return { _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
110 _mm512_castps_si512(a
.simdInternal_
),
111 _mm512_castps_si512(b
.simdInternal_
))) };
114 static inline Simd4Float gmx_simdcall
andNot(Simd4Float a
, Simd4Float b
)
116 return { _mm512_castsi512_ps(_mm512_mask_andnot_epi32(
117 _mm512_undefined_epi32(), _mm512_int2mask(0xF), _mm512_castps_si512(a
.simdInternal_
),
118 _mm512_castps_si512(b
.simdInternal_
))) };
121 static inline Simd4Float gmx_simdcall
operator|(Simd4Float a
, Simd4Float b
)
123 return { _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
124 _mm512_castps_si512(a
.simdInternal_
),
125 _mm512_castps_si512(b
.simdInternal_
))) };
128 static inline Simd4Float gmx_simdcall
operator^(Simd4Float a
, Simd4Float b
)
130 return { _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), _mm512_int2mask(0xF),
131 _mm512_castps_si512(a
.simdInternal_
),
132 _mm512_castps_si512(b
.simdInternal_
))) };
135 static inline Simd4Float gmx_simdcall
operator+(Simd4Float a
, Simd4Float b
)
137 return { _mm512_mask_add_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a
.simdInternal_
,
141 static inline Simd4Float gmx_simdcall
operator-(Simd4Float a
, Simd4Float b
)
143 return { _mm512_mask_sub_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a
.simdInternal_
,
147 static inline Simd4Float gmx_simdcall
operator-(Simd4Float x
)
149 return { _mm512_mask_addn_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), x
.simdInternal_
,
150 _mm512_setzero_ps()) };
153 static inline Simd4Float gmx_simdcall
operator*(Simd4Float a
, Simd4Float b
)
155 return { _mm512_mask_mul_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a
.simdInternal_
,
159 static inline Simd4Float gmx_simdcall
fma(Simd4Float a
, Simd4Float b
, Simd4Float c
)
161 return { _mm512_mask_fmadd_ps(a
.simdInternal_
, _mm512_int2mask(0xF), b
.simdInternal_
, c
.simdInternal_
) };
164 static inline Simd4Float gmx_simdcall
fms(Simd4Float a
, Simd4Float b
, Simd4Float c
)
166 return { _mm512_mask_fmsub_ps(a
.simdInternal_
, _mm512_int2mask(0xF), b
.simdInternal_
, c
.simdInternal_
) };
169 static inline Simd4Float gmx_simdcall
fnma(Simd4Float a
, Simd4Float b
, Simd4Float c
)
171 return { _mm512_mask_fnmadd_ps(a
.simdInternal_
, _mm512_int2mask(0xF), b
.simdInternal_
, c
.simdInternal_
) };
174 static inline Simd4Float gmx_simdcall
fnms(Simd4Float a
, Simd4Float b
, Simd4Float c
)
176 return { _mm512_mask_fnmsub_ps(a
.simdInternal_
, _mm512_int2mask(0xF), b
.simdInternal_
, c
.simdInternal_
) };
179 static inline Simd4Float gmx_simdcall
rsqrt(Simd4Float x
)
181 return { _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), x
.simdInternal_
) };
184 static inline Simd4Float gmx_simdcall
abs(Simd4Float x
)
186 return { _mm512_castsi512_ps(_mm512_mask_andnot_epi32(
187 _mm512_undefined_epi32(), _mm512_int2mask(0xF),
188 _mm512_castps_si512(_mm512_set1_ps(GMX_FLOAT_NEGZERO
)), _mm512_castps_si512(x
.simdInternal_
))) };
191 static inline Simd4Float gmx_simdcall
max(Simd4Float a
, Simd4Float b
)
193 return { _mm512_mask_gmax_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a
.simdInternal_
,
197 static inline Simd4Float gmx_simdcall
min(Simd4Float a
, Simd4Float b
)
199 return { _mm512_mask_gmin_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), a
.simdInternal_
,
203 static inline Simd4Float gmx_simdcall
round(Simd4Float x
)
205 return { _mm512_mask_round_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), x
.simdInternal_
,
206 _MM_FROUND_TO_NEAREST_INT
, _MM_EXPADJ_NONE
) };
209 static inline Simd4Float gmx_simdcall
trunc(Simd4Float x
)
211 return { _mm512_mask_round_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), x
.simdInternal_
,
212 _MM_FROUND_TO_ZERO
, _MM_EXPADJ_NONE
) };
215 static inline float gmx_simdcall
dotProduct(Simd4Float a
, Simd4Float b
)
217 __m512 x
= _mm512_mask_mul_ps(_mm512_setzero_ps(), _mm512_int2mask(0x7), a
.simdInternal_
,
219 x
= _mm512_add_ps(x
, _mm512_swizzle_ps(x
, _MM_SWIZ_REG_BADC
));
220 x
= _mm512_add_ps(x
, _mm512_swizzle_ps(x
, _MM_SWIZ_REG_CDAB
));
222 _mm512_mask_packstorelo_ps(&f
, _mm512_mask2int(0x1), x
);
226 static inline void gmx_simdcall
transpose(Simd4Float
* v0
, Simd4Float
* v1
, Simd4Float
* v2
, Simd4Float
* v3
)
228 v0
->simdInternal_
= _mm512_mask_permute4f128_ps(v0
->simdInternal_
, _mm512_int2mask(0x00F0),
229 v1
->simdInternal_
, _MM_PERM_AAAA
);
230 v2
->simdInternal_
= _mm512_mask_permute4f128_ps(v2
->simdInternal_
, _mm512_int2mask(0x00F0),
231 v3
->simdInternal_
, _MM_PERM_AAAA
);
232 v0
->simdInternal_
= _mm512_mask_permute4f128_ps(v0
->simdInternal_
, _mm512_int2mask(0xFF00),
233 v2
->simdInternal_
, _MM_PERM_BABA
);
234 v0
->simdInternal_
= _mm512_castsi512_ps(_mm512_permutevar_epi32(
235 _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0),
236 _mm512_castps_si512(v0
->simdInternal_
)));
237 v1
->simdInternal_
= _mm512_mask_permute4f128_ps(_mm512_setzero_ps(), _mm512_int2mask(0x000F),
238 v0
->simdInternal_
, _MM_PERM_BBBB
);
239 v2
->simdInternal_
= _mm512_mask_permute4f128_ps(_mm512_setzero_ps(), _mm512_int2mask(0x000F),
240 v0
->simdInternal_
, _MM_PERM_CCCC
);
241 v3
->simdInternal_
= _mm512_mask_permute4f128_ps(_mm512_setzero_ps(), _mm512_int2mask(0x000F),
242 v0
->simdInternal_
, _MM_PERM_DDDD
);
// Picky, picky, picky:
// icc-16 complains about "Illegal value of immediate argument to intrinsic"
// unless the comparison predicates are chosen as follows:
// 1) Ordered-quiet for ==
// 2) Unordered-quiet for !=
// 3) Ordered-signaling for < and <=
252 static inline Simd4FBool gmx_simdcall
operator==(Simd4Float a
, Simd4Float b
)
254 return { _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a
.simdInternal_
, b
.simdInternal_
, _CMP_EQ_OQ
) };
257 static inline Simd4FBool gmx_simdcall
operator!=(Simd4Float a
, Simd4Float b
)
259 return { _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a
.simdInternal_
, b
.simdInternal_
, _CMP_NEQ_UQ
) };
262 static inline Simd4FBool gmx_simdcall
operator<(Simd4Float a
, Simd4Float b
)
264 return { _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a
.simdInternal_
, b
.simdInternal_
, _CMP_LT_OS
) };
267 static inline Simd4FBool gmx_simdcall
operator<=(Simd4Float a
, Simd4Float b
)
269 return { _mm512_mask_cmp_ps_mask(_mm512_int2mask(0xF), a
.simdInternal_
, b
.simdInternal_
, _CMP_LE_OS
) };
272 static inline Simd4FBool gmx_simdcall
operator&&(Simd4FBool a
, Simd4FBool b
)
274 return { _mm512_kand(a
.simdInternal_
, b
.simdInternal_
) };
277 static inline Simd4FBool gmx_simdcall
operator||(Simd4FBool a
, Simd4FBool b
)
279 return { _mm512_kor(a
.simdInternal_
, b
.simdInternal_
) };
282 static inline bool gmx_simdcall
anyTrue(Simd4FBool a
)
284 return (_mm512_mask2int(a
.simdInternal_
) & 0xF) != 0;
287 static inline Simd4Float gmx_simdcall
selectByMask(Simd4Float a
, Simd4FBool m
)
289 return { _mm512_mask_mov_ps(_mm512_setzero_ps(), m
.simdInternal_
, a
.simdInternal_
) };
292 static inline Simd4Float gmx_simdcall
selectByNotMask(Simd4Float a
, Simd4FBool m
)
294 return { _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(m
.simdInternal_
), a
.simdInternal_
) };
297 static inline Simd4Float gmx_simdcall
blend(Simd4Float a
, Simd4Float b
, Simd4FBool sel
)
299 return { _mm512_mask_blend_ps(sel
.simdInternal_
, a
.simdInternal_
, b
.simdInternal_
) };
302 static inline float gmx_simdcall
reduce(Simd4Float a
)
304 __m512 x
= a
.simdInternal_
;
305 x
= _mm512_add_ps(x
, _mm512_swizzle_ps(x
, _MM_SWIZ_REG_BADC
));
306 x
= _mm512_add_ps(x
, _mm512_swizzle_ps(x
, _MM_SWIZ_REG_CDAB
));
308 _mm512_mask_packstorelo_ps(&f
, _mm512_mask2int(0x1), x
);
314 #endif // GMX_SIMD_IMPL_X86_MIC_SIMD4_FLOAT_H