2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
5 * Copyright (c) 2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 #ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
38 #define GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H
45 #include <immintrin.h>
47 #include "gromacs/utility/basedefinitions.h"
49 #include "impl_x86_mic_simd_float.h"
56 /* This is an internal helper function used by decr3Hsimd(...).
58 inline void gmx_simdcall
decrHsimd(float* m
, SimdFloat a
)
62 assert(std::size_t(m
) % 32 == 0);
64 t
= _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m
), _MM_UPCONV_PD_NONE
,
65 _MM_BROADCAST_4X8
, _MM_HINT_NONE
));
66 a
= _mm512_add_ps(a
.simdInternal_
, _mm512_permute4f128_ps(a
.simdInternal_
, _MM_PERM_BADC
));
67 t
= _mm512_sub_ps(t
, a
.simdInternal_
);
68 _mm512_mask_packstorelo_ps(m
, _mm512_int2mask(0x00FF), t
);
72 // On MIC it is better to use scatter operations, so we define the load routines
73 // that use a SIMD offset variable first.
76 static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float* base
,
77 SimdFInt32 simdoffset
,
83 assert(std::size_t(base
) % 16 == 0);
84 assert(align
% 4 == 0);
86 // All instructions might be latency ~4 on MIC, so we use shifts where we
87 // only need a single instruction (since the shift parameter is an immediate),
88 // but multiplication otherwise.
91 simdoffset
.simdInternal_
= _mm512_slli_epi32(simdoffset
.simdInternal_
, 2);
95 simdoffset
.simdInternal_
= _mm512_slli_epi32(simdoffset
.simdInternal_
, 3);
99 simdoffset
= simdoffset
* SimdFInt32(align
);
102 v0
->simdInternal_
= _mm512_i32gather_ps(simdoffset
.simdInternal_
, base
, sizeof(float));
103 v1
->simdInternal_
= _mm512_i32gather_ps(simdoffset
.simdInternal_
, base
+ 1, sizeof(float));
104 v2
->simdInternal_
= _mm512_i32gather_ps(simdoffset
.simdInternal_
, base
+ 2, sizeof(float));
105 v3
->simdInternal_
= _mm512_i32gather_ps(simdoffset
.simdInternal_
, base
+ 3, sizeof(float));
109 static inline void gmx_simdcall
110 gatherLoadUBySimdIntTranspose(const float* base
, SimdFInt32 simdoffset
, SimdFloat
* v0
, SimdFloat
* v1
)
112 // All instructions might be latency ~4 on MIC, so we use shifts where we
113 // only need a single instruction (since the shift parameter is an immediate),
114 // but multiplication otherwise.
115 // For align == 2 we can merge the constant into the scale parameter,
116 // which can take constants up to 8 in total.
119 v0
->simdInternal_
= _mm512_i32gather_ps(simdoffset
.simdInternal_
, base
, align
* sizeof(float));
121 _mm512_i32gather_ps(simdoffset
.simdInternal_
, base
+ 1, align
* sizeof(float));
127 simdoffset
.simdInternal_
= _mm512_slli_epi32(simdoffset
.simdInternal_
, 2);
131 simdoffset
.simdInternal_
= _mm512_slli_epi32(simdoffset
.simdInternal_
, 3);
135 simdoffset
= simdoffset
* SimdFInt32(align
);
137 v0
->simdInternal_
= _mm512_i32gather_ps(simdoffset
.simdInternal_
, base
, sizeof(float));
138 v1
->simdInternal_
= _mm512_i32gather_ps(simdoffset
.simdInternal_
, base
+ 1, sizeof(float));
143 static inline void gmx_simdcall
144 gatherLoadBySimdIntTranspose(const float* base
, SimdFInt32 simdoffset
, SimdFloat
* v0
, SimdFloat
* v1
)
146 assert(std::size_t(base
) % 8 == 0);
147 assert(align
% 2 == 0);
148 gatherLoadUBySimdIntTranspose
<align
>(base
, simdoffset
, v0
, v1
);
152 static inline void gmx_simdcall
gatherLoadTranspose(const float* base
,
153 const std::int32_t offset
[],
159 gatherLoadBySimdIntTranspose
<align
>(base
, simdLoad(offset
, SimdFInt32Tag()), v0
, v1
, v2
, v3
);
163 static inline void gmx_simdcall
164 gatherLoadTranspose(const float* base
, const std::int32_t offset
[], SimdFloat
* v0
, SimdFloat
* v1
)
166 gatherLoadBySimdIntTranspose
<align
>(base
, simdLoad(offset
, SimdFInt32Tag()), v0
, v1
);
169 static const int c_simdBestPairAlignmentFloat
= 2;
172 static inline void gmx_simdcall
gatherLoadUTranspose(const float* base
,
173 const std::int32_t offset
[],
178 SimdFInt32 simdoffset
;
180 assert(std::size_t(offset
) % 64 == 0);
182 simdoffset
= simdLoad(offset
, SimdFInt32Tag());
184 // All instructions might be latency ~4 on MIC, so we use shifts where we
185 // only need a single instruction (since the shift parameter is an immediate),
186 // but multiplication otherwise.
189 simdoffset
.simdInternal_
= _mm512_slli_epi32(simdoffset
.simdInternal_
, 2);
193 simdoffset
.simdInternal_
= _mm512_slli_epi32(simdoffset
.simdInternal_
, 3);
197 simdoffset
= simdoffset
* SimdFInt32(align
);
200 v0
->simdInternal_
= _mm512_i32gather_ps(simdoffset
.simdInternal_
, base
, sizeof(float));
201 v1
->simdInternal_
= _mm512_i32gather_ps(simdoffset
.simdInternal_
, base
+ 1, sizeof(float));
202 v2
->simdInternal_
= _mm512_i32gather_ps(simdoffset
.simdInternal_
, base
+ 2, sizeof(float));
207 static inline void gmx_simdcall
208 transposeScatterStoreU(float* base
, const std::int32_t offset
[], SimdFloat v0
, SimdFloat v1
, SimdFloat v2
)
210 SimdFInt32 simdoffset
;
212 assert(std::size_t(offset
) % 64 == 0);
214 simdoffset
= simdLoad(offset
, SimdFInt32Tag());
216 // All instructions might be latency ~4 on MIC, so we use shifts where we
217 // only need a single instruction (since the shift parameter is an immediate),
218 // but multiplication otherwise.
221 simdoffset
.simdInternal_
= _mm512_slli_epi32(simdoffset
.simdInternal_
, 2);
225 simdoffset
.simdInternal_
= _mm512_slli_epi32(simdoffset
.simdInternal_
, 3);
229 simdoffset
= simdoffset
* SimdFInt32(align
);
232 _mm512_i32scatter_ps(base
, simdoffset
.simdInternal_
, v0
.simdInternal_
, sizeof(float));
233 _mm512_i32scatter_ps(base
+ 1, simdoffset
.simdInternal_
, v1
.simdInternal_
, sizeof(float));
234 _mm512_i32scatter_ps(base
+ 2, simdoffset
.simdInternal_
, v2
.simdInternal_
, sizeof(float));
239 static inline void gmx_simdcall
240 transposeScatterIncrU(float* base
, const std::int32_t offset
[], SimdFloat v0
, SimdFloat v1
, SimdFloat v2
)
242 alignas(GMX_SIMD_ALIGNMENT
) float rdata0
[GMX_SIMD_FLOAT_WIDTH
];
243 alignas(GMX_SIMD_ALIGNMENT
) float rdata1
[GMX_SIMD_FLOAT_WIDTH
];
244 alignas(GMX_SIMD_ALIGNMENT
) float rdata2
[GMX_SIMD_FLOAT_WIDTH
];
250 for (int i
= 0; i
< GMX_SIMD_FLOAT_WIDTH
; i
++)
252 base
[align
* offset
[i
] + 0] += rdata0
[i
];
253 base
[align
* offset
[i
] + 1] += rdata1
[i
];
254 base
[align
* offset
[i
] + 2] += rdata2
[i
];
259 static inline void gmx_simdcall
260 transposeScatterDecrU(float* base
, const std::int32_t offset
[], SimdFloat v0
, SimdFloat v1
, SimdFloat v2
)
262 alignas(GMX_SIMD_ALIGNMENT
) float rdata0
[GMX_SIMD_FLOAT_WIDTH
];
263 alignas(GMX_SIMD_ALIGNMENT
) float rdata1
[GMX_SIMD_FLOAT_WIDTH
];
264 alignas(GMX_SIMD_ALIGNMENT
) float rdata2
[GMX_SIMD_FLOAT_WIDTH
];
270 for (int i
= 0; i
< GMX_SIMD_FLOAT_WIDTH
; i
++)
272 base
[align
* offset
[i
] + 0] -= rdata0
[i
];
273 base
[align
* offset
[i
] + 1] -= rdata1
[i
];
274 base
[align
* offset
[i
] + 2] -= rdata2
[i
];
278 static inline void gmx_simdcall
expandScalarsToTriplets(SimdFloat scalar
,
279 SimdFloat
* triplets0
,
280 SimdFloat
* triplets1
,
281 SimdFloat
* triplets2
)
283 triplets0
->simdInternal_
= _mm512_castsi512_ps(
284 _mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0),
285 _mm512_castps_si512(scalar
.simdInternal_
)));
286 triplets1
->simdInternal_
= _mm512_castsi512_ps(_mm512_permutevar_epi32(
287 _mm512_set_epi32(10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 5, 5),
288 _mm512_castps_si512(scalar
.simdInternal_
)));
289 triplets2
->simdInternal_
= _mm512_castsi512_ps(_mm512_permutevar_epi32(
290 _mm512_set_epi32(15, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10),
291 _mm512_castps_si512(scalar
.simdInternal_
)));
295 static inline float gmx_simdcall
reduceIncr4ReturnSum(float* m
, SimdFloat v0
, SimdFloat v1
, SimdFloat v2
, SimdFloat v3
)
298 __m512 t0
, t1
, t2
, t3
;
300 assert(std::size_t(m
) % 16 == 0);
302 t0
= _mm512_add_ps(v0
.simdInternal_
, _mm512_swizzle_ps(v0
.simdInternal_
, _MM_SWIZ_REG_BADC
));
303 t0
= _mm512_mask_add_ps(t0
, _mm512_int2mask(0xCCCC), v2
.simdInternal_
,
304 _mm512_swizzle_ps(v2
.simdInternal_
, _MM_SWIZ_REG_BADC
));
305 t1
= _mm512_add_ps(v1
.simdInternal_
, _mm512_swizzle_ps(v1
.simdInternal_
, _MM_SWIZ_REG_BADC
));
306 t1
= _mm512_mask_add_ps(t1
, _mm512_int2mask(0xCCCC), v3
.simdInternal_
,
307 _mm512_swizzle_ps(v3
.simdInternal_
, _MM_SWIZ_REG_BADC
));
308 t2
= _mm512_add_ps(t0
, _mm512_swizzle_ps(t0
, _MM_SWIZ_REG_CDAB
));
309 t2
= _mm512_mask_add_ps(t2
, _mm512_int2mask(0xAAAA), t1
, _mm512_swizzle_ps(t1
, _MM_SWIZ_REG_CDAB
));
311 t2
= _mm512_add_ps(t2
, _mm512_permute4f128_ps(t2
, _MM_PERM_BADC
));
312 t2
= _mm512_add_ps(t2
, _mm512_permute4f128_ps(t2
, _MM_PERM_CDAB
));
314 t0
= _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m
, _MM_UPCONV_PS_NONE
,
315 _MM_BROADCAST_4X16
, _MM_HINT_NONE
);
316 t0
= _mm512_add_ps(t0
, t2
);
317 _mm512_mask_packstorelo_ps(m
, _mm512_int2mask(0xF), t0
);
319 t2
= _mm512_add_ps(t2
, _mm512_swizzle_ps(t2
, _MM_SWIZ_REG_BADC
));
320 t2
= _mm512_add_ps(t2
, _mm512_swizzle_ps(t2
, _MM_SWIZ_REG_CDAB
));
322 _mm512_mask_packstorelo_ps(&f
, _mm512_mask2int(0x1), t2
);
326 static inline SimdFloat gmx_simdcall
loadDualHsimd(const float* m0
, const float* m1
)
328 assert(std::size_t(m0
) % 32 == 0);
329 assert(std::size_t(m1
) % 32 == 0);
331 return _mm512_castpd_ps(_mm512_mask_extload_pd(
332 _mm512_extload_pd(reinterpret_cast<const double*>(m0
), _MM_UPCONV_PD_NONE
,
333 _MM_BROADCAST_4X8
, _MM_HINT_NONE
),
334 _mm512_int2mask(0xF0), reinterpret_cast<const double*>(m1
), _MM_UPCONV_PD_NONE
,
335 _MM_BROADCAST_4X8
, _MM_HINT_NONE
));
338 static inline SimdFloat gmx_simdcall
loadDuplicateHsimd(const float* m
)
340 assert(std::size_t(m
) % 32 == 0);
342 return _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m
),
343 _MM_UPCONV_PD_NONE
, _MM_BROADCAST_4X8
, _MM_HINT_NONE
));
346 static inline SimdFloat gmx_simdcall
loadU1DualHsimd(const float* m
)
348 return _mm512_mask_extload_ps(
349 _mm512_extload_ps(m
, _MM_UPCONV_PS_NONE
, _MM_BROADCAST_1X16
, _MM_HINT_NONE
),
350 _mm512_int2mask(0xFF00), m
+ 1, _MM_UPCONV_PS_NONE
, _MM_BROADCAST_1X16
, _MM_HINT_NONE
);
354 static inline void gmx_simdcall
storeDualHsimd(float* m0
, float* m1
, SimdFloat a
)
358 assert(std::size_t(m0
) % 32 == 0);
359 assert(std::size_t(m1
) % 32 == 0);
361 _mm512_mask_packstorelo_ps(m0
, _mm512_int2mask(0x00FF), a
.simdInternal_
);
362 _mm512_mask_packstorelo_ps(m1
, _mm512_int2mask(0xFF00), a
.simdInternal_
);
365 static inline void gmx_simdcall
incrDualHsimd(float* m0
, float* m1
, SimdFloat a
)
367 assert(std::size_t(m0
) % 32 == 0);
368 assert(std::size_t(m1
) % 32 == 0);
373 x
= _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m0
), _MM_UPCONV_PD_NONE
,
374 _MM_BROADCAST_4X8
, _MM_HINT_NONE
));
375 x
= _mm512_add_ps(x
, a
.simdInternal_
);
376 _mm512_mask_packstorelo_ps(m0
, _mm512_int2mask(0x00FF), x
);
379 x
= _mm512_castpd_ps(_mm512_extload_pd(reinterpret_cast<const double*>(m1
), _MM_UPCONV_PD_NONE
,
380 _MM_BROADCAST_4X8
, _MM_HINT_NONE
));
381 x
= _mm512_add_ps(x
, a
.simdInternal_
);
382 _mm512_mask_packstorelo_ps(m1
, _mm512_int2mask(0xFF00), x
);
385 static inline void gmx_simdcall
decr3Hsimd(float* m
, SimdFloat a0
, SimdFloat a1
, SimdFloat a2
)
387 assert(std::size_t(m
) % 32 == 0);
389 decrHsimd(m
+ GMX_SIMD_FLOAT_WIDTH
/ 2, a1
);
390 decrHsimd(m
+ GMX_SIMD_FLOAT_WIDTH
, a2
);
395 static inline void gmx_simdcall
gatherLoadTransposeHsimd(const float* base0
,
397 const std::int32_t offset
[],
401 __m512i idx0
, idx1
, idx
;
404 assert(std::size_t(offset
) % 32 == 0);
405 assert(std::size_t(base0
) % 8 == 0);
406 assert(std::size_t(base1
) % 8 == 0);
407 assert(std::size_t(align
) % 2 == 0);
409 idx0
= _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), offset
);
411 idx0
= _mm512_mullo_epi32(idx0
, _mm512_set1_epi32(align
));
412 idx1
= _mm512_add_epi32(idx0
, _mm512_set1_epi32(1));
414 idx
= _mm512_mask_permute4f128_epi32(idx0
, _mm512_int2mask(0xFF00), idx1
, _MM_PERM_BABA
);
416 tmp1
= _mm512_i32gather_ps(idx
, base0
, sizeof(float));
417 tmp2
= _mm512_i32gather_ps(idx
, base1
, sizeof(float));
419 v0
->simdInternal_
= _mm512_mask_permute4f128_ps(tmp1
, _mm512_int2mask(0xFF00), tmp2
, _MM_PERM_BABA
);
420 v1
->simdInternal_
= _mm512_mask_permute4f128_ps(tmp2
, _mm512_int2mask(0x00FF), tmp1
, _MM_PERM_DCDC
);
423 static inline float gmx_simdcall
reduceIncr4ReturnSumHsimd(float* m
, SimdFloat v0
, SimdFloat v1
)
428 assert(std::size_t(m
) % 32 == 0);
430 t0
= _mm512_add_ps(v0
.simdInternal_
, _mm512_swizzle_ps(v0
.simdInternal_
, _MM_SWIZ_REG_BADC
));
431 t0
= _mm512_mask_add_ps(t0
, _mm512_int2mask(0xCCCC), v1
.simdInternal_
,
432 _mm512_swizzle_ps(v1
.simdInternal_
, _MM_SWIZ_REG_BADC
));
433 t0
= _mm512_add_ps(t0
, _mm512_swizzle_ps(t0
, _MM_SWIZ_REG_CDAB
));
434 t0
= _mm512_add_ps(t0
, _mm512_castpd_ps(_mm512_swizzle_pd(_mm512_castps_pd(t0
), _MM_SWIZ_REG_BADC
)));
435 t0
= _mm512_mask_permute4f128_ps(t0
, _mm512_int2mask(0xAAAA), t0
, _MM_PERM_BADC
);
436 t1
= _mm512_mask_extload_ps(_mm512_undefined_ps(), _mm512_int2mask(0xF), m
, _MM_UPCONV_PS_NONE
,
437 _MM_BROADCAST_4X16
, _MM_HINT_NONE
);
438 t1
= _mm512_add_ps(t1
, t0
);
439 _mm512_mask_packstorelo_ps(m
, _mm512_int2mask(0xF), t1
);
441 t0
= _mm512_add_ps(t0
, _mm512_swizzle_ps(t0
, _MM_SWIZ_REG_BADC
));
442 t0
= _mm512_add_ps(t0
, _mm512_swizzle_ps(t0
, _MM_SWIZ_REG_CDAB
));
444 _mm512_mask_packstorelo_ps(&f
, _mm512_mask2int(0x1), t0
);
450 #endif // GMX_SIMD_IMPL_X86_MIC_UTIL_FLOAT_H