/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H
#define GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H

#include "config.h"

#include <cassert>
#include <cstdint>

#include <immintrin.h>

#include "gromacs/utility/basedefinitions.h"

#include "impl_x86_mic_simd_double.h"

namespace gmx
{
/* This is an internal helper function used by decr3Hsimd(...).
 */
inline void gmx_simdcall decrHsimd(double* m, SimdDouble a)
{
    __m512d t;

    assert(std::size_t(m) % 32 == 0);

    t               = _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    a.simdInternal_ = _mm512_add_pd(a.simdInternal_,
                                    _mm512_castps_pd(_mm512_permute4f128_ps(
                                            _mm512_castpd_ps(a.simdInternal_), _MM_PERM_BADC)));
    t = _mm512_sub_pd(t, a.simdInternal_);
    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0x0F), t);
}
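// Scalar sketch of decrHsimd (illustrative, assuming GMX_SIMD_DOUBLE_WIDTH == 8):
// the two 4-double halves of a are folded together and subtracted from four
// consecutive doubles at m, i.e.
//
//     for (int i = 0; i < 4; i++)
//     {
//         m[i] -= a[i] + a[i + 4];
//     }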
// On MIC it is better to use scatter operations, so we define the load routines
// that use a SIMD offset variable first.

template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
                                                             SimdDInt32    simdoffset,
                                                             SimdDouble*   v0,
                                                             SimdDouble*   v1,
                                                             SimdDouble*   v2,
                                                             SimdDouble*   v3)
{
    assert(std::size_t(base) % 32 == 0);
    assert(align % 4 == 0);

    // All instructions might be latency ~4 on MIC, so we use shifts where we
    // only need a single instruction (since the shift parameter is an immediate),
    // but multiplication otherwise.
    if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base, sizeof(double));
    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 1, sizeof(double));
    v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 2, sizeof(double));
    v3->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 3, sizeof(double));
}
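// Hypothetical usage sketch: transpose-loading x/y/z/q for eight particles
// from a caller-provided, 32-byte-aligned xyzq array with a SIMD vector of
// particle indices (align == 4 takes the single-shift path above):
//
//     SimdDouble x, y, z, q;
//     gatherLoadBySimdIntTranspose<4>(xyzq, particleIndices, &x, &y, &z, &q);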
template<int align>
static inline void gmx_simdcall gatherLoadUBySimdIntTranspose(const double* base,
                                                              SimdDInt32    simdoffset,
                                                              SimdDouble*   v0,
                                                              SimdDouble*   v1)
{
    // All instructions might be latency ~4 on MIC, so we use shifts where we
    // only need a single instruction (since the shift parameter is an immediate),
    // but multiplication otherwise.
    if (align == 2)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 1);
    }
    else if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base, sizeof(double));
    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 1, sizeof(double));
}
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double* base, SimdDInt32 simdoffset, SimdDouble* v0, SimdDouble* v1)
{
    assert(std::size_t(base) % 16 == 0);
    assert(align % 2 == 0);
    gatherLoadUBySimdIntTranspose<align>(base, simdoffset, v0, v1);
}
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const double*      base,
                                                    const std::int32_t offset[],
                                                    SimdDouble*        v0,
                                                    SimdDouble*        v1,
                                                    SimdDouble*        v2,
                                                    SimdDouble*        v3)
{
    gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdDInt32Tag()), v0, v1, v2, v3);
}
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double* base, const std::int32_t offset[], SimdDouble* v0, SimdDouble* v1)
{
    gatherLoadBySimdIntTranspose<align>(base, simdLoad(offset, SimdDInt32Tag()), v0, v1);
}
static const int c_simdBestPairAlignmentDouble = 2;
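// A note on intended use (my reading of the generic SIMD layer): this constant
// advertises the data stride for which the pair version of gatherLoadTranspose()
// above is most efficient, so a caller would write
//
//     gatherLoadTranspose<c_simdBestPairAlignmentDouble>(base, offset, &v0, &v1);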
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const double*      base,
                                                     const std::int32_t offset[],
                                                     SimdDouble*        v0,
                                                     SimdDouble*        v1,
                                                     SimdDouble*        v2)
{
    SimdDInt32 simdoffset;

    assert(std::size_t(offset) % 32 == 0);

    simdoffset = simdLoad(offset, SimdDInt32Tag());

    // All instructions might be latency ~4 on MIC, so we use shifts where we
    // only need a single instruction (since the shift parameter is an immediate),
    // but multiplication otherwise.
    if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    v0->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base, sizeof(double));
    v1->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 1, sizeof(double));
    v2->simdInternal_ = _mm512_i32logather_pd(simdoffset.simdInternal_, base + 2, sizeof(double));
}
template<int align>
static inline void gmx_simdcall transposeScatterStoreU(double*            base,
                                                       const std::int32_t offset[],
                                                       SimdDouble         v0,
                                                       SimdDouble         v1,
                                                       SimdDouble         v2)
{
    SimdDInt32 simdoffset;

    assert(std::size_t(offset) % 32 == 0);

    simdoffset = simdLoad(offset, SimdDInt32Tag());

    // All instructions might be latency ~4 on MIC, so we use shifts where we
    // only need a single instruction (since the shift parameter is an immediate),
    // but multiplication otherwise.
    if (align == 4)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 2);
    }
    else if (align == 8)
    {
        simdoffset.simdInternal_ = _mm512_slli_epi32(simdoffset.simdInternal_, 3);
    }
    else
    {
        simdoffset = simdoffset * SimdDInt32(align);
    }

    _mm512_i32loscatter_pd(base, simdoffset.simdInternal_, v0.simdInternal_, sizeof(double));
    _mm512_i32loscatter_pd(base + 1, simdoffset.simdInternal_, v1.simdInternal_, sizeof(double));
    _mm512_i32loscatter_pd(base + 2, simdoffset.simdInternal_, v2.simdInternal_, sizeof(double));
}
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    alignas(GMX_SIMD_ALIGNMENT) double rdata0[GMX_SIMD_DOUBLE_WIDTH];
    alignas(GMX_SIMD_ALIGNMENT) double rdata1[GMX_SIMD_DOUBLE_WIDTH];
    alignas(GMX_SIMD_ALIGNMENT) double rdata2[GMX_SIMD_DOUBLE_WIDTH];

    store(rdata0, v0);
    store(rdata1, v1);
    store(rdata2, v2);

    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
    {
        base[align * offset[i] + 0] += rdata0[i];
        base[align * offset[i] + 1] += rdata1[i];
        base[align * offset[i] + 2] += rdata2[i];
    }
}
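// Because the increments above go through plain scalar memory updates rather
// than a SIMD scatter, repeated indices in offset[] accumulate correctly. A
// hypothetical caller accumulating 3D forces with stride 3 would look like
//
//     transposeScatterIncrU<3>(forceBuffer, indices, fx, fy, fz);
//
// with forceBuffer and indices provided by the caller.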
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    alignas(GMX_SIMD_ALIGNMENT) double rdata0[GMX_SIMD_DOUBLE_WIDTH];
    alignas(GMX_SIMD_ALIGNMENT) double rdata1[GMX_SIMD_DOUBLE_WIDTH];
    alignas(GMX_SIMD_ALIGNMENT) double rdata2[GMX_SIMD_DOUBLE_WIDTH];

    store(rdata0, v0);
    store(rdata1, v1);
    store(rdata2, v2);

    for (int i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
    {
        base[align * offset[i] + 0] -= rdata0[i];
        base[align * offset[i] + 1] -= rdata1[i];
        base[align * offset[i] + 2] -= rdata2[i];
    }
}
static inline void gmx_simdcall expandScalarsToTriplets(SimdDouble  scalar,
                                                        SimdDouble* triplets0,
                                                        SimdDouble* triplets1,
                                                        SimdDouble* triplets2)
{
    triplets0->simdInternal_ = _mm512_castsi512_pd(
            _mm512_permutevar_epi32(_mm512_set_epi32(5, 4, 5, 4, 3, 2, 3, 2, 3, 2, 1, 0, 1, 0, 1, 0),
                                    _mm512_castpd_si512(scalar.simdInternal_)));
    triplets1->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(
            _mm512_set_epi32(11, 10, 9, 8, 9, 8, 9, 8, 7, 6, 7, 6, 7, 6, 5, 4),
            _mm512_castpd_si512(scalar.simdInternal_)));
    triplets2->simdInternal_ = _mm512_castsi512_pd(_mm512_permutevar_epi32(
            _mm512_set_epi32(15, 14, 15, 14, 15, 14, 13, 12, 13, 12, 13, 12, 11, 10, 11, 10),
            _mm512_castpd_si512(scalar.simdInternal_)));
}
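// Worked example of the three permutations above (each double spans two 32-bit
// lanes; indices read low lane to high). For scalar = { s0, ..., s7 }:
//
//     triplets0 = { s0, s0, s0, s1, s1, s1, s2, s2 }
//     triplets1 = { s2, s3, s3, s3, s4, s4, s4, s5 }
//     triplets2 = { s5, s5, s6, s6, s6, s7, s7, s7 }
//
// so the three outputs hold each input element repeated three times in order,
// matching an x/y/z triplet layout.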
static inline double gmx_simdcall
reduceIncr4ReturnSum(double* m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3)
{
    double  d;
    __m512d t0, t1, t2, t3;

    assert(std::size_t(m) % 32 == 0);

    t0 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v0.simdInternal_, v2.simdInternal_),
                           _MM_SWIZ_REG_BADC);
    t2 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v2.simdInternal_, v0.simdInternal_);
    t1 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0x33), v1.simdInternal_, v3.simdInternal_),
                           _MM_SWIZ_REG_BADC);
    t3 = _mm512_mask_blend_pd(_mm512_int2mask(0x33), v3.simdInternal_, v1.simdInternal_);
    t0 = _mm512_add_pd(t0, t2);
    t1 = _mm512_add_pd(t1, t3);

    t2 = _mm512_swizzle_pd(_mm512_mask_blend_pd(_mm512_int2mask(0b01010101), t0, t1), _MM_SWIZ_REG_CDAB);
    t3 = _mm512_mask_blend_pd(_mm512_int2mask(0b01010101), t1, t0);
    t2 = _mm512_add_pd(t2, t3);

    t2 = _mm512_add_pd(t2, _mm512_castps_pd(_mm512_permute4f128_ps(_mm512_castpd_ps(t2), _MM_PERM_BADC)));

    t0 = _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE,
                                _MM_BROADCAST_4X8, _MM_HINT_NONE);
    t0 = _mm512_add_pd(t0, t2);
    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t0);

    t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_BADC));
    t2 = _mm512_add_pd(t2, _mm512_swizzle_pd(t2, _MM_SWIZ_REG_CDAB));

    _mm512_mask_packstorelo_pd(&d, _mm512_int2mask(0x01), t2);
    return d;
}
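// Scalar equivalent of this reduction (illustrative, with reduce(v) denoting
// the sum of the eight elements of v):
//
//     m[0] += reduce(v0);  m[1] += reduce(v1);
//     m[2] += reduce(v2);  m[3] += reduce(v3);
//     return reduce(v0) + reduce(v1) + reduce(v2) + reduce(v3);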
static inline SimdDouble gmx_simdcall loadDualHsimd(const double* m0, const double* m1)
{
    assert(std::size_t(m0) % 32 == 0);
    assert(std::size_t(m1) % 32 == 0);

    return _mm512_mask_extload_pd(
            _mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE),
            _mm512_int2mask(0xF0), m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
}
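// Illustrative semantics: the returned vector holds m0[0..3] in its lower four
// doubles and m1[0..3] in its upper four, e.g. (hypothetical pointers)
//
//     SimdDouble ab = loadDualHsimd(forceA, forceB);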
static inline SimdDouble gmx_simdcall loadDuplicateHsimd(const double* m)
{
    assert(std::size_t(m) % 32 == 0);

    return _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
}
static inline SimdDouble gmx_simdcall loadU1DualHsimd(const double* m)
{
    return _mm512_mask_extload_pd(
            _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE),
            _mm512_int2mask(0xF0), m + 1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
}
static inline void gmx_simdcall storeDualHsimd(double* m0, double* m1, SimdDouble a)
{
    assert(std::size_t(m0) % 32 == 0);
    assert(std::size_t(m1) % 32 == 0);

    _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), a.simdInternal_);
    _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), a.simdInternal_);
}
static inline void gmx_simdcall incrDualHsimd(double* m0, double* m1, SimdDouble a)
{
    assert(std::size_t(m0) % 32 == 0);
    assert(std::size_t(m1) % 32 == 0);

    __m512d x;

    // Update the lower half from m0
    x = _mm512_extload_pd(m0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    x = _mm512_add_pd(x, a.simdInternal_);
    _mm512_mask_packstorelo_pd(m0, _mm512_int2mask(0x0F), x);

    // Update the upper half from m1
    x = _mm512_extload_pd(m1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_4X8, _MM_HINT_NONE);
    x = _mm512_add_pd(x, a.simdInternal_);
    _mm512_mask_packstorelo_pd(m1, _mm512_int2mask(0xF0), x);
}
static inline void gmx_simdcall decr3Hsimd(double* m, SimdDouble a0, SimdDouble a1, SimdDouble a2)
{
    assert(std::size_t(m) % 32 == 0);

    decrHsimd(m, a0);
    decrHsimd(m + GMX_SIMD_DOUBLE_WIDTH / 2, a1);
    decrHsimd(m + GMX_SIMD_DOUBLE_WIDTH, a2);
}
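// With GMX_SIMD_DOUBLE_WIDTH == 8, the three half-width decrements above touch
// 3 * GMX_SIMD_DOUBLE_WIDTH / 2 = 12 consecutive doubles at m, one half-width
// chunk per input vector.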
template<int align>
static inline void gmx_simdcall gatherLoadTransposeHsimd(const double*      base0,
                                                         const double*      base1,
                                                         const std::int32_t offset[],
                                                         SimdDouble*        v0,
                                                         SimdDouble*        v1)
{
    __m512i idx0, idx1, idx;
    __m512d tmp1, tmp2;

    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base0) % 16 == 0);
    assert(std::size_t(base1) % 16 == 0);
    assert(std::size_t(align) % 2 == 0);

    idx0 = _mm512_extload_epi32(offset, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_4X16, _MM_HINT_NONE);

    idx0 = _mm512_mullo_epi32(idx0, _mm512_set1_epi32(align));
    idx1 = _mm512_add_epi32(idx0, _mm512_set1_epi32(1));

    idx = _mm512_mask_permute4f128_epi32(idx0, _mm512_int2mask(0x00F0), idx1, _MM_PERM_AAAA);

    tmp1 = _mm512_i32logather_pd(idx, base0, sizeof(double));
    tmp2 = _mm512_i32logather_pd(idx, base1, sizeof(double));

    v0->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(
            _mm512_castpd_ps(tmp1), _mm512_int2mask(0xFF00), _mm512_castpd_ps(tmp2), _MM_PERM_BABA));
    v1->simdInternal_ = _mm512_castps_pd(_mm512_mask_permute4f128_ps(
            _mm512_castpd_ps(tmp2), _mm512_int2mask(0x00FF), _mm512_castpd_ps(tmp1), _MM_PERM_DCDC));
}
static inline double gmx_simdcall reduceIncr4ReturnSumHsimd(double* m, SimdDouble v0, SimdDouble v1)
{
    double  d;
    __m512d t0, t1;

    assert(std::size_t(m) % 32 == 0);

    t0 = _mm512_add_pd(v0.simdInternal_, _mm512_swizzle_pd(v0.simdInternal_, _MM_SWIZ_REG_BADC));
    t0 = _mm512_mask_add_pd(t0, _mm512_int2mask(0xCC), v1.simdInternal_,
                            _mm512_swizzle_pd(v1.simdInternal_, _MM_SWIZ_REG_BADC));
    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));
    t0 = _mm512_castps_pd(_mm512_mask_permute4f128_ps(_mm512_castpd_ps(t0), _mm512_int2mask(0xCCCC),
                                                      _mm512_castpd_ps(t0), _MM_PERM_DCDC));

    t1 = _mm512_mask_extload_pd(_mm512_undefined_pd(), _mm512_int2mask(0xF), m, _MM_UPCONV_PD_NONE,
                                _MM_BROADCAST_4X8, _MM_HINT_NONE);
    t1 = _mm512_add_pd(t1, t0);
    _mm512_mask_packstorelo_pd(m, _mm512_int2mask(0xF), t1);

    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_BADC));
    t0 = _mm512_add_pd(t0, _mm512_swizzle_pd(t0, _MM_SWIZ_REG_CDAB));

    _mm512_mask_packstorelo_pd(&d, _mm512_int2mask(0x03), t0);
    return d;
}
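// Scalar picture (illustrative): with v0 = { a0..a3, b0..b3 } and
// v1 = { c0..c3, d0..d3 }, this adds { sum(a), sum(b), sum(c), sum(d) } to
// m[0..3] and returns sum(a) + sum(b) + sum(c) + sum(d).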
} // namespace gmx

#endif // GMX_SIMD_IMPL_X86_MIC_UTIL_DOUBLE_H