2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2017,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_REFERENCE_UTIL_DOUBLE_H
37 #define GMX_SIMD_IMPL_REFERENCE_UTIL_DOUBLE_H
39 /*! \libinternal \file
41 * \brief Reference impl., higher-level double prec. SIMD utility functions
43 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
45 * \ingroup module_simd
/* Avoid adding dependencies on the rest of GROMACS here (e.g. gmxassert.h)
 * since we want to be able to run the low-level SIMD implementations
 * independently in simulators for new hardware.
 */
#include <cassert>
#include <cstddef>
#include <cstdint>

#include <algorithm>

#include "impl_reference_definitions.h"
#include "impl_reference_simd_double.h"
68 /*! \addtogroup module_simd */
71 /*! \name Higher-level SIMD utility functions, double precision.
73 * These include generic functions to work with triplets of data, typically
74 * coordinates, and a few utility functions to load and update data in the
75 * nonbonded kernels. These functions should be available on all implementations.
80 /*! \brief Load 4 consecutive double from each of GMX_SIMD_DOUBLE_WIDTH offsets,
81 * and transpose into 4 SIMD double variables.
83 * \tparam align Alignment of the memory from which we read, i.e. distance
84 * (measured in elements, not bytes) between index points.
85 * When this is identical to the number of SIMD variables
86 * (i.e., 4 for this routine) the input data is packed without
87 * padding in memory. See the SIMD parameters for exactly
88 * what memory positions are loaded.
89 * \param base Pointer to the start of the memory area
90 * \param offset Array with offsets to the start of each data point.
91 * \param[out] v0 1st component of data, base[align*offset[i]] for each i.
92 * \param[out] v1 2nd component of data, base[align*offset[i] + 1] for each i.
93 * \param[out] v2 3rd component of data, base[align*offset[i] + 2] for each i.
94 * \param[out] v3 4th component of data, base[align*offset[i] + 3] for each i.
96 * The floating-point memory locations must be aligned, but only to the smaller
97 * of four elements and the floating-point SIMD width.
99 * The offset memory must be aligned to GMX_SIMD_DINT32_WIDTH.
101 * \note You should NOT scale offsets before calling this routine; it is
102 * done internally by using the alignment template parameter instead.
105 static inline void gmx_simdcall
gatherLoadTranspose(const double* base
,
106 const std::int32_t offset
[],
112 // Offset list must be aligned for SIMD DINT32
113 assert(std::size_t(offset
) % (GMX_SIMD_DINT32_WIDTH
* sizeof(std::int32_t)) == 0);
114 // Base pointer must be aligned to the smaller of 4 elements and double SIMD width
115 assert(std::size_t(base
) % (std::min(GMX_SIMD_DOUBLE_WIDTH
, 4) * sizeof(double)) == 0);
116 // align parameter must also be a multiple of the above alignment requirement
117 assert(align
% std::min(GMX_SIMD_DOUBLE_WIDTH
, 4) == 0);
119 for (std::size_t i
= 0; i
< v0
->simdInternal_
.size(); i
++)
121 v0
->simdInternal_
[i
] = base
[align
* offset
[i
]];
122 v1
->simdInternal_
[i
] = base
[align
* offset
[i
] + 1];
123 v2
->simdInternal_
[i
] = base
[align
* offset
[i
] + 2];
124 v3
->simdInternal_
[i
] = base
[align
* offset
[i
] + 3];
129 /*! \brief Load 2 consecutive double from each of GMX_SIMD_DOUBLE_WIDTH offsets,
130 * and transpose into 2 SIMD double variables.
132 * \tparam align Alignment of the memory from which we read, i.e. distance
133 * (measured in elements, not bytes) between index points.
134 * When this is identical to the number of SIMD variables
135 * (i.e., 2 for this routine) the input data is packed without
136 * padding in memory. See the SIMD parameters for exactly
137 * what memory positions are loaded.
138 * \param base Pointer to the start of the memory area
139 * \param offset Array with offsets to the start of each data point.
140 * \param[out] v0 1st component of data, base[align*offset[i]] for each i.
141 * \param[out] v1 2nd component of data, base[align*offset[i] + 1] for each i.
143 * The floating-point memory locations must be aligned, but only to the smaller
144 * of two elements and the floating-point SIMD width.
146 * The offset memory must be aligned to GMX_SIMD_DINT32_WIDTH.
148 * \note You should NOT scale offsets before calling this routine; it is
149 * done internally by using the alignment template parameter instead.
152 static inline void gmx_simdcall
153 gatherLoadTranspose(const double* base
, const std::int32_t offset
[], SimdDouble
* v0
, SimdDouble
* v1
)
155 // Offset list must be aligned for SIMD DINT32
156 assert(std::size_t(offset
) % (GMX_SIMD_DINT32_WIDTH
* sizeof(std::int32_t)) == 0);
157 // Base pointer must be aligned to the smaller of 2 elements and double SIMD width
158 assert(std::size_t(base
) % (std::min(GMX_SIMD_DOUBLE_WIDTH
, 2) * sizeof(double)) == 0);
159 // align parameter must also be a multiple of the above alignment requirement
160 assert(align
% std::min(GMX_SIMD_DOUBLE_WIDTH
, 2) == 0);
162 for (std::size_t i
= 0; i
< v0
->simdInternal_
.size(); i
++)
164 v0
->simdInternal_
[i
] = base
[align
* offset
[i
]];
165 v1
->simdInternal_
[i
] = base
[align
* offset
[i
] + 1];
/*! \brief Best alignment to use for aligned pairs of double data.
 *
 * \copydetails c_simdBestPairAlignmentFloat
 */
static constexpr int c_simdBestPairAlignmentDouble = 2;
177 /*! \brief Load 3 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets,
178 * and transpose into 3 SIMD double variables.
180 * \tparam align Alignment of the memory from which we read, i.e. distance
181 * (measured in elements, not bytes) between index points.
182 * When this is identical to the number of SIMD variables
183 * (i.e., 3 for this routine) the input data is packed without
184 * padding in memory. See the SIMD parameters for exactly
185 * what memory positions are loaded.
186 * \param base Pointer to the start of the memory area
187 * \param offset Array with offsets to the start of each data point.
188 * \param[out] v0 1st component of data, base[align*offset[i]] for each i.
189 * \param[out] v1 2nd component of data, base[align*offset[i] + 1] for each i.
190 * \param[out] v2 3rd component of data, base[align*offset[i] + 2] for each i.
192 * This function can work with both aligned (better performance) and unaligned
193 * memory. When the align parameter is not a power-of-two (align==3 would be normal
194 * for packed atomic coordinates) the memory obviously cannot be aligned, and
195 * we account for this.
196 * However, in the case where align is a power-of-two, we assume the base pointer
197 * also has the same alignment, which will enable many platforms to use faster
198 * aligned memory load operations.
199 * An easy way to think of this is that each triplet of data in memory must be
200 * aligned to the align parameter you specify when it's a power-of-two.
202 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
203 * enables us to use SIMD loads and gather operations on platforms that support it.
205 * \note You should NOT scale offsets before calling this routine; it is
206 * done internally by using the alignment template parameter instead.
207 * \note This routine uses a normal array for the offsets, since we typically
208 * load this data from memory. On the architectures we have tested this
209 * is faster even when a SIMD integer datatype is present.
210 * \note To improve performance, this function might use full-SIMD-width
211 * unaligned loads. This means you need to ensure the memory is padded
212 * at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
213 * starting at the last offset. If you use the Gromacs aligned memory
214 * allocation routines this will always be the case.
217 static inline void gmx_simdcall
gatherLoadUTranspose(const double* base
,
218 const std::int32_t offset
[],
223 // Offset list must be aligned for SIMD DINT32
224 assert(std::size_t(offset
) % (GMX_SIMD_DINT32_WIDTH
* sizeof(std::int32_t)) == 0);
226 for (std::size_t i
= 0; i
< v0
->simdInternal_
.size(); i
++)
228 v0
->simdInternal_
[i
] = base
[align
* offset
[i
]];
229 v1
->simdInternal_
[i
] = base
[align
* offset
[i
] + 1];
230 v2
->simdInternal_
[i
] = base
[align
* offset
[i
] + 2];
234 /*! \brief Transpose and store 3 SIMD doubles to 3 consecutive addresses at
235 * GMX_SIMD_DOUBLE_WIDTH offsets.
237 * \tparam align Alignment of the memory to which we write, i.e. distance
238 * (measured in elements, not bytes) between index points.
239 * When this is identical to the number of SIMD variables
240 * (i.e., 3 for this routine) the output data is packed without
241 * padding in memory. See the SIMD parameters for exactly
242 * what memory positions are written.
243 * \param[out] base Pointer to the start of the memory area
244 * \param offset Aligned array with offsets to the start of each triplet.
245 * \param v0 1st component of triplets, written to base[align*offset[i]].
246 * \param v1 2nd component of triplets, written to base[align*offset[i] + 1].
247 * \param v2 3rd component of triplets, written to base[align*offset[i] + 2].
249 * This function can work with both aligned (better performance) and unaligned
250 * memory. When the align parameter is not a power-of-two (align==3 would be normal
251 * for packed atomic coordinates) the memory obviously cannot be aligned, and
252 * we account for this.
253 * However, in the case where align is a power-of-two, we assume the base pointer
254 * also has the same alignment, which will enable many platforms to use faster
255 * aligned memory store operations.
256 * An easy way to think of this is that each triplet of data in memory must be
257 * aligned to the align parameter you specify when it's a power-of-two.
259 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
260 * enables us to use SIMD loads and gather operations on platforms that support it.
262 * \note You should NOT scale offsets before calling this routine; it is
263 * done internally by using the alignment template parameter instead.
264 * \note This routine uses a normal array for the offsets, since we typically
265 * load the data from memory. On the architectures we have tested this
266 * is faster even when a SIMD integer datatype is present.
269 static inline void gmx_simdcall
transposeScatterStoreU(double* base
,
270 const std::int32_t offset
[],
275 // Offset list must be aligned for SIMD DINT32
276 assert(std::size_t(offset
) % (GMX_SIMD_DINT32_WIDTH
* sizeof(std::int32_t)) == 0);
278 for (std::size_t i
= 0; i
< v0
.simdInternal_
.size(); i
++)
280 base
[align
* offset
[i
]] = v0
.simdInternal_
[i
];
281 base
[align
* offset
[i
] + 1] = v1
.simdInternal_
[i
];
282 base
[align
* offset
[i
] + 2] = v2
.simdInternal_
[i
];
287 /*! \brief Transpose and add 3 SIMD doubles to 3 consecutive addresses at
288 * GMX_SIMD_DOUBLE_WIDTH offsets.
290 * \tparam align Alignment of the memory to which we write, i.e. distance
291 * (measured in elements, not bytes) between index points.
292 * When this is identical to the number of SIMD variables
293 * (i.e., 3 for this routine) the output data is packed without
294 * padding in memory. See the SIMD parameters for exactly
295 * what memory positions are incremented.
296 * \param[out] base Pointer to the start of the memory area
297 * \param offset Aligned array with offsets to the start of each triplet.
298 * \param v0 1st component of triplets, added to base[align*offset[i]].
299 * \param v1 2nd component of triplets, added to base[align*offset[i] + 1].
300 * \param v2 3rd component of triplets, added to base[align*offset[i] + 2].
302 * This function can work with both aligned (better performance) and unaligned
303 * memory. When the align parameter is not a power-of-two (align==3 would be normal
304 * for packed atomic coordinates) the memory obviously cannot be aligned, and
305 * we account for this.
306 * However, in the case where align is a power-of-two, we assume the base pointer
307 * also has the same alignment, which will enable many platforms to use faster
308 * aligned memory load/store operations.
309 * An easy way to think of this is that each triplet of data in memory must be
310 * aligned to the align parameter you specify when it's a power-of-two.
312 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
313 * enables us to use SIMD loads and gather operations on platforms that support it.
315 * \note You should NOT scale offsets before calling this routine; it is
316 * done internally by using the alignment template parameter instead.
317 * \note This routine uses a normal array for the offsets, since we typically
318 * load the data from memory. On the architectures we have tested this
319 * is faster even when a SIMD integer datatype is present.
320 * \note To improve performance, this function might use full-SIMD-width
321 * unaligned load/store, and add 0.0 to the extra elements.
322 * This means you need to ensure the memory is padded
323 * at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
324 * starting at the last offset. If you use the Gromacs aligned memory
325 * allocation routines this will always be the case.
328 static inline void gmx_simdcall
329 transposeScatterIncrU(double* base
, const std::int32_t offset
[], SimdDouble v0
, SimdDouble v1
, SimdDouble v2
)
331 // Offset list must be aligned for SIMD DINT32
332 assert(std::size_t(offset
) % (GMX_SIMD_DINT32_WIDTH
* sizeof(std::int32_t)) == 0);
334 for (std::size_t i
= 0; i
< v0
.simdInternal_
.size(); i
++)
336 base
[align
* offset
[i
]] += v0
.simdInternal_
[i
];
337 base
[align
* offset
[i
] + 1] += v1
.simdInternal_
[i
];
338 base
[align
* offset
[i
] + 2] += v2
.simdInternal_
[i
];
342 /*! \brief Transpose and subtract 3 SIMD doubles to 3 consecutive addresses at
343 * GMX_SIMD_DOUBLE_WIDTH offsets.
345 * \tparam align Alignment of the memory to which we write, i.e. distance
346 * (measured in elements, not bytes) between index points.
347 * When this is identical to the number of SIMD variables
348 * (i.e., 3 for this routine) the output data is packed without
349 * padding in memory. See the SIMD parameters for exactly
350 * what memory positions are decremented.
351 * \param[out] base Pointer to start of memory.
352 * \param offset Aligned array with offsets to the start of each triplet.
353 * \param v0 1st component, subtracted from base[align*offset[i]]
354 * \param v1 2nd component, subtracted from base[align*offset[i]+1]
355 * \param v2 3rd component, subtracted from base[align*offset[i]+2]
357 * This function can work with both aligned (better performance) and unaligned
358 * memory. When the align parameter is not a power-of-two (align==3 would be normal
359 * for packed atomic coordinates) the memory obviously cannot be aligned, and
360 * we account for this.
361 * However, in the case where align is a power-of-two, we assume the base pointer
362 * also has the same alignment, which will enable many platforms to use faster
363 * aligned memory load/store operations.
364 * An easy way to think of this is that each triplet of data in memory must be
365 * aligned to the align parameter you specify when it's a power-of-two.
367 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
368 * enables us to use SIMD loads and gather operations on platforms that support it.
370 * \note You should NOT scale offsets before calling this routine; it is
371 * done internally by using the alignment template parameter instead.
372 * \note This routine uses a normal array for the offsets, since we typically
373 * load the data from memory. On the architectures we have tested this
374 * is faster even when a SIMD integer datatype is present.
375 * \note To improve performance, this function might use full-SIMD-width
376 * unaligned load/store, and subtract 0.0 from the extra elements.
377 * This means you need to ensure the memory is padded
378 * at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
379 * starting at the last offset. If you use the Gromacs aligned memory
380 * allocation routines this will always be the case.
383 static inline void gmx_simdcall
384 transposeScatterDecrU(double* base
, const std::int32_t offset
[], SimdDouble v0
, SimdDouble v1
, SimdDouble v2
)
386 // Offset list must be aligned for SIMD DINT32
387 assert(std::size_t(offset
) % (GMX_SIMD_DINT32_WIDTH
* sizeof(std::int32_t)) == 0);
389 for (std::size_t i
= 0; i
< v0
.simdInternal_
.size(); i
++)
391 base
[align
* offset
[i
]] -= v0
.simdInternal_
[i
];
392 base
[align
* offset
[i
] + 1] -= v1
.simdInternal_
[i
];
393 base
[align
* offset
[i
] + 2] -= v2
.simdInternal_
[i
];
398 /*! \brief Expand each element of double SIMD variable into three identical
399 * consecutive elements in three SIMD outputs.
401 * \param scalar Floating-point input, e.g. [s0 s1 s2 s3] if width=4.
402 * \param[out] triplets0 First output, e.g. [s0 s0 s0 s1] if width=4.
403 * \param[out] triplets1 Second output, e.g. [s1 s1 s2 s2] if width=4.
404 * \param[out] triplets2 Third output, e.g. [s2 s3 s3 s3] if width=4.
406 * This routine is meant to use for things like scalar-vector multiplication,
407 * where the vectors are stored in a merged format like [x0 y0 z0 x1 y1 z1 ...],
408 * while the scalars are stored as [s0 s1 s2...], and the data cannot easily
409 * be changed to SIMD-friendly layout.
411 * In this case, load 3 full-width SIMD variables from the vector array (This
412 * will always correspond to GMX_SIMD_DOUBLE_WIDTH triplets),
413 * load a single full-width variable from the scalar array, and
414 * call this routine to expand the data. You can then simply multiply the
415 * first, second and third pair of SIMD variables, and store the three
416 * results back into a suitable vector-format array.
418 static inline void gmx_simdcall
expandScalarsToTriplets(SimdDouble scalar
,
419 SimdDouble
* triplets0
,
420 SimdDouble
* triplets1
,
421 SimdDouble
* triplets2
)
423 for (std::size_t i
= 0; i
< scalar
.simdInternal_
.size(); i
++)
425 triplets0
->simdInternal_
[i
] = scalar
.simdInternal_
[i
/ 3];
426 triplets1
->simdInternal_
[i
] = scalar
.simdInternal_
[(i
+ scalar
.simdInternal_
.size()) / 3];
427 triplets2
->simdInternal_
[i
] = scalar
.simdInternal_
[(i
+ 2 * scalar
.simdInternal_
.size()) / 3];
432 /*! \brief Load 4 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets
433 * specified by a SIMD integer, transpose into 4 SIMD double variables.
435 * \tparam align Alignment of the memory from which we read, i.e. distance
436 * (measured in elements, not bytes) between index points.
437 * When this is identical to the number of SIMD variables
438 * (i.e., 4 for this routine) the input data is packed without
439 * padding in memory. See the SIMD parameters for exactly
440 * what memory positions are loaded.
441 * \param base Aligned pointer to the start of the memory.
442 * \param offset SIMD integer type with offsets to the start of each triplet.
443 * \param[out] v0 First component, base[align*offset[i]] for each i.
444 * \param[out] v1 Second component, base[align*offset[i] + 1] for each i.
445 * \param[out] v2 Third component, base[align*offset[i] + 2] for each i.
446 * \param[out] v3 Fourth component, base[align*offset[i] + 3] for each i.
448 * The floating-point memory locations must be aligned, but only to the smaller
449 * of four elements and the floating-point SIMD width.
451 * \note You should NOT scale offsets before calling this routine; it is
452 * done internally by using the alignment template parameter instead.
453 * \note This is a special routine primarily intended for loading Gromacs
454 * table data as efficiently as possible - this is the reason for using
455 * a SIMD offset index, since the result of the real-to-integer conversion
456 * is present in a SIMD register just before calling this routine.
459 static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double* base
,
466 // Base pointer must be aligned to the smaller of 4 elements and double SIMD width
467 assert(std::size_t(base
) % (std::min(GMX_SIMD_DOUBLE_WIDTH
, 4) * sizeof(double)) == 0);
468 // align parameter must also be a multiple of the above alignment requirement
469 assert(align
% std::min(GMX_SIMD_DOUBLE_WIDTH
, 4) == 0);
471 for (std::size_t i
= 0; i
< v0
->simdInternal_
.size(); i
++)
473 v0
->simdInternal_
[i
] = base
[align
* offset
.simdInternal_
[i
]];
474 v1
->simdInternal_
[i
] = base
[align
* offset
.simdInternal_
[i
] + 1];
475 v2
->simdInternal_
[i
] = base
[align
* offset
.simdInternal_
[i
] + 2];
476 v3
->simdInternal_
[i
] = base
[align
* offset
.simdInternal_
[i
] + 3];
481 /*! \brief Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets
482 * (unaligned) specified by SIMD integer, transpose into 2 SIMD doubles.
484 * \tparam align Alignment of the memory from which we read, i.e. distance
485 * (measured in elements, not bytes) between index points.
486 * When this is identical to the number of SIMD variables
487 * (i.e., 2 for this routine) the input data is packed without
488 * padding in memory. See the SIMD parameters for exactly
489 * what memory positions are loaded.
490 * \param base Pointer to the start of the memory.
491 * \param offset SIMD integer type with offsets to the start of each triplet.
492 * \param[out] v0 First component, base[align*offset[i]] for each i.
493 * \param[out] v1 Second component, base[align*offset[i] + 1] for each i.
495 * Since some SIMD architectures cannot handle any unaligned loads, this routine
496 * is only available if GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE is 1.
498 * \note You should NOT scale offsets before calling this routine; it is
499 * done internally by using the alignment template parameter instead.
500 * \note This is a special routine primarily intended for loading Gromacs
501 * table data as efficiently as possible - this is the reason for using
502 * a SIMD offset index, since the result of the real-to-integer conversion
503 * is present in a SIMD register just before calling this routine.
506 static inline void gmx_simdcall
507 gatherLoadUBySimdIntTranspose(const double* base
, SimdDInt32 offset
, SimdDouble
* v0
, SimdDouble
* v1
)
509 for (std::size_t i
= 0; i
< v0
->simdInternal_
.size(); i
++)
511 v0
->simdInternal_
[i
] = base
[align
* offset
.simdInternal_
[i
]];
512 v1
->simdInternal_
[i
] = base
[align
* offset
.simdInternal_
[i
] + 1];
516 /*! \brief Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets
517 * specified by a SIMD integer, transpose into 2 SIMD double variables.
519 * \tparam align Alignment of the memory from which we read, i.e. distance
520 * (measured in elements, not bytes) between index points.
521 * When this is identical to the number of SIMD variables
522 * (i.e., 2 for this routine) the input data is packed without
523 * padding in memory. See the SIMD parameters for exactly
524 * what memory positions are loaded.
525 * \param base Aligned pointer to the start of the memory.
526 * \param offset SIMD integer type with offsets to the start of each triplet.
527 * \param[out] v0 First component, base[align*offset[i]] for each i.
528 * \param[out] v1 Second component, base[align*offset[i] + 1] for each i.
530 * The floating-point memory locations must be aligned, but only to the smaller
531 * of two elements and the floating-point SIMD width.
533 * \note You should NOT scale offsets before calling this routine; it is
534 * done internally by using the alignment template parameter instead.
535 * \note This is a special routine primarily intended for loading Gromacs
536 * table data as efficiently as possible - this is the reason for using
537 * a SIMD offset index, since the result of the real-to-integer conversion
538 * is present in a SIMD register just before calling this routine.
541 static inline void gmx_simdcall
542 gatherLoadBySimdIntTranspose(const double* base
, SimdDInt32 offset
, SimdDouble
* v0
, SimdDouble
* v1
)
544 // Base pointer must be aligned to the smaller of 2 elements and double SIMD width
545 assert(std::size_t(base
) % (std::min(GMX_SIMD_DOUBLE_WIDTH
, 2) * sizeof(double)) == 0);
546 // align parameter must also be a multiple of the above alignment requirement
547 assert(align
% std::min(GMX_SIMD_DOUBLE_WIDTH
, 2) == 0);
549 for (std::size_t i
= 0; i
< v0
->simdInternal_
.size(); i
++)
551 v0
->simdInternal_
[i
] = base
[align
* offset
.simdInternal_
[i
]];
552 v1
->simdInternal_
[i
] = base
[align
* offset
.simdInternal_
[i
] + 1];
557 /*! \brief Reduce each of four SIMD doubles, add those values to four consecutive
558 * doubles in memory, return sum.
560 * \param m Pointer to memory where four doubles should be incremented
561 * \param v0 SIMD variable whose sum should be added to m[0]
562 * \param v1 SIMD variable whose sum should be added to m[1]
563 * \param v2 SIMD variable whose sum should be added to m[2]
564 * \param v3 SIMD variable whose sum should be added to m[3]
566 * \return Sum of all elements in the four SIMD variables.
568 * The pointer m must be aligned to the smaller of four elements and the
569 * floating-point SIMD width.
571 * \note This is a special routine intended for the Gromacs nonbonded kernels.
572 * It is used in the epilogue of the outer loop, where the variables will
573 * contain unrolled forces for one outer-loop-particle each, corresponding to
574 * a single coordinate (i.e, say, four x-coordinate force variables). These
575 * should be summed and added to the force array in memory. Since we always work
576 * with contiguous SIMD-layout , we can use efficient aligned loads/stores.
577 * When calculating the virial, we also need the total sum of all forces for
578 * each coordinate. This is provided as the return value. For routines that
579 * do not need these, this extra code will be optimized away completely if you
580 * just ignore the return value (Checked with gcc-4.9.1 and clang-3.6 for AVX).
582 static inline double gmx_simdcall
583 reduceIncr4ReturnSum(double* m
, SimdDouble v0
, SimdDouble v1
, SimdDouble v2
, SimdDouble v3
)
585 double sum
[4]; // Note that the 4 here corresponds to the 4 m-elements, not any SIMD width
587 // Make sure the memory pointer is aligned to the smaller of 4 elements and double SIMD width
588 assert(std::size_t(m
) % (std::min(GMX_SIMD_DOUBLE_WIDTH
, 4) * sizeof(double)) == 0);
600 return sum
[0] + sum
[1] + sum
[2] + sum
[3];
606 * \name Higher-level SIMD utilities accessing partial (half-width) SIMD doubles.
608 * See the single-precision versions for documentation. Since double precision
609 * is typically half the width of single, this double version is likely only
610 * useful with 512-bit and larger implementations.
615 /*! \brief Load low & high parts of SIMD double from different locations.
617 * \param m0 Pointer to memory aligned to half SIMD width.
618 * \param m1 Pointer to memory aligned to half SIMD width.
620 * \return SIMD variable with low part loaded from m0, high from m1.
622 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
624 static inline SimdDouble gmx_simdcall
loadDualHsimd(const double* m0
, const double* m1
)
628 // Make sure the memory pointers are aligned to half double SIMD width
629 assert(std::size_t(m0
) % (GMX_SIMD_DOUBLE_WIDTH
/ 2 * sizeof(double)) == 0);
630 assert(std::size_t(m1
) % (GMX_SIMD_DOUBLE_WIDTH
/ 2 * sizeof(double)) == 0);
632 for (std::size_t i
= 0; i
< a
.simdInternal_
.size() / 2; i
++)
634 a
.simdInternal_
[i
] = m0
[i
];
635 a
.simdInternal_
[a
.simdInternal_
.size() / 2 + i
] = m1
[i
];
640 /*! \brief Load half-SIMD-width double data, spread to both halves.
642 * \param m Pointer to memory aligned to half SIMD width.
644 * \return SIMD variable with both halves loaded from m..
646 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
648 static inline SimdDouble gmx_simdcall
loadDuplicateHsimd(const double* m
)
652 // Make sure the memory pointer is aligned
653 assert(std::size_t(m
) % (GMX_SIMD_DOUBLE_WIDTH
/ 2 * sizeof(double)) == 0);
655 for (std::size_t i
= 0; i
< a
.simdInternal_
.size() / 2; i
++)
657 a
.simdInternal_
[i
] = m
[i
];
658 a
.simdInternal_
[a
.simdInternal_
.size() / 2 + i
] = a
.simdInternal_
[i
];
663 /*! \brief Load two doubles, spread 1st in low half, 2nd in high half.
665 * \param m Pointer to two adjacent double values.
667 * \return SIMD variable where all elements in the low half have been set
668 * to m[0], and all elements in high half to m[1].
670 * \note This routine always loads two values and sets the halves separately.
671 * If you want to set all elements to the same value, simply use
672 * the standard (non-half-SIMD) operations.
674 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
676 static inline SimdDouble gmx_simdcall
loadU1DualHsimd(const double* m
)
680 for (std::size_t i
= 0; i
< a
.simdInternal_
.size() / 2; i
++)
682 a
.simdInternal_
[i
] = m
[0];
683 a
.simdInternal_
[a
.simdInternal_
.size() / 2 + i
] = m
[1];
689 /*! \brief Store low & high parts of SIMD double to different locations.
691 * \param m0 Pointer to memory aligned to half SIMD width.
692 * \param m1 Pointer to memory aligned to half SIMD width.
693 * \param a SIMD variable. Low half should be stored to m0, high to m1.
695 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
697 static inline void gmx_simdcall
storeDualHsimd(double* m0
, double* m1
, SimdDouble a
)
699 // Make sure the memory pointers are aligned to half double SIMD width
700 assert(std::size_t(m0
) % (GMX_SIMD_DOUBLE_WIDTH
/ 2 * sizeof(double)) == 0);
701 assert(std::size_t(m1
) % (GMX_SIMD_DOUBLE_WIDTH
/ 2 * sizeof(double)) == 0);
703 for (std::size_t i
= 0; i
< a
.simdInternal_
.size() / 2; i
++)
705 m0
[i
] = a
.simdInternal_
[i
];
706 m1
[i
] = a
.simdInternal_
[a
.simdInternal_
.size() / 2 + i
];
710 /*! \brief Add each half of SIMD variable to separate memory adresses
712 * \param m0 Pointer to memory aligned to half SIMD width.
713 * \param m1 Pointer to memory aligned to half SIMD width.
714 * \param a SIMD variable. Lower half will be added to m0, upper half to m1.
716 * The memory must be aligned to half SIMD width.
718 * \note The updated m0 value is written before m1 is read from memory, so
719 * the result will be correct even if the memory regions overlap.
721 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
723 static inline void gmx_simdcall
incrDualHsimd(double* m0
, double* m1
, SimdDouble a
)
725 // Make sure the memory pointer is aligned to half double SIMD width
726 assert(std::size_t(m0
) % (GMX_SIMD_DOUBLE_WIDTH
/ 2 * sizeof(double)) == 0);
727 assert(std::size_t(m1
) % (GMX_SIMD_DOUBLE_WIDTH
/ 2 * sizeof(double)) == 0);
729 for (std::size_t i
= 0; i
< a
.simdInternal_
.size() / 2; i
++)
731 m0
[i
] += a
.simdInternal_
[i
];
733 for (std::size_t i
= 0; i
< a
.simdInternal_
.size() / 2; i
++)
735 m1
[i
] += a
.simdInternal_
[a
.simdInternal_
.size() / 2 + i
];
739 /*! \brief Add the two halves of three SIMD doubles, subtract the sum from
740 * three half-SIMD-width consecutive doubles in memory.
742 * \param m half-width aligned memory, from which sum of the halves will be subtracted.
743 * \param a0 SIMD variable. Upper & lower halves will first be added.
744 * \param a1 SIMD variable. Upper & lower halves will second be added.
745 * \param a2 SIMD variable. Upper & lower halves will third be added.
747 * If the SIMD width is 8 and the vectors contain [a0 b0 c0 d0 e0 f0 g0 h0],
748 * [a1 b1 c1 d1 e1 f1 g1 g1] and [a2 b2 c2 d2 e2 f2 g2 h2], the
749 * memory will be modified to [m[0]-(a0+e0) m[1]-(b0+f0) m[2]-(c0+g0) m[3]-(d0+h0)
750 * m[4]-(a1+e1) m[5]-(b1+f1) m[6]-(c1+g1) m[7]-(d1+h1)
751 * m[8]-(a2+e2) m[9]-(b2+f2) m[10]-(c2+g2) m[11]-(d2+h2)].
753 * The memory must be aligned to half SIMD width.
755 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
757 static inline void gmx_simdcall
decr3Hsimd(double* m
, SimdDouble a0
, SimdDouble a1
, SimdDouble a2
)
759 assert(std::size_t(m
) % (GMX_SIMD_DOUBLE_WIDTH
/ 2 * sizeof(double)) == 0);
760 for (std::size_t i
= 0; i
< a0
.simdInternal_
.size() / 2; i
++)
762 m
[i
] -= a0
.simdInternal_
[i
] + a0
.simdInternal_
[a0
.simdInternal_
.size() / 2 + i
];
764 for (std::size_t i
= 0; i
< a1
.simdInternal_
.size() / 2; i
++)
766 m
[a1
.simdInternal_
.size() / 2 + i
] -=
767 a1
.simdInternal_
[i
] + a1
.simdInternal_
[a1
.simdInternal_
.size() / 2 + i
];
769 for (std::size_t i
= 0; i
< a2
.simdInternal_
.size() / 2; i
++)
771 m
[a2
.simdInternal_
.size() + i
] -=
772 a2
.simdInternal_
[i
] + a2
.simdInternal_
[a2
.simdInternal_
.size() / 2 + i
];
777 /*! \brief Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH/2 offsets,
778 * transpose into SIMD double (low half from base0, high from base1).
780 * \tparam align Alignment of the storage, i.e. the distance
781 * (measured in elements, not bytes) between index points.
782 * When this is identical to the number of output components
783 * the data is packed without padding. This must be a
784 * multiple of the alignment to keep all data aligned.
785 * \param base0 Pointer to base of first aligned memory
786 * \param base1 Pointer to base of second aligned memory
787 * \param offset Offset to the start of each pair
788 * \param[out] v0 1st element in each pair, base0 in low and base1 in high half.
789 * \param[out] v1 2nd element in each pair, base0 in low and base1 in high half.
791 * The offset array should be of half the SIMD width length, so it corresponds
792 * to the half-SIMD-register operations. This also means it must be aligned
793 * to half the integer SIMD width (i.e., GMX_SIMD_DINT32_WIDTH/2).
795 * The floating-point memory locations must be aligned, but only to the smaller
796 * of two elements and the floating-point SIMD width.
798 * This routine is primarily designed to load nonbonded parameters in the
799 * kernels. It is the equivalent of the full-width routine
800 * gatherLoadTranspose(), but just
801 * as the other hsimd routines it will pick half-SIMD-width data from base0
802 * and put in the lower half, while the upper half comes from base1.
804 * For an example, assume the SIMD width is 8, align is 2, that
805 * base0 is [A0 A1 B0 B1 C0 C1 D0 D1 ...], and base1 [E0 E1 F0 F1 G0 G1 H0 H1...].
807 * Then we will get v0 as [A0 B0 C0 D0 E0 F0 G0 H0] and v1 as [A1 B1 C1 D1 E1 F1 G1 H1].
809 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
812 static inline void gmx_simdcall
gatherLoadTransposeHsimd(const double* base0
,
814 std::int32_t offset
[],
818 // Offset list must be aligned for half SIMD DINT32 width
819 assert(std::size_t(offset
) % (GMX_SIMD_DINT32_WIDTH
/ 2 * sizeof(std::int32_t)) == 0);
820 // base pointers must be aligned to the smaller of 2 elements and double SIMD width
821 assert(std::size_t(base0
) % (std::min(GMX_SIMD_DOUBLE_WIDTH
, 2) * sizeof(double)) == 0);
822 assert(std::size_t(base1
) % (std::min(GMX_SIMD_DOUBLE_WIDTH
, 2) * sizeof(double)) == 0);
823 // alignment parameter must be also be multiple of the above required alignment
824 assert(align
% std::min(GMX_SIMD_DOUBLE_WIDTH
, 2) == 0);
826 for (std::size_t i
= 0; i
< v0
->simdInternal_
.size() / 2; i
++)
828 v0
->simdInternal_
[i
] = base0
[align
* offset
[i
]];
829 v1
->simdInternal_
[i
] = base0
[align
* offset
[i
] + 1];
830 v0
->simdInternal_
[v0
->simdInternal_
.size() / 2 + i
] = base1
[align
* offset
[i
]];
831 v1
->simdInternal_
[v1
->simdInternal_
.size() / 2 + i
] = base1
[align
* offset
[i
] + 1];
836 /*! \brief Reduce the 4 half-SIMD-with doubles in 2 SIMD variables (sum halves),
837 * increment four consecutive doubles in memory, return sum.
839 * \param m Pointer to memory where the four values should be incremented
840 * \param v0 Variable whose half-SIMD sums should be added to m[0]/m[1], respectively.
841 * \param v1 Variable whose half-SIMD sums should be added to m[2]/m[3], respectively.
843 * \return Sum of all elements in the four SIMD variables.
845 * The pointer m must be aligned, but only to the smaller
846 * of four elements and the floating-point SIMD width.
848 * \note This is the half-SIMD-width version of
849 * reduceIncr4ReturnSum(). The only difference is that the
850 * four half-SIMD inputs needed are present in the low/high halves of the
851 * two SIMD arguments.
853 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
855 static inline double gmx_simdcall
reduceIncr4ReturnSumHsimd(double* m
, SimdDouble v0
, SimdDouble v1
)
857 // The 4 here corresponds to the 4 elements in memory, not any SIMD width
858 double sum
[4] = { 0.0, 0.0, 0.0, 0.0 };
860 for (std::size_t i
= 0; i
< v0
.simdInternal_
.size() / 2; i
++)
862 sum
[0] += v0
.simdInternal_
[i
];
863 sum
[1] += v0
.simdInternal_
[v0
.simdInternal_
.size() / 2 + i
];
864 sum
[2] += v1
.simdInternal_
[i
];
865 sum
[3] += v1
.simdInternal_
[v1
.simdInternal_
.size() / 2 + i
];
868 // Make sure the memory pointer is aligned to the smaller of 4 elements and double SIMD width
869 assert(std::size_t(m
) % (std::min(GMX_SIMD_DOUBLE_WIDTH
, 4) * sizeof(double)) == 0);
876 return sum
[0] + sum
[1] + sum
[2] + sum
[3];
879 #if GMX_SIMD_DOUBLE_WIDTH > 8 || defined DOXYGEN
880 /*! \brief Load N doubles and duplicate them 4 times each.
882 * \param m Pointer to unaligned memory
884 * \return SIMD variable with N doubles from m duplicated 4x.
886 * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE is 1.
887 * N is GMX_SIMD_DOUBLE_WIDTH/4. Duplicated values are
888 * contigous and different values are 4 positions in SIMD
891 static inline SimdDouble gmx_simdcall
loadUNDuplicate4(const double* m
)
894 for (std::size_t i
= 0; i
< a
.simdInternal_
.size() / 4; i
++)
896 a
.simdInternal_
[i
* 4] = m
[i
];
897 a
.simdInternal_
[i
* 4 + 1] = m
[i
];
898 a
.simdInternal_
[i
* 4 + 2] = m
[i
];
899 a
.simdInternal_
[i
* 4 + 3] = m
[i
];
904 /*! \brief Load 4 doubles and duplicate them N times each.
906 * \param m Pointer to memory aligned to 4 doubles
908 * \return SIMD variable with 4 doubles from m duplicated Nx.
910 * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE is 1.
911 * N is GMX_SIMD_DOUBLE_WIDTH/4. Different values are
912 * contigous and same values are 4 positions in SIMD
915 static inline SimdDouble gmx_simdcall
load4DuplicateN(const double* m
)
918 for (std::size_t i
= 0; i
< a
.simdInternal_
.size() / 4; i
++)
920 a
.simdInternal_
[i
* 4] = m
[0];
921 a
.simdInternal_
[i
* 4 + 1] = m
[1];
922 a
.simdInternal_
[i
* 4 + 2] = m
[2];
923 a
.simdInternal_
[i
* 4 + 3] = m
[3];
929 #if GMX_SIMD_DOUBLE_WIDTH >= 8 || defined DOXYGEN
930 /*! \brief Load doubles in blocks of 4 at fixed offsets
932 * \param m Pointer to unaligned memory
933 * \param offset Offset in memory between input blocks of 4
935 * \return SIMD variable with doubles from m.
937 * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE is 1.
938 * Blocks of 4 doubles are loaded from m+n*offset where n
939 * is the n-th block of 4 doubles.
941 static inline SimdDouble gmx_simdcall
loadU4NOffset(const double* m
, int offset
)
944 for (std::size_t i
= 0; i
< a
.simdInternal_
.size() / 4; i
++)
946 a
.simdInternal_
[i
* 4] = m
[offset
* i
+ 0];
947 a
.simdInternal_
[i
* 4 + 1] = m
[offset
* i
+ 1];
948 a
.simdInternal_
[i
* 4 + 2] = m
[offset
* i
+ 2];
949 a
.simdInternal_
[i
* 4 + 3] = m
[offset
* i
+ 3];
963 #endif // GMX_SIMD_IMPL_REFERENCE_UTIL_DOUBLE_H