128-bit AVX2 SIMD for AMD Ryzen
[gromacs.git] / src / gromacs / simd / impl_reference / impl_reference_util_float.h
blob2254d7f967c536593dbd0d2fefdf59667f54bd77
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_REFERENCE_UTIL_FLOAT_H
37 #define GMX_SIMD_IMPL_REFERENCE_UTIL_FLOAT_H
39 /*! \libinternal \file
41 * \brief Reference impl., higher-level single prec. SIMD utility functions
43 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
45 * \ingroup module_simd
48 /* Avoid adding dependencies on the rest of GROMACS here (e.g. gmxassert.h)
49 * since we want to be able run the low-level SIMD implementations independently
50 * in simulators for new hardware.
53 #include "config.h"
55 #include <cassert>
56 #include <cstddef>
57 #include <cstdint>
59 #include <algorithm>
61 #include "impl_reference_definitions.h"
62 #include "impl_reference_simd_float.h"
64 namespace gmx
67 /*! \cond libapi */
68 /*! \addtogroup module_simd */
69 /*! \{ */
71 /* \name Higher-level SIMD utility functions, single precision.
73 * These include generic functions to work with triplets of data, typically
74 * coordinates, and a few utility functions to load and update data in the
75 * nonbonded kernels.
76 * These functions should be available on all implementations, although
77 * some wide SIMD implementations (width>=8) also provide special optional
78 * versions to work with half or quarter registers to improve the performance
79 * in the nonbonded kernels.
81 * \{
84 /*! \brief Load 4 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets,
85 * and transpose into 4 SIMD float variables.
87 * \tparam align Alignment of the memory from which we read, i.e. distance
88 * (measured in elements, not bytes) between index points.
89 * When this is identical to the number of SIMD variables
90 * (i.e., 4 for this routine) the input data is packed without
91 * padding in memory. See the SIMD parameters for exactly
92 * what memory positions are loaded.
93 * \param base Pointer to the start of the memory area
94 * \param offset Array with offsets to the start of each data point.
95 * \param[out] v0 1st component of data, base[align*offset[i]] for each i.
96 * \param[out] v1 2nd component of data, base[align*offset[i] + 1] for each i.
97 * \param[out] v2 3rd component of data, base[align*offset[i] + 2] for each i.
98 * \param[out] v3 4th component of data, base[align*offset[i] + 3] for each i.
100 * The floating-point memory locations must be aligned, but only to the smaller
101 * of four elements and the floating-point SIMD width.
103 * The offset memory must be aligned to GMX_SIMD_DINT32_WIDTH.
105 * \note You should NOT scale offsets before calling this routine; it is
106 * done internally by using the alignment template parameter instead.
108 template <int align>
109 static inline void gmx_simdcall
110 gatherLoadTranspose(const float * base,
111 const std::int32_t offset[],
112 SimdFloat * v0,
113 SimdFloat * v1,
114 SimdFloat * v2,
115 SimdFloat * v3)
117 // Offset list must be aligned for SIMD FINT32
118 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH*sizeof(std::int32_t)) == 0);
119 // Base pointer must be aligned to the smaller of 4 elements and float SIMD width
120 assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4)*sizeof(float)) == 0);
121 // align parameter must also be a multiple of the above alignment requirement
122 assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 4) == 0);
124 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
126 v0->simdInternal_[i] = base[align * offset[i]];
127 v1->simdInternal_[i] = base[align * offset[i] + 1];
128 v2->simdInternal_[i] = base[align * offset[i] + 2];
129 v3->simdInternal_[i] = base[align * offset[i] + 3];
133 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets,
134 * and transpose into 2 SIMD float variables.
136 * \tparam align Alignment of the memory from which we read, i.e. distance
137 * (measured in elements, not bytes) between index points.
138 * When this is identical to the number of SIMD variables
139 * (i.e., 2 for this routine) the input data is packed without
140 * padding in memory. See the SIMD parameters for exactly
141 * what memory positions are loaded.
142 * \param base Pointer to the start of the memory area
143 * \param offset Array with offsets to the start of each data point.
144 * \param[out] v0 1st component of data, base[align*offset[i]] for each i.
145 * \param[out] v1 2nd component of data, base[align*offset[i] + 1] for each i.
147 * The floating-point memory locations must be aligned, but only to the smaller
148 * of two elements and the floating-point SIMD width.
150 * The offset memory must be aligned to GMX_SIMD_FINT32_WIDTH.
152 * To achieve the best possible performance, you should store your data with
153 * alignment \ref c_simdBestPairAlignmentFloat in single, or
154 * \ref c_simdBestPairAlignmentDouble in double.
156 * \note You should NOT scale offsets before calling this routine; it is
157 * done internally by using the alignment template parameter instead.
159 template <int align>
160 static inline void gmx_simdcall
161 gatherLoadTranspose(const float * base,
162 const std::int32_t offset[],
163 SimdFloat * v0,
164 SimdFloat * v1)
166 // Offset list must be aligned for SIMD FINT32
167 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH*sizeof(std::int32_t)) == 0);
168 // Base pointer must be aligned to the smaller of 2 elements and float SIMD width
169 assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2)*sizeof(float)) == 0);
170 // align parameter must also be a multiple of the above alignment requirement
171 assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 2) == 0);
173 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
175 v0->simdInternal_[i] = base[align * offset[i]];
176 v1->simdInternal_[i] = base[align * offset[i] + 1];
181 /*! \brief Best alignment to use for aligned pairs of float data.
183 * The routines to load and transpose data will work with a wide range of
184 * alignments, but some might be faster than others, depending on the load
185 * instructions available in the hardware. This specifies the best
186 * alignment for each implementation when working with pairs of data.
188 * To allow each architecture to use the most optimal form, we use a constant
189 * that code outside the SIMD module should use to store things properly. It
190 * must be at least 2. For example, a value of 2 means the two parameters A & B
191 * are stored as [A0 B0 A1 B1] while align-4 means [A0 B0 - - A1 B1 - -].
193 * This alignment depends on the efficiency of partial-register load/store
194 * operations, and will depend on the architecture.
196 static const int c_simdBestPairAlignmentFloat = 2;
199 /*! \brief Load 3 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets,
200 * and transpose into 3 SIMD float variables.
202 * \tparam align Alignment of the memory from which we read, i.e. distance
203 * (measured in elements, not bytes) between index points.
204 * When this is identical to the number of SIMD variables
205 * (i.e., 3 for this routine) the input data is packed without
206 * padding in memory. See the SIMD parameters for exactly
207 * what memory positions are loaded.
208 * \param base Pointer to the start of the memory area
209 * \param offset Array with offsets to the start of each data point.
210 * \param[out] v0 1st component of data, base[align*offset[i]] for each i.
211 * \param[out] v1 2nd component of data, base[align*offset[i] + 1] for each i.
212 * \param[out] v2 3rd component of data, base[align*offset[i] + 2] for each i.
214 * This function can work with both aligned (better performance) and unaligned
215 * memory. When the align parameter is not a power-of-two (align==3 would be normal
216 * for packed atomic coordinates) the memory obviously cannot be aligned, and
217 * we account for this.
218 * However, in the case where align is a power-of-two, we assume the base pointer
219 * also has the same alignment, which will enable many platforms to use faster
220 * aligned memory load operations.
221 * An easy way to think of this is that each triplet of data in memory must be
222 * aligned to the align parameter you specify when it's a power-of-two.
224 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
225 * enables us to use SIMD loads and gather operations on platforms that support it.
227 * \note You should NOT scale offsets before calling this routine; it is
228 * done internally by using the alignment template parameter instead.
229 * \note This routine uses a normal array for the offsets, since we typically
230 * load this data from memory. On the architectures we have tested this
231 * is faster even when a SIMD integer datatype is present.
232 * \note To improve performance, this function might use full-SIMD-width
233 * unaligned loads. This means you need to ensure the memory is padded
234 * at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
235 * starting at the last offset. If you use the Gromacs aligned memory
236 * allocation routines this will always be the case.
238 template <int align>
239 static inline void gmx_simdcall
240 gatherLoadUTranspose(const float * base,
241 const std::int32_t offset[],
242 SimdFloat * v0,
243 SimdFloat * v1,
244 SimdFloat * v2)
246 // Offset list must be aligned for SIMD FINT32
247 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH*sizeof(std::int32_t)) == 0);
249 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
251 v0->simdInternal_[i] = base[align * offset[i]];
252 v1->simdInternal_[i] = base[align * offset[i] + 1];
253 v2->simdInternal_[i] = base[align * offset[i] + 2];
258 /*! \brief Transpose and store 3 SIMD floats to 3 consecutive addresses at
259 * GMX_SIMD_FLOAT_WIDTH offsets.
261 * \tparam align Alignment of the memory to which we write, i.e. distance
262 * (measured in elements, not bytes) between index points.
263 * When this is identical to the number of SIMD variables
264 * (i.e., 3 for this routine) the output data is packed without
265 * padding in memory. See the SIMD parameters for exactly
266 * what memory positions are written.
267 * \param[out] base Pointer to the start of the memory area
268 * \param offset Aligned array with offsets to the start of each triplet.
269 * \param v0 1st component of triplets, written to base[align*offset[i]].
270 * \param v1 2nd component of triplets, written to base[align*offset[i] + 1].
271 * \param v2 3rd component of triplets, written to base[align*offset[i] + 2].
273 * This function can work with both aligned (better performance) and unaligned
274 * memory. When the align parameter is not a power-of-two (align==3 would be normal
275 * for packed atomic coordinates) the memory obviously cannot be aligned, and
276 * we account for this.
277 * However, in the case where align is a power-of-two, we assume the base pointer
278 * also has the same alignment, which will enable many platforms to use faster
279 * aligned memory store operations.
280 * An easy way to think of this is that each triplet of data in memory must be
281 * aligned to the align parameter you specify when it's a power-of-two.
283 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
284 * enables us to use SIMD loads and gather operations on platforms that support it.
286 * \note You should NOT scale offsets before calling this routine; it is
287 * done internally by using the alignment template parameter instead.
288 * \note This routine uses a normal array for the offsets, since we typically
289 * load the data from memory. On the architectures we have tested this
290 * is faster even when a SIMD integer datatype is present.
292 template <int align>
293 static inline void gmx_simdcall
294 transposeScatterStoreU(float * base,
295 const std::int32_t offset[],
296 SimdFloat v0,
297 SimdFloat v1,
298 SimdFloat v2)
300 // Offset list must be aligned for SIMD FINT32
301 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH*sizeof(std::int32_t)) == 0);
303 for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
305 base[align * offset[i]] = v0.simdInternal_[i];
306 base[align * offset[i] + 1] = v1.simdInternal_[i];
307 base[align * offset[i] + 2] = v2.simdInternal_[i];
312 /*! \brief Transpose and add 3 SIMD floats to 3 consecutive addresses at
313 * GMX_SIMD_FLOAT_WIDTH offsets.
315 * \tparam align Alignment of the memory to which we write, i.e. distance
316 * (measured in elements, not bytes) between index points.
317 * When this is identical to the number of SIMD variables
318 * (i.e., 3 for this routine) the output data is packed without
319 * padding in memory. See the SIMD parameters for exactly
320 * what memory positions are incremented.
321 * \param[out] base Pointer to the start of the memory area
322 * \param offset Aligned array with offsets to the start of each triplet.
323 * \param v0 1st component of triplets, added to base[align*offset[i]].
324 * \param v1 2nd component of triplets, added to base[align*offset[i] + 1].
325 * \param v2 3rd component of triplets, added to base[align*offset[i] + 2].
327 * This function can work with both aligned (better performance) and unaligned
328 * memory. When the align parameter is not a power-of-two (align==3 would be normal
329 * for packed atomic coordinates) the memory obviously cannot be aligned, and
330 * we account for this.
331 * However, in the case where align is a power-of-two, we assume the base pointer
332 * also has the same alignment, which will enable many platforms to use faster
333 * aligned memory load/store operations.
334 * An easy way to think of this is that each triplet of data in memory must be
335 * aligned to the align parameter you specify when it's a power-of-two.
337 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
338 * enables us to use SIMD loads and gather operations on platforms that support it.
340 * \note You should NOT scale offsets before calling this routine; it is
341 * done internally by using the alignment template parameter instead.
342 * \note This routine uses a normal array for the offsets, since we typically
343 * load the data from memory. On the architectures we have tested this
344 * is faster even when a SIMD integer datatype is present.
345 * \note To improve performance, this function might use full-SIMD-width
346 * unaligned load/store, and add 0.0 to the extra elements.
347 * This means you need to ensure the memory is padded
348 * at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
349 * starting at the last offset. If you use the Gromacs aligned memory
350 * allocation routines this will always be the case.
352 template <int align>
353 static inline void gmx_simdcall
354 transposeScatterIncrU(float * base,
355 const std::int32_t offset[],
356 SimdFloat v0,
357 SimdFloat v1,
358 SimdFloat v2)
360 // Offset list must be aligned for SIMD FINT32
361 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH*sizeof(std::int32_t)) == 0);
363 for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
365 base[align * offset[i]] += v0.simdInternal_[i];
366 base[align * offset[i] + 1] += v1.simdInternal_[i];
367 base[align * offset[i] + 2] += v2.simdInternal_[i];
372 /*! \brief Transpose and subtract 3 SIMD floats to 3 consecutive addresses at
373 * GMX_SIMD_FLOAT_WIDTH offsets.
375 * \tparam align Alignment of the memory to which we write, i.e. distance
376 * (measured in elements, not bytes) between index points.
377 * When this is identical to the number of SIMD variables
378 * (i.e., 3 for this routine) the output data is packed without
379 * padding in memory. See the SIMD parameters for exactly
380 * what memory positions are decremented.
381 * \param[out] base Pointer to start of memory.
382 * \param offset Aligned array with offsets to the start of each triplet.
383 * \param v0 1st component, subtracted from base[align*offset[i]]
384 * \param v1 2nd component, subtracted from base[align*offset[i]+1]
385 * \param v2 3rd component, subtracted from base[align*offset[i]+2]
387 * This function can work with both aligned (better performance) and unaligned
388 * memory. When the align parameter is not a power-of-two (align==3 would be normal
389 * for packed atomic coordinates) the memory obviously cannot be aligned, and
390 * we account for this.
391 * However, in the case where align is a power-of-two, we assume the base pointer
392 * also has the same alignment, which will enable many platforms to use faster
393 * aligned memory load/store operations.
394 * An easy way to think of this is that each triplet of data in memory must be
395 * aligned to the align parameter you specify when it's a power-of-two.
397 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
398 * enables us to use SIMD loads and gather operations on platforms that support it.
400 * \note You should NOT scale offsets before calling this routine; it is
401 * done internally by using the alignment template parameter instead.
402 * \note This routine uses a normal array for the offsets, since we typically
403 * load the data from memory. On the architectures we have tested this
404 * is faster even when a SIMD integer datatype is present.
405 * \note To improve performance, this function might use full-SIMD-width
406 * unaligned load/store, and subtract 0.0 from the extra elements.
407 * This means you need to ensure the memory is padded
408 * at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
409 * starting at the last offset. If you use the Gromacs aligned memory
410 * allocation routines this will always be the case.
412 template <int align>
413 static inline void gmx_simdcall
414 transposeScatterDecrU(float * base,
415 const std::int32_t offset[],
416 SimdFloat v0,
417 SimdFloat v1,
418 SimdFloat v2)
420 // Offset list must be aligned for SIMD FINT32
421 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH*sizeof(std::int32_t)) == 0);
423 for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
425 base[align * offset[i]] -= v0.simdInternal_[i];
426 base[align * offset[i] + 1] -= v1.simdInternal_[i];
427 base[align * offset[i] + 2] -= v2.simdInternal_[i];
432 /*! \brief Expand each element of float SIMD variable into three identical
433 * consecutive elements in three SIMD outputs.
435 * \param scalar Floating-point input, e.g. [s0 s1 s2 s3] if width=4.
436 * \param[out] triplets0 First output, e.g. [s0 s0 s0 s1] if width=4.
437 * \param[out] triplets1 Second output, e.g. [s1 s1 s2 s2] if width=4.
438 * \param[out] triplets2 Third output, e.g. [s2 s3 s3 s3] if width=4.
440 * This routine is meant to use for things like scalar-vector multiplication,
441 * where the vectors are stored in a merged format like [x0 y0 z0 x1 y1 z1 ...],
442 * while the scalars are stored as [s0 s1 s2...], and the data cannot easily
443 * be changed to SIMD-friendly layout.
445 * In this case, load 3 full-width SIMD variables from the vector array (This
446 * will always correspond to GMX_SIMD_FLOAT_WIDTH triplets),
447 * load a single full-width variable from the scalar array, and
448 * call this routine to expand the data. You can then simply multiply the
449 * first, second and third pair of SIMD variables, and store the three
450 * results back into a suitable vector-format array.
452 static inline void gmx_simdcall
453 expandScalarsToTriplets(SimdFloat scalar,
454 SimdFloat * triplets0,
455 SimdFloat * triplets1,
456 SimdFloat * triplets2)
458 for (std::size_t i = 0; i < scalar.simdInternal_.size(); i++)
460 triplets0->simdInternal_[i] = scalar.simdInternal_[i / 3];
461 triplets1->simdInternal_[i] = scalar.simdInternal_[(i + scalar.simdInternal_.size()) / 3];
462 triplets2->simdInternal_[i] = scalar.simdInternal_[(i + 2 * scalar.simdInternal_.size()) / 3];
466 /*! \brief Load 4 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets
467 * specified by a SIMD integer, transpose into 4 SIMD float variables.
469 * \tparam align Alignment of the memory from which we read, i.e. distance
470 * (measured in elements, not bytes) between index points.
471 * When this is identical to the number of SIMD variables
472 * (i.e., 4 for this routine) the input data is packed without
473 * padding in memory. See the SIMD parameters for exactly
474 * what memory positions are loaded.
475 * \param base Aligned pointer to the start of the memory.
476 * \param offset SIMD integer type with offsets to the start of each triplet.
477 * \param[out] v0 First component, base[align*offset[i]] for each i.
478 * \param[out] v1 Second component, base[align*offset[i] + 1] for each i.
479 * \param[out] v2 Third component, base[align*offset[i] + 2] for each i.
480 * \param[out] v3 Fourth component, base[align*offset[i] + 3] for each i.
482 * The floating-point memory locations must be aligned, but only to the smaller
483 * of four elements and the floating-point SIMD width.
485 * \note You should NOT scale offsets before calling this routine; it is
486 * done internally by using the alignment template parameter instead.
487 * \note This is a special routine primarily intended for loading Gromacs
488 * table data as efficiently as possible - this is the reason for using
489 * a SIMD offset index, since the result of the real-to-integer conversion
490 * is present in a SIMD register just before calling this routine.
492 template <int align>
493 static inline void gmx_simdcall
494 gatherLoadBySimdIntTranspose(const float * base,
495 SimdFInt32 offset,
496 SimdFloat * v0,
497 SimdFloat * v1,
498 SimdFloat * v2,
499 SimdFloat * v3)
501 // Base pointer must be aligned to the smaller of 4 elements and float SIMD width
502 assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4)*sizeof(float)) == 0);
503 // align parameter must also be a multiple of the above alignment requirement
504 assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 4) == 0);
506 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
508 v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
509 v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
510 v2->simdInternal_[i] = base[align * offset.simdInternal_[i] + 2];
511 v3->simdInternal_[i] = base[align * offset.simdInternal_[i] + 3];
516 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets
517 * (unaligned) specified by SIMD integer, transpose into 2 SIMD floats.
519 * \tparam align Alignment of the memory from which we read, i.e. distance
520 * (measured in elements, not bytes) between index points.
521 * When this is identical to the number of SIMD variables
522 * (i.e., 2 for this routine) the input data is packed without
523 * padding in memory. See the SIMD parameters for exactly
524 * what memory positions are loaded.
525 * \param base Pointer to the start of the memory.
526 * \param offset SIMD integer type with offsets to the start of each triplet.
527 * \param[out] v0 First component, base[align*offset[i]] for each i.
528 * \param[out] v1 Second component, base[align*offset[i] + 1] for each i.
530 * Since some SIMD architectures cannot handle any unaligned loads, this routine
531 * is only available if GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE is 1.
533 * \note You should NOT scale offsets before calling this routine; it is
534 * done internally by using the alignment template parameter instead.
535 * \note This is a special routine primarily intended for loading Gromacs
536 * table data as efficiently as possible - this is the reason for using
537 * a SIMD offset index, since the result of the real-to-integer conversion
538 * is present in a SIMD register just before calling this routine.
540 template <int align>
541 static inline void gmx_simdcall
542 gatherLoadUBySimdIntTranspose(const float * base,
543 SimdFInt32 offset,
544 SimdFloat * v0,
545 SimdFloat * v1)
547 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
549 v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
550 v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
554 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets
555 * specified by a SIMD integer, transpose into 2 SIMD float variables.
557 * \tparam align Alignment of the memory from which we read, i.e. distance
558 * (measured in elements, not bytes) between index points.
559 * When this is identical to the number of SIMD variables
560 * (i.e., 2 for this routine) the input data is packed without
561 * padding in memory. See the SIMD parameters for exactly
562 * what memory positions are loaded.
563 * \param base Aligned pointer to the start of the memory.
564 * \param offset SIMD integer type with offsets to the start of each triplet.
565 * \param[out] v0 First component, base[align*offset[i]] for each i.
566 * \param[out] v1 Second component, base[align*offset[i] + 1] for each i.
568 * The floating-point memory locations must be aligned, but only to the smaller
569 * of two elements and the floating-point SIMD width.
571 * \note You should NOT scale offsets before calling this routine; it is
572 * done internally by using the alignment template parameter instead.
573 * \note This is a special routine primarily intended for loading Gromacs
574 * table data as efficiently as possible - this is the reason for using
575 * a SIMD offset index, since the result of the real-to-integer conversion
576 * is present in a SIMD register just before calling this routine.
578 template <int align>
579 static inline void gmx_simdcall
580 gatherLoadBySimdIntTranspose(const float * base,
581 SimdFInt32 offset,
582 SimdFloat * v0,
583 SimdFloat * v1)
585 // Base pointer must be aligned to the smaller of 2 elements and float SIMD width
586 assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2)*sizeof(float)) == 0);
587 // align parameter must also be a multiple of the above alignment requirement
588 assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 2) == 0);
590 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
592 v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
593 v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
598 /*! \brief Reduce each of four SIMD floats, add those values to four consecutive
599 * floats in memory, return sum.
601 * \param m Pointer to memory where four floats should be incremented
602 * \param v0 SIMD variable whose sum should be added to m[0]
603 * \param v1 SIMD variable whose sum should be added to m[1]
604 * \param v2 SIMD variable whose sum should be added to m[2]
605 * \param v3 SIMD variable whose sum should be added to m[3]
607 * \return Sum of all elements in the four SIMD variables.
609 * The pointer m must be aligned to the smaller of four elements and the
610 * floating-point SIMD width.
612 * \note This is a special routine intended for the Gromacs nonbonded kernels.
613 * It is used in the epilogue of the outer loop, where the variables will
614 * contain unrolled forces for one outer-loop-particle each, corresponding to
615 * a single coordinate (i.e, say, four x-coordinate force variables). These
616 * should be summed and added to the force array in memory. Since we always work
617 * with contiguous SIMD-layout , we can use efficient aligned loads/stores.
618 * When calculating the virial, we also need the total sum of all forces for
619 * each coordinate. This is provided as the return value. For routines that
620 * do not need these, this extra code will be optimized away completely if you
621 * just ignore the return value (Checked with gcc-4.9.1 and clang-3.6 for AVX).
623 static inline float gmx_simdcall
624 reduceIncr4ReturnSum(float * m,
625 SimdFloat v0,
626 SimdFloat v1,
627 SimdFloat v2,
628 SimdFloat v3)
630 float sum[4]; // Note that the 4 here corresponds to the 4 m-elements, not any SIMD width
632 // Make sure the memory pointer is aligned to the smaller of 4 elements and float SIMD width
633 assert(std::size_t(m) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4)*sizeof(float)) == 0);
635 sum[0] = reduce(v0);
636 sum[1] = reduce(v1);
637 sum[2] = reduce(v2);
638 sum[3] = reduce(v3);
640 m[0] += sum[0];
641 m[1] += sum[1];
642 m[2] += sum[2];
643 m[3] += sum[3];
645 return sum[0] + sum[1] + sum[2] + sum[3];
649 /*! \}
651 * \name Higher-level SIMD utilities accessing partial (half-width) SIMD floats.
653 * These functions are optional. The are only useful for SIMD implementation
654 * where the width is 8 or larger, and where it would be inefficient
655 * to process 4*8, 8*8, or more, interactions in parallel.
657 * Currently, only Intel provides very wide SIMD implementations, but these
658 * also come with excellent support for loading, storing, accessing and
659 * shuffling parts of the register in so-called 'lanes' of 4 bytes each.
660 * We can use this to load separate parts into the low/high halves of the
661 * register in the inner loop of the nonbonded kernel, which e.g. makes it
662 * possible to process 4*4 nonbonded interactions as a pattern of 2*8. We
663 * can also use implementations with width 16 or greater.
665 * To make this more generic, when \ref GMX_SIMD_HAVE_HSIMD_UTIL_REAL is 1,
666 * the SIMD implementation provides seven special routines that:
668 * - Load the low/high parts of a SIMD variable from different pointers
669 * - Load half the SIMD width from one pointer, and duplicate in low/high parts
670 * - Load two reals, put 1st one in all low elements, and 2nd in all high ones.
671 * - Store the low/high parts of a SIMD variable to different pointers
672 * - Subtract both SIMD halves from a single half-SIMD-width memory location.
673 * - Load aligned pairs (LJ parameters) from two base pointers, with a common
674 * offset list, and put these in the low/high SIMD halves.
675 * - Reduce each half of two SIMD registers (i.e., 4 parts in total), increment
676 * four adjacent memory positions, and return the total sum.
678 * Remember: this is ONLY used when the native SIMD width is large. You will
679 * just waste time if you implement it for normal 16-byte SIMD architectures.
681 * This is part of the new C++ SIMD interface, so these functions are only
682 * available when using C++. Since some Gromacs code reliying on the SIMD
683 * module is still C (not C++), we have kept the C-style naming for now - this
684 * will change once we are entirely C++.
686 * \{
690 /*! \brief Load low & high parts of SIMD float from different locations.
692 * \param m0 Pointer to memory aligned to half SIMD width.
693 * \param m1 Pointer to memory aligned to half SIMD width.
695 * \return SIMD variable with low part loaded from m0, high from m1.
697 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
699 static inline SimdFloat gmx_simdcall
700 loadDualHsimd(const float * m0,
701 const float * m1)
703 SimdFloat a;
705 // Make sure the memory pointers are aligned to half float SIMD width
706 assert(std::size_t(m0) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
707 assert(std::size_t(m1) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
709 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
711 a.simdInternal_[i] = m0[i];
712 a.simdInternal_[a.simdInternal_.size()/2 + i] = m1[i];
714 return a;
717 /*! \brief Load half-SIMD-width float data, spread to both halves.
719 * \param m Pointer to memory aligned to half SIMD width.
721 * \return SIMD variable with both halves loaded from m..
723 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
725 static inline SimdFloat gmx_simdcall
726 loadDuplicateHsimd(const float * m)
728 SimdFloat a;
730 // Make sure the memory pointer is aligned
731 assert(std::size_t(m) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
733 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
735 a.simdInternal_[i] = m[i];
736 a.simdInternal_[a.simdInternal_.size()/2 + i] = a.simdInternal_[i];
738 return a;
741 /*! \brief Load two floats, spread 1st in low half, 2nd in high half.
743 * \param m Pointer to two adjacent float values.
745 * \return SIMD variable where all elements in the low half have been set
746 * to m[0], and all elements in high half to m[1].
748 * \note This routine always loads two values and sets the halves separately.
749 * If you want to set all elements to the same value, simply use
750 * the standard (non-half-SIMD) operations.
752 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
754 static inline SimdFloat gmx_simdcall
755 load1DualHsimd(const float * m)
757 SimdFloat a;
759 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
761 a.simdInternal_[i] = m[0];
762 a.simdInternal_[a.simdInternal_.size()/2 + i] = m[1];
764 return a;
768 /*! \brief Store low & high parts of SIMD float to different locations.
770 * \param m0 Pointer to memory aligned to half SIMD width.
771 * \param m1 Pointer to memory aligned to half SIMD width.
772 * \param a SIMD variable. Low half should be stored to m0, high to m1.
774 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
776 static inline void gmx_simdcall
777 storeDualHsimd(float * m0,
778 float * m1,
779 SimdFloat a)
781 // Make sure the memory pointers are aligned to half float SIMD width
782 assert(std::size_t(m0) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
783 assert(std::size_t(m1) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
785 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
787 m0[i] = a.simdInternal_[i];
788 m1[i] = a.simdInternal_[a.simdInternal_.size()/2 + i];
792 /*! \brief Add each half of SIMD variable to separate memory adresses
794 * \param m0 Pointer to memory aligned to half SIMD width.
795 * \param m1 Pointer to memory aligned to half SIMD width.
796 * \param a SIMD variable. Lower half will be added to m0, upper half to m1.
798 * The memory must be aligned to half SIMD width.
800 * \note The updated m0 value is written before m1 is read from memory, so
801 * the result will be correct even if the memory regions overlap.
803 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
805 static inline void gmx_simdcall
806 incrDualHsimd(float * m0,
807 float * m1,
808 SimdFloat a)
810 // Make sure the memory pointer is aligned to half float SIMD width
811 assert(std::size_t(m0) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
812 assert(std::size_t(m1) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
814 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
816 m0[i] += a.simdInternal_[i];
818 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
820 m1[i] += a.simdInternal_[a.simdInternal_.size()/2 + i];
824 /*! \brief Add the two halves of a SIMD float, subtract the sum from
825 * half-SIMD-width consecutive floats in memory.
827 * \param m half-width aligned memory, from which sum of the halves will be subtracted.
828 * \param a SIMD variable. Upper & lower halves will first be added.
830 * If the SIMD width is 8 and contains [a b c d e f g h], the
831 * memory will be modified to [m[0]-(a+e) m[1]-(b+f) m[2]-(c+g) m[3]-(d+h)].
833 * The memory must be aligned to half SIMD width.
835 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
837 static inline void gmx_simdcall
838 decrHsimd(float * m,
839 SimdFloat a)
841 // Make sure the memory pointer is aligned to half float SIMD width
842 assert(std::size_t(m) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
844 for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
846 m[i] -= a.simdInternal_[i] + a.simdInternal_[a.simdInternal_.size()/2 + i];
850 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH/2 offsets,
851 * transpose into SIMD float (low half from base0, high from base1).
853 * \tparam align Alignment of the storage, i.e. the distance
854 * (measured in elements, not bytes) between index points.
855 * When this is identical to the number of output components
856 * the data is packed without padding. This must be a
857 * multiple of the alignment to keep all data aligned.
858 * \param base0 Pointer to base of first aligned memory
859 * \param base1 Pointer to base of second aligned memory
860 * \param offset Offset to the start of each pair
861 * \param[out] v0 1st element in each pair, base0 in low and base1 in high half.
862 * \param[out] v1 2nd element in each pair, base0 in low and base1 in high half.
864 * The offset array should be of half the SIMD width length, so it corresponds
865 * to the half-SIMD-register operations. This also means it must be aligned
866 * to half the integer SIMD width (i.e., GMX_SIMD_FINT32_WIDTH/2).
868 * The floating-point memory locations must be aligned, but only to the smaller
869 * of two elements and the floating-point SIMD width.
871 * This routine is primarily designed to load nonbonded parameters in the
872 * kernels. It is the equivalent of the full-width routine
873 * gatherLoadTranspose(), but just
874 * as the other hsimd routines it will pick half-SIMD-width data from base0
875 * and put in the lower half, while the upper half comes from base1.
877 * For an example, assume the SIMD width is 8, align is 2, that
878 * base0 is [A0 A1 B0 B1 C0 C1 D0 D1 ...], and base1 [E0 E1 F0 F1 G0 G1 H0 H1...].
880 * Then we will get v0 as [A0 B0 C0 D0 E0 F0 G0 H0] and v1 as [A1 B1 C1 D1 E1 F1 G1 H1].
882 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
884 template <int align>
885 static inline void gmx_simdcall
886 gatherLoadTransposeHsimd(const float * base0,
887 const float * base1,
888 const std::int32_t offset[],
889 SimdFloat * v0,
890 SimdFloat * v1)
892 // Offset list must be aligned for half SIMD FINT32 width
893 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH/2*sizeof(std::int32_t)) == 0);
894 // base pointers must be aligned to the smaller of 2 elements and float SIMD width
895 assert(std::size_t(base0) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2)*sizeof(float)) == 0);
896 assert(std::size_t(base1) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2)*sizeof(float)) == 0);
897 // alignment parameter must be also be multiple of the above required alignment
898 assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 2) == 0);
900 for (std::size_t i = 0; i < v0->simdInternal_.size()/2; i++)
902 v0->simdInternal_[i] = base0[align * offset[i]];
903 v1->simdInternal_[i] = base0[align * offset[i] + 1];
904 v0->simdInternal_[v0->simdInternal_.size()/2 + i] = base1[align * offset[i]];
905 v1->simdInternal_[v1->simdInternal_.size()/2 + i] = base1[align * offset[i] + 1];
909 /*! \brief Reduce the 4 half-SIMD-with floats in 2 SIMD variables (sum halves),
910 * increment four consecutive floats in memory, return sum.
912 * \param m Pointer to memory where the four values should be incremented
913 * \param v0 Variable whose half-SIMD sums should be added to m[0]/m[1], respectively.
914 * \param v1 Variable whose half-SIMD sums should be added to m[2]/m[3], respectively.
916 * \return Sum of all elements in the four SIMD variables.
918 * The pointer m must be aligned, but only to the smaller
919 * of four elements and the floating-point SIMD width.
921 * \note This is the half-SIMD-width version of
922 * reduceIncr4ReturnSum(). The only difference is that the
923 * four half-SIMD inputs needed are present in the low/high halves of the
924 * two SIMD arguments.
926 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
928 static inline float gmx_simdcall
929 reduceIncr4ReturnSumHsimd(float * m,
930 SimdFloat v0,
931 SimdFloat v1)
933 // The 4 here corresponds to the 4 elements in memory, not any SIMD width
934 float sum[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
936 for (std::size_t i = 0; i < v0.simdInternal_.size()/2; i++)
938 sum[0] += v0.simdInternal_[i];
939 sum[1] += v0.simdInternal_[v0.simdInternal_.size()/2 + i];
940 sum[2] += v1.simdInternal_[i];
941 sum[3] += v1.simdInternal_[v1.simdInternal_.size()/2 + i];
944 // Make sure the memory pointer is aligned to the smaller of 4 elements and float SIMD width
945 assert(std::size_t(m) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4)*sizeof(float)) == 0);
947 m[0] += sum[0];
948 m[1] += sum[1];
949 m[2] += sum[2];
950 m[3] += sum[3];
952 return sum[0] + sum[1] + sum[2] + sum[3];
955 /*! \} */
957 /*! \} */
958 /*! \endcond */
960 } // namespace gmx
962 #endif // GMX_SIMD_IMPL_REFERENCE_UTIL_FLOAT_H