2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2016,2017,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef GMX_SIMD_SCALAR_UTIL_H
36 #define GMX_SIMD_SCALAR_UTIL_H
/*! \libinternal \file
 *
 * \brief Scalar utility functions mimicking GROMACS SIMD utility functions
 *
 * These versions make it possible to write functions that are templated with
 * either a SIMD or scalar type. While some of these functions might not appear
 * SIMD-specific, we have placed them here because the only reason to use these
 * instead of generic functions is in templated combined SIMD/non-SIMD code.
 *
 * \author Erik Lindahl <erik.lindahl@gmail.com>
 *
 * \ingroup module_simd
 */
58 /*****************************************************************************
59 * Single-precision utility load/store functions mimicking SIMD versions *
60 *****************************************************************************/
/*! \brief Load 4 consecutive floats from base/offset into four variables
 *
 * \tparam align     Factor (in elements) the offset is multiplied by to find
 *                   the first element to read.
 * \param  base      Pointer to the start of the memory area
 * \param  offset    Index to data; only offset[0] is read.
 * \param[out] v0    1st float, base[align*offset[0]].
 * \param[out] v1    2nd float, base[align*offset[0] + 1].
 * \param[out] v2    3rd float, base[align*offset[0] + 2].
 * \param[out] v3    4th float, base[align*offset[0] + 3].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void gatherLoadTranspose(const float*       base,
                                       const std::int32_t offset[],
                                       float*             v0,
                                       float*             v1,
                                       float*             v2,
                                       float*             v3)
{
    // All four reads are relative to the same scaled start index.
    const std::int32_t first = align * offset[0];

    *v0 = base[first];
    *v1 = base[first + 1];
    *v2 = base[first + 2];
    *v3 = base[first + 3];
}
/*! \brief Load 2 consecutive floats from base/offset into two variables
 *
 * \tparam align     Factor (in elements) the offset is multiplied by to find
 *                   the first element to read.
 * \param  base      Pointer to the start of the memory area
 * \param  offset    Index to data; only offset[0] is read.
 * \param[out] v0    1st float, base[align*offset[0]].
 * \param[out] v1    2nd float, base[align*offset[0] + 1].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void gatherLoadTranspose(const float* base, const std::int32_t offset[], float* v0, float* v1)
{
    // Both reads are relative to the same scaled start index.
    const std::int32_t first = align * offset[0];

    *v0 = base[first];
    *v1 = base[first + 1];
}
/*! \brief Load 3 consecutive floats from base/offsets, store into three vars.
 *
 * \tparam align     Alignment of the memory from which we read, i.e. distance
 *                   (measured in elements, not bytes) between index points.
 * \param  base      Pointer to the start of the memory area
 * \param  offset    Offset to the start of data; only offset[0] is read.
 * \param[out] v0    1st value, base[align*offset[0]].
 * \param[out] v1    2nd value, base[align*offset[0] + 1].
 * \param[out] v2    3rd value, base[align*offset[0] + 2].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void gatherLoadUTranspose(const float*       base,
                                        const std::int32_t offset[],
                                        float*             v0,
                                        float*             v1,
                                        float*             v2)
{
    // The three reads are relative to the same scaled start index.
    const std::int32_t first = align * offset[0];

    *v0 = base[first];
    *v1 = base[first + 1];
    *v2 = base[first + 2];
}
/*! \brief Store 3 floats to 3 consecutive locations at base/offset.
 *
 * \tparam align     Alignment of the memory to which we write, i.e. distance
 *                   (measured in elements, not bytes) between index points.
 * \param[out] base  Pointer to the start of the memory area
 * \param  offset    Offset to the start of triplet; only offset[0] is read.
 * \param  v0        1st value, written to base[align*offset[0]].
 * \param  v1        2nd value, written to base[align*offset[0] + 1].
 * \param  v2        3rd value, written to base[align*offset[0] + 2].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void transposeScatterStoreU(float* base, const std::int32_t offset[], float v0, float v1, float v2)
{
    // The triplet is written starting at the scaled index.
    const std::int32_t first = align * offset[0];

    base[first]     = v0;
    base[first + 1] = v1;
    base[first + 2] = v2;
}
/*! \brief Add 3 floats to base/offset.
 *
 * \tparam align        Alignment of the memory to which we write, i.e. distance
 *                      (measured in elements, not bytes) between index points.
 * \param[in,out] base  Pointer to the start of the memory area
 * \param  offset       Offset to the start of triplet; only offset[0] is read.
 * \param  v0           1st value, added to base[align*offset[0]].
 * \param  v1           2nd value, added to base[align*offset[0] + 1].
 * \param  v2           3rd value, added to base[align*offset[0] + 2].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void transposeScatterIncrU(float* base, const std::int32_t offset[], float v0, float v1, float v2)
{
    // The triplet to update starts at the scaled index.
    const std::int32_t first = align * offset[0];

    base[first] += v0;
    base[first + 1] += v1;
    base[first + 2] += v2;
}
/*! \brief Subtract 3 floats from base/offset.
 *
 * \tparam align        Alignment of the memory to which we write, i.e. distance
 *                      (measured in elements, not bytes) between index points.
 * \param[in,out] base  Pointer to the start of the memory area
 * \param  offset       Offset to the start of triplet; only offset[0] is read.
 * \param  v0           1st value, subtracted from base[align*offset[0]].
 * \param  v1           2nd value, subtracted from base[align*offset[0] + 1].
 * \param  v2           3rd value, subtracted from base[align*offset[0] + 2].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void transposeScatterDecrU(float* base, const std::int32_t offset[], float v0, float v1, float v2)
{
    // The triplet to update starts at the scaled index.
    const std::int32_t first = align * offset[0];

    base[first] -= v0;
    base[first + 1] -= v1;
    base[first + 2] -= v2;
}
/*! \brief Copy single float to three variables.
 *
 * \param      scalar     Floating-point input.
 * \param[out] triplets0  Copy 1.
 * \param[out] triplets1  Copy 2.
 * \param[out] triplets2  Copy 3.
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
static inline void expandScalarsToTriplets(float scalar, float* triplets0, float* triplets1, float* triplets2)
{
    // Each output receives an identical copy of the input.
    *triplets0 = scalar;
    *triplets1 = scalar;
    *triplets2 = scalar;
}
/*! \brief Load 4 floats from base/offsets and store into variables.
 *
 * \tparam align     Alignment of the memory from which we read, i.e. distance
 *                   (measured in elements, not bytes) between index points.
 * \param  base      Pointer to the start of the memory.
 * \param  offset    Integer offset to the start of the data.
 * \param[out] v0    First float, base[align*offset].
 * \param[out] v1    Second float, base[align*offset + 1].
 * \param[out] v2    Third float, base[align*offset + 2].
 * \param[out] v3    Fourth float, base[align*offset + 3].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void gatherLoadBySimdIntTranspose(const float* base,
                                                std::int32_t offset,
                                                float*       v0,
                                                float*       v1,
                                                float*       v2,
                                                float*       v3)
{
    // In the scalar case the "SIMD integer" is a single plain offset.
    const std::int32_t first = align * offset;

    *v0 = base[first];
    *v1 = base[first + 1];
    *v2 = base[first + 2];
    *v3 = base[first + 3];
}
/*! \brief Load 2 floats from base/offsets and store into variables (unaligned).
 *
 * \tparam align     Alignment of the memory from which we read, i.e. distance
 *                   (measured in elements, not bytes) between index points.
 * \param  base      Pointer to the start of the memory.
 * \param  offset    Integer offset to the start of the data.
 * \param[out] v0    First float, base[align*offset].
 * \param[out] v1    Second float, base[align*offset + 1].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void gatherLoadUBySimdIntTranspose(const float* base, std::int32_t offset, float* v0, float* v1)
{
    // In the scalar case the "SIMD integer" is a single plain offset.
    const std::int32_t first = align * offset;

    *v0 = base[first];
    *v1 = base[first + 1];
}
/*! \brief Load 2 floats from base/offsets and store into variables (aligned).
 *
 * \tparam align     Alignment of the memory from which we read, i.e. distance
 *                   (measured in elements, not bytes) between index points.
 * \param  base      Pointer to the start of the memory.
 * \param  offset    Integer offset to the start of the data.
 * \param[out] v0    First float, base[align*offset].
 * \param[out] v1    Second float, base[align*offset + 1].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void gatherLoadBySimdIntTranspose(const float* base, std::int32_t offset, float* v0, float* v1)
{
    // In the scalar case the "SIMD integer" is a single plain offset.
    const std::int32_t first = align * offset;

    *v0 = base[first];
    *v1 = base[first + 1];
}
/*! \brief Add each float to four consecutive memory locations, return sum.
 *
 * \param[in,out] m  Pointer to memory where four floats should be incremented
 * \param  v0        float to be added to m[0]
 * \param  v1        float to be added to m[1]
 * \param  v2        float to be added to m[2]
 * \param  v3        float to be added to m[3]
 *
 * \return v0+v1+v2+v3.
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
static inline float reduceIncr4ReturnSum(float* m, float v0, float v1, float v2, float v3)
{
    // Increment the four memory locations as documented ...
    m[0] += v0;
    m[1] += v1;
    m[2] += v2;
    m[3] += v3;

    // ... and return the sum of the inputs (not of the updated memory).
    return v0 + v1 + v2 + v3;
}
303 /*****************************************************************************
304 * Double-precision utility load/store functions mimicking SIMD versions *
305 *****************************************************************************/
/*! \brief Load 4 consecutive doubles from base/offset into four variables
 *
 * \tparam align     Factor (in elements) the offset is multiplied by to find
 *                   the first element to read.
 * \param  base      Pointer to the start of the memory area
 * \param  offset    Index to data; only offset[0] is read.
 * \param[out] v0    1st double, base[align*offset[0]].
 * \param[out] v1    2nd double, base[align*offset[0] + 1].
 * \param[out] v2    3rd double, base[align*offset[0] + 2].
 * \param[out] v3    4th double, base[align*offset[0] + 3].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void gatherLoadTranspose(const double*      base,
                                       const std::int32_t offset[],
                                       double*            v0,
                                       double*            v1,
                                       double*            v2,
                                       double*            v3)
{
    // All four reads are relative to the same scaled start index.
    const std::int32_t first = align * offset[0];

    *v0 = base[first];
    *v1 = base[first + 1];
    *v2 = base[first + 2];
    *v3 = base[first + 3];
}
/*! \brief Load 2 consecutive doubles from base/offset into two variables
 *
 * \tparam align     Factor (in elements) the offset is multiplied by to find
 *                   the first element to read.
 * \param  base      Pointer to the start of the memory area
 * \param  offset    Index to data; only offset[0] is read.
 * \param[out] v0    1st double, base[align*offset[0]].
 * \param[out] v1    2nd double, base[align*offset[0] + 1].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void gatherLoadTranspose(const double* base, const std::int32_t offset[], double* v0, double* v1)
{
    // Both reads are relative to the same scaled start index.
    const std::int32_t first = align * offset[0];

    *v0 = base[first];
    *v1 = base[first + 1];
}
/*! \brief Load 3 consecutive doubles from base/offsets, store into three vars.
 *
 * \tparam align     Alignment of the memory from which we read, i.e. distance
 *                   (measured in elements, not bytes) between index points.
 * \param  base      Pointer to the start of the memory area
 * \param  offset    Offset to the start of data; only offset[0] is read.
 * \param[out] v0    1st double, base[align*offset[0]].
 * \param[out] v1    2nd double, base[align*offset[0] + 1].
 * \param[out] v2    3rd double, base[align*offset[0] + 2].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void gatherLoadUTranspose(const double*      base,
                                        const std::int32_t offset[],
                                        double*            v0,
                                        double*            v1,
                                        double*            v2)
{
    // The three reads are relative to the same scaled start index.
    const std::int32_t first = align * offset[0];

    *v0 = base[first];
    *v1 = base[first + 1];
    *v2 = base[first + 2];
}
/*! \brief Store 3 doubles to 3 consecutive locations at base/offset.
 *
 * \tparam align     Alignment of the memory to which we write, i.e. distance
 *                   (measured in elements, not bytes) between index points.
 * \param[out] base  Pointer to the start of the memory area
 * \param  offset    Offset to the start of triplet; only offset[0] is read.
 * \param  v0        1st value, written to base[align*offset[0]].
 * \param  v1        2nd value, written to base[align*offset[0] + 1].
 * \param  v2        3rd value, written to base[align*offset[0] + 2].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void transposeScatterStoreU(double* base, const std::int32_t offset[], double v0, double v1, double v2)
{
    // The triplet is written starting at the scaled index.
    const std::int32_t first = align * offset[0];

    base[first]     = v0;
    base[first + 1] = v1;
    base[first + 2] = v2;
}
/*! \brief Add 3 doubles to base/offset.
 *
 * \tparam align        Alignment of the memory to which we write, i.e. distance
 *                      (measured in elements, not bytes) between index points.
 * \param[in,out] base  Pointer to the start of the memory area
 * \param  offset       Offset to the start of triplet; only offset[0] is read.
 * \param  v0           1st value, added to base[align*offset[0]].
 * \param  v1           2nd value, added to base[align*offset[0] + 1].
 * \param  v2           3rd value, added to base[align*offset[0] + 2].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void transposeScatterIncrU(double* base, const std::int32_t offset[], double v0, double v1, double v2)
{
    // The triplet to update starts at the scaled index.
    const std::int32_t first = align * offset[0];

    base[first] += v0;
    base[first + 1] += v1;
    base[first + 2] += v2;
}
/*! \brief Subtract 3 doubles from base/offset.
 *
 * \tparam align        Alignment of the memory to which we write, i.e. distance
 *                      (measured in elements, not bytes) between index points.
 * \param[in,out] base  Pointer to the start of the memory area
 * \param  offset       Offset to the start of triplet; only offset[0] is read.
 * \param  v0           1st value, subtracted from base[align*offset[0]].
 * \param  v1           2nd value, subtracted from base[align*offset[0] + 1].
 * \param  v2           3rd value, subtracted from base[align*offset[0] + 2].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void transposeScatterDecrU(double* base, const std::int32_t offset[], double v0, double v1, double v2)
{
    // The triplet to update starts at the scaled index.
    const std::int32_t first = align * offset[0];

    base[first] -= v0;
    base[first + 1] -= v1;
    base[first + 2] -= v2;
}
/*! \brief Copy single double to three variables.
 *
 * \param      scalar     Floating-point input.
 * \param[out] triplets0  Copy 1.
 * \param[out] triplets1  Copy 2.
 * \param[out] triplets2  Copy 3.
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
static inline void expandScalarsToTriplets(double scalar, double* triplets0, double* triplets1, double* triplets2)
{
    // Each output receives an identical copy of the input.
    *triplets0 = scalar;
    *triplets1 = scalar;
    *triplets2 = scalar;
}
/*! \brief Load 4 doubles from base/offsets and store into variables.
 *
 * \tparam align     Alignment of the memory from which we read, i.e. distance
 *                   (measured in elements, not bytes) between index points.
 * \param  base      Pointer to the start of the memory.
 * \param  offset    Integer offset to the start of the data.
 * \param[out] v0    First double, base[align*offset].
 * \param[out] v1    Second double, base[align*offset + 1].
 * \param[out] v2    Third double, base[align*offset + 2].
 * \param[out] v3    Fourth double, base[align*offset + 3].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void gatherLoadBySimdIntTranspose(const double* base,
                                                std::int32_t  offset,
                                                double*       v0,
                                                double*       v1,
                                                double*       v2,
                                                double*       v3)
{
    // In the scalar case the "SIMD integer" is a single plain offset.
    const std::int32_t first = align * offset;

    *v0 = base[first];
    *v1 = base[first + 1];
    *v2 = base[first + 2];
    *v3 = base[first + 3];
}
/*! \brief Load 2 doubles from base/offsets and store into variables (unaligned).
 *
 * \tparam align     Alignment of the memory from which we read, i.e. distance
 *                   (measured in elements, not bytes) between index points.
 * \param  base      Pointer to the start of the memory.
 * \param  offset    Integer offset to the start of the data.
 * \param[out] v0    First double, base[align*offset].
 * \param[out] v1    Second double, base[align*offset + 1].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void gatherLoadUBySimdIntTranspose(const double* base, std::int32_t offset, double* v0, double* v1)
{
    // In the scalar case the "SIMD integer" is a single plain offset.
    const std::int32_t first = align * offset;

    *v0 = base[first];
    *v1 = base[first + 1];
}
/*! \brief Load 2 doubles from base/offsets and store into variables (aligned).
 *
 * \tparam align     Alignment of the memory from which we read, i.e. distance
 *                   (measured in elements, not bytes) between index points.
 * \param  base      Pointer to the start of the memory.
 * \param  offset    Integer offset to the start of the data.
 * \param[out] v0    First double, base[align*offset].
 * \param[out] v1    Second double, base[align*offset + 1].
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
template<int align>
static inline void gatherLoadBySimdIntTranspose(const double* base, std::int32_t offset, double* v0, double* v1)
{
    // In the scalar case the "SIMD integer" is a single plain offset.
    const std::int32_t first = align * offset;

    *v0 = base[first];
    *v1 = base[first + 1];
}
/*! \brief Add each double to four consecutive memory locations, return sum.
 *
 * \param[in,out] m  Pointer to memory where four doubles should be incremented
 * \param  v0        double to be added to m[0]
 * \param  v1        double to be added to m[1]
 * \param  v2        double to be added to m[2]
 * \param  v3        double to be added to m[3]
 *
 * \return v0+v1+v2+v3.
 *
 * \note This function might be superficially meaningless, but it helps us to
 *       write templated SIMD/non-SIMD code. For clarity it should not be used
 *       outside such code.
 */
static inline double reduceIncr4ReturnSum(double* m, double v0, double v1, double v2, double v3)
{
    // Increment the four memory locations as documented ...
    m[0] += v0;
    m[1] += v1;
    m[2] += v2;
    m[3] += v3;

    // ... and return the sum of the inputs (not of the updated memory).
    return v0 + v1 + v2 + v3;
}
558 #endif // GMX_SIMD_SCALAR_UTIL_H