src/gromacs/simd/impl_reference/impl_reference_util_double.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2017,2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_REFERENCE_UTIL_DOUBLE_H
  37 #define GMX_SIMD_IMPL_REFERENCE_UTIL_DOUBLE_H
  38
  39 /*! \libinternal \file
  40  *
  41  * \brief Reference impl., higher-level double prec. SIMD utility functions
  42  *
  43  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  44  *
  45  * \ingroup module_simd
  46  */
  47
  48 /* Avoid adding dependencies on the rest of GROMACS here (e.g. gmxassert.h)
  49  * since we want to be able run the low-level SIMD implementations independently
  50  * in simulators for new hardware.
  51  */
  52
  53 #include "config.h"
  54
  55 #include <cassert>
  56 #include <cstddef>
  57 #include <cstdint>
  58
  59 #include <algorithm>
  60
  61 #include "impl_reference_definitions.h"
  62 #include "impl_reference_simd_double.h"
  63
  64 namespace gmx
  65 {
  66
  67 /*! \cond libapi */
  68 /*! \addtogroup module_simd */
  69 /*! \{ */
  70
  71 /*! \name Higher-level SIMD utility functions, double precision.
  72  *
  73  * These include generic functions to work with triplets of data, typically
  74  * coordinates, and a few utility functions to load and update data in the
  75  * nonbonded kernels. These functions should be available on all implementations.
  76  *
  77  * \{
  78  */
  79
   80 /*! \brief Load 4 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets,
  81  *         and transpose into 4 SIMD double variables.
  82  *
  83  * \tparam     align  Alignment of the memory from which we read, i.e. distance
  84  *                    (measured in elements, not bytes) between index points.
  85  *                    When this is identical to the number of SIMD variables
  86  *                    (i.e., 4 for this routine) the input data is packed without
  87  *                    padding in memory. See the SIMD parameters for exactly
  88  *                    what memory positions are loaded.
  89  * \param      base   Pointer to the start of the memory area
  90  * \param      offset Array with offsets to the start of each data point.
  91  * \param[out] v0     1st component of data, base[align*offset[i]] for each i.
  92  * \param[out] v1     2nd component of data, base[align*offset[i] + 1] for each i.
  93  * \param[out] v2     3rd component of data, base[align*offset[i] + 2] for each i.
  94  * \param[out] v3     4th component of data, base[align*offset[i] + 3] for each i.
  95  *
  96  * The floating-point memory locations must be aligned, but only to the smaller
  97  * of four elements and the floating-point SIMD width.
  98  *
  99  * The offset memory must be aligned to GMX_SIMD_DINT32_WIDTH.
 100  *
 101  * \note You should NOT scale offsets before calling this routine; it is
 102  *       done internally by using the alignment template parameter instead.
 103  */
 104 template<int align>
 105 static inline void gmx_simdcall gatherLoadTranspose(const double*      base,
 106                                                     const std::int32_t offset[],
 107                                                     SimdDouble*        v0,
 108                                                     SimdDouble*        v1,
 109                                                     SimdDouble*        v2,
 110                                                     SimdDouble*        v3)
 111 {
 112     // Offset list must be aligned for SIMD DINT32
 113     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH * sizeof(std::int32_t)) == 0);
 114     // Base pointer must be aligned to the smaller of 4 elements and double SIMD width
 115     assert(std::size_t(base) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 4) * sizeof(double)) == 0);
 116     // align parameter must also be a multiple of the above alignment requirement
 117     assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 4) == 0);
 118
 119     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 120     {
 121         v0->simdInternal_[i] = base[align * offset[i]];
 122         v1->simdInternal_[i] = base[align * offset[i] + 1];
 123         v2->simdInternal_[i] = base[align * offset[i] + 2];
 124         v3->simdInternal_[i] = base[align * offset[i] + 3];
 125     }
 126 }
 127
 128
  129 /*! \brief Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets,
 130  *         and transpose into 2 SIMD double variables.
 131  *
 132  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 133  *                    (measured in elements, not bytes) between index points.
 134  *                    When this is identical to the number of SIMD variables
 135  *                    (i.e., 2 for this routine) the input data is packed without
 136  *                    padding in memory. See the SIMD parameters for exactly
 137  *                    what memory positions are loaded.
 138  * \param      base   Pointer to the start of the memory area
 139  * \param      offset Array with offsets to the start of each data point.
 140  * \param[out] v0     1st component of data, base[align*offset[i]] for each i.
 141  * \param[out] v1     2nd component of data, base[align*offset[i] + 1] for each i.
 142  *
 143  * The floating-point memory locations must be aligned, but only to the smaller
 144  * of two elements and the floating-point SIMD width.
 145  *
 146  * The offset memory must be aligned to GMX_SIMD_DINT32_WIDTH.
 147  *
 148  * \note You should NOT scale offsets before calling this routine; it is
 149  *       done internally by using the alignment template parameter instead.
 150  */
 151 template<int align>
 152 static inline void gmx_simdcall
 153                    gatherLoadTranspose(const double* base, const std::int32_t offset[], SimdDouble* v0, SimdDouble* v1)
 154 {
 155     // Offset list must be aligned for SIMD DINT32
 156     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH * sizeof(std::int32_t)) == 0);
 157     // Base pointer must be aligned to the smaller of 2 elements and double SIMD width
 158     assert(std::size_t(base) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 2) * sizeof(double)) == 0);
 159     // align parameter must also be a multiple of the above alignment requirement
 160     assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 2) == 0);
 161
 162     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 163     {
 164         v0->simdInternal_[i] = base[align * offset[i]];
 165         v1->simdInternal_[i] = base[align * offset[i] + 1];
 166     }
 167 }
 168
 169
 170 /*! \brief Best alignment to use for aligned pairs of double data.
 171  *
 172  * \copydetails c_simdBestPairAlignmentFloat
 173  */
 174 static const int c_simdBestPairAlignmentDouble = 2;
 175
 176
 177 /*! \brief Load 3 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets,
 178  *         and transpose into 3 SIMD double variables.
 179  *
 180  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 181  *                    (measured in elements, not bytes) between index points.
 182  *                    When this is identical to the number of SIMD variables
 183  *                    (i.e., 3 for this routine) the input data is packed without
 184  *                    padding in memory. See the SIMD parameters for exactly
 185  *                    what memory positions are loaded.
 186  * \param      base   Pointer to the start of the memory area
 187  * \param      offset Array with offsets to the start of each data point.
 188  * \param[out] v0     1st component of data, base[align*offset[i]] for each i.
 189  * \param[out] v1     2nd component of data, base[align*offset[i] + 1] for each i.
 190  * \param[out] v2     3rd component of data, base[align*offset[i] + 2] for each i.
 191  *
 192  * This function can work with both aligned (better performance) and unaligned
 193  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 194  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 195  * we account for this.
 196  * However, in the case where align is a power-of-two, we assume the base pointer
 197  * also has the same alignment, which will enable many platforms to use faster
 198  * aligned memory load operations.
 199  * An easy way to think of this is that each triplet of data in memory must be
 200  * aligned to the align parameter you specify when it's a power-of-two.
 201  *
  202  * The offset memory must always be aligned to GMX_SIMD_DINT32_WIDTH, since this
 203  * enables us to use SIMD loads and gather operations on platforms that support it.
 204  *
 205  * \note You should NOT scale offsets before calling this routine; it is
 206  *       done internally by using the alignment template parameter instead.
 207  * \note This routine uses a normal array for the offsets, since we typically
 208  *       load this data from memory. On the architectures we have tested this
 209  *       is faster even when a SIMD integer datatype is present.
 210  * \note To improve performance, this function might use full-SIMD-width
 211  *       unaligned loads. This means you need to ensure the memory is padded
 212  *       at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
 213  *       starting at the last offset. If you use the Gromacs aligned memory
 214  *       allocation routines this will always be the case.
 215  */
 216 template<int align>
 217 static inline void gmx_simdcall gatherLoadUTranspose(const double*      base,
 218                                                      const std::int32_t offset[],
 219                                                      SimdDouble*        v0,
 220                                                      SimdDouble*        v1,
 221                                                      SimdDouble*        v2)
 222 {
 223     // Offset list must be aligned for SIMD DINT32
 224     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH * sizeof(std::int32_t)) == 0);
 225
 226     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 227     {
 228         v0->simdInternal_[i] = base[align * offset[i]];
 229         v1->simdInternal_[i] = base[align * offset[i] + 1];
 230         v2->simdInternal_[i] = base[align * offset[i] + 2];
 231     }
 232 }
 233
 234 /*! \brief Transpose and store 3 SIMD doubles to 3 consecutive addresses at
 235  *         GMX_SIMD_DOUBLE_WIDTH offsets.
 236  *
 237  * \tparam     align  Alignment of the memory to which we write, i.e. distance
 238  *                    (measured in elements, not bytes) between index points.
 239  *                    When this is identical to the number of SIMD variables
 240  *                    (i.e., 3 for this routine) the output data is packed without
 241  *                    padding in memory. See the SIMD parameters for exactly
 242  *                    what memory positions are written.
 243  * \param[out] base   Pointer to the start of the memory area
 244  * \param      offset Aligned array with offsets to the start of each triplet.
 245  * \param      v0     1st component of triplets, written to base[align*offset[i]].
 246  * \param      v1     2nd component of triplets, written to base[align*offset[i] + 1].
 247  * \param      v2     3rd component of triplets, written to base[align*offset[i] + 2].
 248  *
 249  * This function can work with both aligned (better performance) and unaligned
 250  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 251  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 252  * we account for this.
 253  * However, in the case where align is a power-of-two, we assume the base pointer
 254  * also has the same alignment, which will enable many platforms to use faster
 255  * aligned memory store operations.
 256  * An easy way to think of this is that each triplet of data in memory must be
 257  * aligned to the align parameter you specify when it's a power-of-two.
 258  *
  259  * The offset memory must always be aligned to GMX_SIMD_DINT32_WIDTH, since this
 260  * enables us to use SIMD loads and gather operations on platforms that support it.
 261  *
 262  * \note You should NOT scale offsets before calling this routine; it is
 263  *       done internally by using the alignment template parameter instead.
 264  * \note This routine uses a normal array for the offsets, since we typically
 265  *       load the data from memory. On the architectures we have tested this
 266  *       is faster even when a SIMD integer datatype is present.
 267  */
 268 template<int align>
 269 static inline void gmx_simdcall transposeScatterStoreU(double*            base,
 270                                                        const std::int32_t offset[],
 271                                                        SimdDouble         v0,
 272                                                        SimdDouble         v1,
 273                                                        SimdDouble         v2)
 274 {
 275     // Offset list must be aligned for SIMD DINT32
 276     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH * sizeof(std::int32_t)) == 0);
 277
 278     for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
 279     {
 280         base[align * offset[i]]     = v0.simdInternal_[i];
 281         base[align * offset[i] + 1] = v1.simdInternal_[i];
 282         base[align * offset[i] + 2] = v2.simdInternal_[i];
 283     }
 284 }
 285
 286
 287 /*! \brief Transpose and add 3 SIMD doubles to 3 consecutive addresses at
 288  *         GMX_SIMD_DOUBLE_WIDTH offsets.
 289  *
 290  * \tparam     align  Alignment of the memory to which we write, i.e. distance
 291  *                    (measured in elements, not bytes) between index points.
 292  *                    When this is identical to the number of SIMD variables
 293  *                    (i.e., 3 for this routine) the output data is packed without
 294  *                    padding in memory. See the SIMD parameters for exactly
 295  *                    what memory positions are incremented.
 296  * \param[out] base   Pointer to the start of the memory area
 297  * \param      offset Aligned array with offsets to the start of each triplet.
 298  * \param      v0     1st component of triplets, added to base[align*offset[i]].
 299  * \param      v1     2nd component of triplets, added to base[align*offset[i] + 1].
 300  * \param      v2     3rd component of triplets, added to base[align*offset[i] + 2].
 301  *
 302  * This function can work with both aligned (better performance) and unaligned
 303  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 304  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 305  * we account for this.
 306  * However, in the case where align is a power-of-two, we assume the base pointer
 307  * also has the same alignment, which will enable many platforms to use faster
 308  * aligned memory load/store operations.
 309  * An easy way to think of this is that each triplet of data in memory must be
 310  * aligned to the align parameter you specify when it's a power-of-two.
 311  *
  312  * The offset memory must always be aligned to GMX_SIMD_DINT32_WIDTH, since this
 313  * enables us to use SIMD loads and gather operations on platforms that support it.
 314  *
 315  * \note You should NOT scale offsets before calling this routine; it is
 316  *       done internally by using the alignment template parameter instead.
 317  * \note This routine uses a normal array for the offsets, since we typically
 318  *       load the data from memory. On the architectures we have tested this
 319  *       is faster even when a SIMD integer datatype is present.
 320  * \note To improve performance, this function might use full-SIMD-width
 321  *       unaligned load/store, and add 0.0 to the extra elements.
 322  *       This means you need to ensure the memory is padded
 323  *       at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
 324  *       starting at the last offset. If you use the Gromacs aligned memory
 325  *       allocation routines this will always be the case.
 326  */
 327 template<int align>
 328 static inline void gmx_simdcall
 329                    transposeScatterIncrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
 330 {
 331     // Offset list must be aligned for SIMD DINT32
 332     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH * sizeof(std::int32_t)) == 0);
 333
 334     for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
 335     {
 336         base[align * offset[i]] += v0.simdInternal_[i];
 337         base[align * offset[i] + 1] += v1.simdInternal_[i];
 338         base[align * offset[i] + 2] += v2.simdInternal_[i];
 339     }
 340 }
 341
 342 /*! \brief Transpose and subtract 3 SIMD doubles to 3 consecutive addresses at
 343  *         GMX_SIMD_DOUBLE_WIDTH offsets.
 344  *
 345  * \tparam     align  Alignment of the memory to which we write, i.e. distance
 346  *                    (measured in elements, not bytes) between index points.
 347  *                    When this is identical to the number of SIMD variables
 348  *                    (i.e., 3 for this routine) the output data is packed without
 349  *                    padding in memory. See the SIMD parameters for exactly
 350  *                    what memory positions are decremented.
 351  * \param[out] base    Pointer to start of memory.
 352  * \param      offset  Aligned array with offsets to the start of each triplet.
 353  * \param      v0      1st component, subtracted from base[align*offset[i]]
 354  * \param      v1      2nd component, subtracted from base[align*offset[i]+1]
 355  * \param      v2      3rd component, subtracted from base[align*offset[i]+2]
 356  *
 357  * This function can work with both aligned (better performance) and unaligned
 358  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 359  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 360  * we account for this.
 361  * However, in the case where align is a power-of-two, we assume the base pointer
 362  * also has the same alignment, which will enable many platforms to use faster
 363  * aligned memory load/store operations.
 364  * An easy way to think of this is that each triplet of data in memory must be
 365  * aligned to the align parameter you specify when it's a power-of-two.
 366  *
  367  * The offset memory must always be aligned to GMX_SIMD_DINT32_WIDTH, since this
 368  * enables us to use SIMD loads and gather operations on platforms that support it.
 369  *
 370  * \note You should NOT scale offsets before calling this routine; it is
 371  *       done internally by using the alignment template parameter instead.
 372  * \note This routine uses a normal array for the offsets, since we typically
 373  *       load the data from memory. On the architectures we have tested this
 374  *       is faster even when a SIMD integer datatype is present.
 375  * \note To improve performance, this function might use full-SIMD-width
 376  *       unaligned load/store, and subtract 0.0 from the extra elements.
 377  *       This means you need to ensure the memory is padded
 378  *       at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
 379  *       starting at the last offset. If you use the Gromacs aligned memory
 380  *       allocation routines this will always be the case.
 381  */
 382 template<int align>
 383 static inline void gmx_simdcall
 384                    transposeScatterDecrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
 385 {
 386     // Offset list must be aligned for SIMD DINT32
 387     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH * sizeof(std::int32_t)) == 0);
 388
 389     for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
 390     {
 391         base[align * offset[i]] -= v0.simdInternal_[i];
 392         base[align * offset[i] + 1] -= v1.simdInternal_[i];
 393         base[align * offset[i] + 2] -= v2.simdInternal_[i];
 394     }
 395 }
 396
 397
 398 /*! \brief Expand each element of double SIMD variable into three identical
 399  *         consecutive elements in three SIMD outputs.
 400  *
 401  * \param      scalar    Floating-point input, e.g. [s0 s1 s2 s3] if width=4.
 402  * \param[out] triplets0 First output, e.g. [s0 s0 s0 s1] if width=4.
 403  * \param[out] triplets1 Second output, e.g. [s1 s1 s2 s2] if width=4.
 404  * \param[out] triplets2 Third output, e.g. [s2 s3 s3 s3] if width=4.
 405  *
  406  * This routine is meant to be used for things like scalar-vector multiplication,
 407  * where the vectors are stored in a merged format like [x0 y0 z0 x1 y1 z1 ...],
 408  * while the scalars are stored as [s0 s1 s2...], and the data cannot easily
 409  * be changed to SIMD-friendly layout.
 410  *
 411  * In this case, load 3 full-width SIMD variables from the vector array (This
 412  * will always correspond to GMX_SIMD_DOUBLE_WIDTH triplets),
 413  * load a single full-width variable from the scalar array, and
 414  * call this routine to expand the data. You can then simply multiply the
 415  * first, second and third pair of SIMD variables, and store the three
 416  * results back into a suitable vector-format array.
 417  */
 418 static inline void gmx_simdcall expandScalarsToTriplets(SimdDouble  scalar,
 419                                                         SimdDouble* triplets0,
 420                                                         SimdDouble* triplets1,
 421                                                         SimdDouble* triplets2)
 422 {
 423     for (std::size_t i = 0; i < scalar.simdInternal_.size(); i++)
 424     {
 425         triplets0->simdInternal_[i] = scalar.simdInternal_[i / 3];
 426         triplets1->simdInternal_[i] = scalar.simdInternal_[(i + scalar.simdInternal_.size()) / 3];
 427         triplets2->simdInternal_[i] = scalar.simdInternal_[(i + 2 * scalar.simdInternal_.size()) / 3];
 428     }
 429 }
 430
 431
 432 /*! \brief Load 4 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets
 433  *         specified by a SIMD integer, transpose into 4 SIMD double variables.
 434  *
 435  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 436  *                    (measured in elements, not bytes) between index points.
 437  *                    When this is identical to the number of SIMD variables
 438  *                    (i.e., 4 for this routine) the input data is packed without
 439  *                    padding in memory. See the SIMD parameters for exactly
 440  *                    what memory positions are loaded.
 441  * \param      base   Aligned pointer to the start of the memory.
  442  * \param      offset SIMD integer type with offsets to the start of each data point.
 443  * \param[out] v0     First component, base[align*offset[i]] for each i.
 444  * \param[out] v1     Second component, base[align*offset[i] + 1] for each i.
 445  * \param[out] v2     Third component, base[align*offset[i] + 2] for each i.
 446  * \param[out] v3     Fourth component, base[align*offset[i] + 3] for each i.
 447  *
 448  * The floating-point memory locations must be aligned, but only to the smaller
 449  * of four elements and the floating-point SIMD width.
 450  *
 451  * \note You should NOT scale offsets before calling this routine; it is
 452  *       done internally by using the alignment template parameter instead.
 453  * \note This is a special routine primarily intended for loading Gromacs
 454  *       table data as efficiently as possible - this is the reason for using
 455  *       a SIMD offset index, since the result of the  real-to-integer conversion
 456  *       is present in a SIMD register just before calling this routine.
 457  */
 458 template<int align>
 459 static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
 460                                                              SimdDInt32    offset,
 461                                                              SimdDouble*   v0,
 462                                                              SimdDouble*   v1,
 463                                                              SimdDouble*   v2,
 464                                                              SimdDouble*   v3)
 465 {
 466     // Base pointer must be aligned to the smaller of 4 elements and double SIMD width
 467     assert(std::size_t(base) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 4) * sizeof(double)) == 0);
 468     // align parameter must also be a multiple of the above alignment requirement
 469     assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 4) == 0);
 470
 471     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 472     {
 473         v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
 474         v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
 475         v2->simdInternal_[i] = base[align * offset.simdInternal_[i] + 2];
 476         v3->simdInternal_[i] = base[align * offset.simdInternal_[i] + 3];
 477     }
 478 }
 479
 480
 481 /*! \brief Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets
 482  *         (unaligned) specified by SIMD integer, transpose into 2 SIMD doubles.
 483  *
 484  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 485  *                    (measured in elements, not bytes) between index points.
 486  *                    When this is identical to the number of SIMD variables
 487  *                    (i.e., 2 for this routine) the input data is packed without
 488  *                    padding in memory. See the SIMD parameters for exactly
 489  *                    what memory positions are loaded.
 490  * \param      base   Pointer to the start of the memory.
  491  * \param      offset SIMD integer type with offsets to the start of each data point.
 492  * \param[out] v0     First component, base[align*offset[i]] for each i.
 493  * \param[out] v1     Second component, base[align*offset[i] + 1] for each i.
 494  *
 495  * Since some SIMD architectures cannot handle any unaligned loads, this routine
 496  * is only available if GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE is 1.
 497  *
 498  * \note You should NOT scale offsets before calling this routine; it is
 499  *       done internally by using the alignment template parameter instead.
 500  * \note This is a special routine primarily intended for loading Gromacs
 501  *       table data as efficiently as possible - this is the reason for using
 502  *       a SIMD offset index, since the result of the  real-to-integer conversion
 503  *       is present in a SIMD register just before calling this routine.
 504  */
 505 template<int align>
 506 static inline void gmx_simdcall
 507                    gatherLoadUBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
 508 {
 509     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 510     {
 511         v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
 512         v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
 513     }
 514 }
 515
 516 /*! \brief Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH offsets
 517  *         specified by a SIMD integer, transpose into 2 SIMD double variables.
 518  *
 519  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 520  *                    (measured in elements, not bytes) between index points.
 521  *                    When this is identical to the number of SIMD variables
 522  *                    (i.e., 2 for this routine) the input data is packed without
 523  *                    padding in memory. See the SIMD parameters for exactly
 524  *                    what memory positions are loaded.
 525  * \param      base   Aligned pointer to the start of the memory.
  526  * \param      offset SIMD integer type with offsets to the start of each data point.
 527  * \param[out] v0     First component, base[align*offset[i]] for each i.
 528  * \param[out] v1     Second component, base[align*offset[i] + 1] for each i.
 529  *
 530  * The floating-point memory locations must be aligned, but only to the smaller
 531  * of two elements and the floating-point SIMD width.
 532  *
 533  * \note You should NOT scale offsets before calling this routine; it is
 534  *       done internally by using the alignment template parameter instead.
 535  * \note This is a special routine primarily intended for loading Gromacs
 536  *       table data as efficiently as possible - this is the reason for using
 537  *       a SIMD offset index, since the result of the  real-to-integer conversion
 538  *       is present in a SIMD register just before calling this routine.
 539  */
 540 template<int align>
 541 static inline void gmx_simdcall
 542                    gatherLoadBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
 543 {
 544     // Base pointer must be aligned to the smaller of 2 elements and double SIMD width
 545     assert(std::size_t(base) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 2) * sizeof(double)) == 0);
 546     // align parameter must also be a multiple of the above alignment requirement
 547     assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 2) == 0);
 548
 549     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 550     {
 551         v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
 552         v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
 553     }
 554 }
 555
 556
 557 /*! \brief Reduce each of four SIMD doubles, add those values to four consecutive
 558  *         doubles in memory, return sum.
 559  *
 560  * \param m   Pointer to memory where four doubles should be incremented
 561  * \param v0  SIMD variable whose sum should be added to m[0]
 562  * \param v1  SIMD variable whose sum should be added to m[1]
 563  * \param v2  SIMD variable whose sum should be added to m[2]
 564  * \param v3  SIMD variable whose sum should be added to m[3]
 565  *
 566  * \return Sum of all elements in the four SIMD variables.
 567  *
 568  * The pointer m must be aligned to the smaller of four elements and the
 569  * floating-point SIMD width.
 570  *
 571  * \note This is a special routine intended for the Gromacs nonbonded kernels.
 572  * It is used in the epilogue of the outer loop, where the variables will
 573  * contain unrolled forces for one outer-loop-particle each, corresponding to
 574  * a single coordinate (i.e, say, four x-coordinate force variables). These
 575  * should be summed and added to the force array in memory. Since we always work
 576  * with contiguous SIMD-layout , we can use efficient aligned loads/stores.
 577  * When calculating the virial, we also need the total sum of all forces for
 578  * each coordinate. This is provided as the return value. For routines that
 579  * do not need these, this extra code will be optimized away completely if you
 580  * just ignore the return value (Checked with gcc-4.9.1 and clang-3.6 for AVX).
 581  */
 582 static inline double gmx_simdcall
 583                      reduceIncr4ReturnSum(double* m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3)
 584 {
 585     double sum[4]; // Note that the 4 here corresponds to the 4 m-elements, not any SIMD width
 586
 587     // Make sure the memory pointer is aligned to the smaller of 4 elements and double SIMD width
 588     assert(std::size_t(m) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 4) * sizeof(double)) == 0);
 589
 590     sum[0] = reduce(v0);
 591     sum[1] = reduce(v1);
 592     sum[2] = reduce(v2);
 593     sum[3] = reduce(v3);
 594
 595     m[0] += sum[0];
 596     m[1] += sum[1];
 597     m[2] += sum[2];
 598     m[3] += sum[3];
 599
 600     return sum[0] + sum[1] + sum[2] + sum[3];
 601 }
 602
 603
 604 /*! \}
 605  *
 606  * \name Higher-level SIMD utilities accessing partial (half-width) SIMD doubles.
 607  *
 608  * See the single-precision versions for documentation. Since double precision
 609  * is typically half the width of single, this double version is likely only
 610  * useful with 512-bit and larger implementations.
 611  *
 612  * \{
 613  */
 614
 615 /*! \brief Load low & high parts of SIMD double from different locations.
 616  *
 617  * \param m0 Pointer to memory aligned to half SIMD width.
 618  * \param m1 Pointer to memory aligned to half SIMD width.
 619  *
 620  * \return SIMD variable with low part loaded from m0, high from m1.
 621  *
 622  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 623  */
 624 static inline SimdDouble gmx_simdcall loadDualHsimd(const double* m0, const double* m1)
 625 {
 626     SimdDouble a;
 627
 628     // Make sure the memory pointers are aligned to half double SIMD width
 629     assert(std::size_t(m0) % (GMX_SIMD_DOUBLE_WIDTH / 2 * sizeof(double)) == 0);
 630     assert(std::size_t(m1) % (GMX_SIMD_DOUBLE_WIDTH / 2 * sizeof(double)) == 0);
 631
 632     for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
 633     {
 634         a.simdInternal_[i]                              = m0[i];
 635         a.simdInternal_[a.simdInternal_.size() / 2 + i] = m1[i];
 636     }
 637     return a;
 638 }
 639
 640 /*! \brief Load half-SIMD-width double data, spread to both halves.
 641  *
 642  * \param m Pointer to memory aligned to half SIMD width.
 643  *
 644  * \return SIMD variable with both halves loaded from m..
 645  *
 646  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 647  */
 648 static inline SimdDouble gmx_simdcall loadDuplicateHsimd(const double* m)
 649 {
 650     SimdDouble a;
 651
 652     // Make sure the memory pointer is aligned
 653     assert(std::size_t(m) % (GMX_SIMD_DOUBLE_WIDTH / 2 * sizeof(double)) == 0);
 654
 655     for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
 656     {
 657         a.simdInternal_[i]                              = m[i];
 658         a.simdInternal_[a.simdInternal_.size() / 2 + i] = a.simdInternal_[i];
 659     }
 660     return a;
 661 }
 662
 663 /*! \brief Load two doubles, spread 1st in low half, 2nd in high half.
 664  *
 665  * \param m Pointer to two adjacent double values.
 666  *
 667  * \return SIMD variable where all elements in the low half have been set
 668  *         to m[0], and all elements in high half to m[1].
 669  *
 670  * \note This routine always loads two values and sets the halves separately.
 671  *       If you want to set all elements to the same value, simply use
 672  *       the standard (non-half-SIMD) operations.
 673  *
 674  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 675  */
 676 static inline SimdDouble gmx_simdcall loadU1DualHsimd(const double* m)
 677 {
 678     SimdDouble a;
 679
 680     for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
 681     {
 682         a.simdInternal_[i]                              = m[0];
 683         a.simdInternal_[a.simdInternal_.size() / 2 + i] = m[1];
 684     }
 685     return a;
 686 }
 687
 688
 689 /*! \brief Store low & high parts of SIMD double to different locations.
 690  *
 691  * \param m0 Pointer to memory aligned to half SIMD width.
 692  * \param m1 Pointer to memory aligned to half SIMD width.
 693  * \param a  SIMD variable. Low half should be stored to m0, high to m1.
 694  *
 695  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 696  */
 697 static inline void gmx_simdcall storeDualHsimd(double* m0, double* m1, SimdDouble a)
 698 {
 699     // Make sure the memory pointers are aligned to half double SIMD width
 700     assert(std::size_t(m0) % (GMX_SIMD_DOUBLE_WIDTH / 2 * sizeof(double)) == 0);
 701     assert(std::size_t(m1) % (GMX_SIMD_DOUBLE_WIDTH / 2 * sizeof(double)) == 0);
 702
 703     for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
 704     {
 705         m0[i] = a.simdInternal_[i];
 706         m1[i] = a.simdInternal_[a.simdInternal_.size() / 2 + i];
 707     }
 708 }
 709
 710 /*! \brief Add each half of SIMD variable to separate memory adresses
 711  *
 712  * \param m0 Pointer to memory aligned to half SIMD width.
 713  * \param m1 Pointer to memory aligned to half SIMD width.
 714  * \param a  SIMD variable. Lower half will be added to m0, upper half to m1.
 715  *
 716  * The memory must be aligned to half SIMD width.
 717  *
 718  * \note The updated m0 value is written before m1 is read from memory, so
 719  *       the result will be correct even if the memory regions overlap.
 720  *
 721  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 722  */
 723 static inline void gmx_simdcall incrDualHsimd(double* m0, double* m1, SimdDouble a)
 724 {
 725     // Make sure the memory pointer is aligned to half double SIMD width
 726     assert(std::size_t(m0) % (GMX_SIMD_DOUBLE_WIDTH / 2 * sizeof(double)) == 0);
 727     assert(std::size_t(m1) % (GMX_SIMD_DOUBLE_WIDTH / 2 * sizeof(double)) == 0);
 728
 729     for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
 730     {
 731         m0[i] += a.simdInternal_[i];
 732     }
 733     for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
 734     {
 735         m1[i] += a.simdInternal_[a.simdInternal_.size() / 2 + i];
 736     }
 737 }
 738
 739 /*! \brief Add the two halves of three SIMD doubles, subtract the sum from
 740  *         three half-SIMD-width consecutive doubles in memory.
 741  *
 742  * \param m  half-width aligned memory, from which sum of the halves will be subtracted.
 743  * \param a0 SIMD variable. Upper & lower halves will first be added.
 744  * \param a1 SIMD variable. Upper & lower halves will second be added.
 745  * \param a2 SIMD variable. Upper & lower halves will third be added.
 746  *
 747  * If the SIMD width is 8 and the vectors contain [a0 b0 c0 d0 e0 f0 g0 h0],
 748  * [a1 b1 c1 d1 e1 f1 g1 g1] and [a2 b2 c2 d2 e2 f2 g2 h2], the
 749  * memory will be modified to [m[0]-(a0+e0) m[1]-(b0+f0) m[2]-(c0+g0) m[3]-(d0+h0)
 750  *                             m[4]-(a1+e1) m[5]-(b1+f1) m[6]-(c1+g1) m[7]-(d1+h1)
 751  *                             m[8]-(a2+e2) m[9]-(b2+f2) m[10]-(c2+g2) m[11]-(d2+h2)].
 752  *
 753  * The memory must be aligned to half SIMD width.
 754  *
 755  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 756  */
 757 static inline void gmx_simdcall decr3Hsimd(double* m, SimdDouble a0, SimdDouble a1, SimdDouble a2)
 758 {
 759     assert(std::size_t(m) % (GMX_SIMD_DOUBLE_WIDTH / 2 * sizeof(double)) == 0);
 760     for (std::size_t i = 0; i < a0.simdInternal_.size() / 2; i++)
 761     {
 762         m[i] -= a0.simdInternal_[i] + a0.simdInternal_[a0.simdInternal_.size() / 2 + i];
 763     }
 764     for (std::size_t i = 0; i < a1.simdInternal_.size() / 2; i++)
 765     {
 766         m[a1.simdInternal_.size() / 2 + i] -=
 767                 a1.simdInternal_[i] + a1.simdInternal_[a1.simdInternal_.size() / 2 + i];
 768     }
 769     for (std::size_t i = 0; i < a2.simdInternal_.size() / 2; i++)
 770     {
 771         m[a2.simdInternal_.size() + i] -=
 772                 a2.simdInternal_[i] + a2.simdInternal_[a2.simdInternal_.size() / 2 + i];
 773     }
 774 }
 775
 776
 777 /*! \brief Load 2 consecutive doubles from each of GMX_SIMD_DOUBLE_WIDTH/2 offsets,
 778  *         transpose into SIMD double (low half from base0, high from base1).
 779  *
 780  * \tparam     align  Alignment of the storage, i.e. the distance
 781  *                    (measured in elements, not bytes) between index points.
 782  *                    When this is identical to the number of output components
 783  *                    the data is packed without padding. This must be a
 784  *                    multiple of the alignment to keep all data aligned.
 785  * \param      base0  Pointer to base of first aligned memory
 786  * \param      base1  Pointer to base of second aligned memory
 787  * \param      offset Offset to the start of each pair
 788  * \param[out] v0     1st element in each pair, base0 in low and base1 in high half.
 789  * \param[out] v1     2nd element in each pair, base0 in low and base1 in high half.
 790  *
 791  * The offset array should be of half the SIMD width length, so it corresponds
 792  * to the half-SIMD-register operations. This also means it must be aligned
 793  * to half the integer SIMD width (i.e., GMX_SIMD_DINT32_WIDTH/2).
 794  *
 795  * The floating-point memory locations must be aligned, but only to the smaller
 796  * of two elements and the floating-point SIMD width.
 797  *
 798  * This routine is primarily designed to load nonbonded parameters in the
 799  * kernels. It is the equivalent of the full-width routine
 800  * gatherLoadTranspose(), but just
 801  * as the other hsimd routines it will pick half-SIMD-width data from base0
 802  * and put in the lower half, while the upper half comes from base1.
 803  *
 804  * For an example, assume the SIMD width is 8, align is 2, that
 805  * base0 is [A0 A1 B0 B1 C0 C1 D0 D1 ...], and base1 [E0 E1 F0 F1 G0 G1 H0 H1...].
 806  *
 807  * Then we will get v0 as [A0 B0 C0 D0 E0 F0 G0 H0] and v1 as [A1 B1 C1 D1 E1 F1 G1 H1].
 808  *
 809  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 810  */
 811 template<int align>
 812 static inline void gmx_simdcall gatherLoadTransposeHsimd(const double* base0,
 813                                                          const double* base1,
 814                                                          std::int32_t  offset[],
 815                                                          SimdDouble*   v0,
 816                                                          SimdDouble*   v1)
 817 {
 818     // Offset list must be aligned for half SIMD DINT32 width
 819     assert(std::size_t(offset) % (GMX_SIMD_DINT32_WIDTH / 2 * sizeof(std::int32_t)) == 0);
 820     // base pointers must be aligned to the smaller of 2 elements and double SIMD width
 821     assert(std::size_t(base0) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 2) * sizeof(double)) == 0);
 822     assert(std::size_t(base1) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 2) * sizeof(double)) == 0);
 823     // alignment parameter must be also be multiple of the above required alignment
 824     assert(align % std::min(GMX_SIMD_DOUBLE_WIDTH, 2) == 0);
 825
 826     for (std::size_t i = 0; i < v0->simdInternal_.size() / 2; i++)
 827     {
 828         v0->simdInternal_[i]                                = base0[align * offset[i]];
 829         v1->simdInternal_[i]                                = base0[align * offset[i] + 1];
 830         v0->simdInternal_[v0->simdInternal_.size() / 2 + i] = base1[align * offset[i]];
 831         v1->simdInternal_[v1->simdInternal_.size() / 2 + i] = base1[align * offset[i] + 1];
 832     }
 833 }
 834
 835
 836 /*! \brief Reduce the 4 half-SIMD-with doubles in 2 SIMD variables (sum halves),
 837  *         increment four consecutive doubles in memory, return sum.
 838  *
 839  * \param m    Pointer to memory where the four values should be incremented
 840  * \param v0   Variable whose half-SIMD sums should be added to m[0]/m[1], respectively.
 841  * \param v1   Variable whose half-SIMD sums should be added to m[2]/m[3], respectively.
 842  *
 843  * \return Sum of all elements in the four SIMD variables.
 844  *
 845  * The pointer m must be aligned, but only to the smaller
 846  * of four elements and the floating-point SIMD width.
 847  *
 848  * \note This is the half-SIMD-width version of
 849  *      reduceIncr4ReturnSum(). The only difference is that the
 850  *      four half-SIMD inputs needed are present in the low/high halves of the
 851  *      two SIMD arguments.
 852  *
 853  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE is 1.
 854  */
 855 static inline double gmx_simdcall reduceIncr4ReturnSumHsimd(double* m, SimdDouble v0, SimdDouble v1)
 856 {
 857     // The 4 here corresponds to the 4 elements in memory, not any SIMD width
 858     double sum[4] = { 0.0, 0.0, 0.0, 0.0 };
 859
 860     for (std::size_t i = 0; i < v0.simdInternal_.size() / 2; i++)
 861     {
 862         sum[0] += v0.simdInternal_[i];
 863         sum[1] += v0.simdInternal_[v0.simdInternal_.size() / 2 + i];
 864         sum[2] += v1.simdInternal_[i];
 865         sum[3] += v1.simdInternal_[v1.simdInternal_.size() / 2 + i];
 866     }
 867
 868     // Make sure the memory pointer is aligned to the smaller of 4 elements and double SIMD width
 869     assert(std::size_t(m) % (std::min(GMX_SIMD_DOUBLE_WIDTH, 4) * sizeof(double)) == 0);
 870
 871     m[0] += sum[0];
 872     m[1] += sum[1];
 873     m[2] += sum[2];
 874     m[3] += sum[3];
 875
 876     return sum[0] + sum[1] + sum[2] + sum[3];
 877 }
 878
 879 #if GMX_SIMD_DOUBLE_WIDTH > 8 || defined DOXYGEN
 880 /*! \brief Load N doubles and duplicate them 4 times each.
 881  *
 882  * \param m Pointer to unaligned memory
 883  *
 884  * \return SIMD variable with N doubles from m duplicated 4x.
 885  *
 886  * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE is 1.
 887  * N is GMX_SIMD_DOUBLE_WIDTH/4. Duplicated values are
 888  * contigous and different values are 4 positions in SIMD
 889  * apart.
 890  */
 891 static inline SimdDouble gmx_simdcall loadUNDuplicate4(const double* m)
 892 {
 893     SimdDouble a;
 894     for (std::size_t i = 0; i < a.simdInternal_.size() / 4; i++)
 895     {
 896         a.simdInternal_[i * 4]     = m[i];
 897         a.simdInternal_[i * 4 + 1] = m[i];
 898         a.simdInternal_[i * 4 + 2] = m[i];
 899         a.simdInternal_[i * 4 + 3] = m[i];
 900     }
 901     return a;
 902 }
 903
 904 /*! \brief Load 4 doubles and duplicate them N times each.
 905  *
 906  * \param m Pointer to memory aligned to 4 doubles
 907  *
 908  * \return SIMD variable with 4 doubles from m duplicated Nx.
 909  *
 910  * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE is 1.
 911  * N is GMX_SIMD_DOUBLE_WIDTH/4. Different values are
 912  * contigous and same values are 4 positions in SIMD
 913  * apart.
 914  */
 915 static inline SimdDouble gmx_simdcall load4DuplicateN(const double* m)
 916 {
 917     SimdDouble a;
 918     for (std::size_t i = 0; i < a.simdInternal_.size() / 4; i++)
 919     {
 920         a.simdInternal_[i * 4]     = m[0];
 921         a.simdInternal_[i * 4 + 1] = m[1];
 922         a.simdInternal_[i * 4 + 2] = m[2];
 923         a.simdInternal_[i * 4 + 3] = m[3];
 924     }
 925     return a;
 926 }
 927 #endif
 928
 929 #if GMX_SIMD_DOUBLE_WIDTH >= 8 || defined DOXYGEN
 930 /*! \brief Load doubles in blocks of 4 at fixed offsets
 931  *
 932  * \param m Pointer to unaligned memory
 933  * \param offset Offset in memory between input blocks of 4
 934  *
 935  * \return SIMD variable with doubles from m.
 936  *
 937  * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE is 1.
 938  * Blocks of 4 doubles are loaded from m+n*offset where n
 939  * is the n-th block of 4 doubles.
 940  */
 941 static inline SimdDouble gmx_simdcall loadU4NOffset(const double* m, int offset)
 942 {
 943     SimdDouble a;
 944     for (std::size_t i = 0; i < a.simdInternal_.size() / 4; i++)
 945     {
 946         a.simdInternal_[i * 4]     = m[offset * i + 0];
 947         a.simdInternal_[i * 4 + 1] = m[offset * i + 1];
 948         a.simdInternal_[i * 4 + 2] = m[offset * i + 2];
 949         a.simdInternal_[i * 4 + 3] = m[offset * i + 3];
 950     }
 951     return a;
 952 }
 953 #endif
 954
 955
 956 /*! \} */
 957
 958 /*! \} */
 959 /*! \endcond */
 960
 961 } // namespace gmx
 962
 963 #endif // GMX_SIMD_IMPL_REFERENCE_UTIL_DOUBLE_H