src/gromacs/simd/impl_ibm_vsx/impl_ibm_vsx_util_double.h
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_DOUBLE_H
#define GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_DOUBLE_H

#include "config.h"

#include <cstdint>

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vsx_definitions.h"
#include "impl_ibm_vsx_simd_double.h"
namespace gmx
{
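
// Transposed gather of four consecutive doubles from each of the two rows at
// base + align * offset[i]. With the VSX SIMD width of two doubles there is
// one row per SIMD lane; after the merge operations v0..v3 hold element 0..3
// of both rows. The rows are read with aligned full-vector loads.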
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const double*      base,
                                                    const std::int32_t offset[],
                                                    SimdDouble*        v0,
                                                    SimdDouble*        v1,
                                                    SimdDouble*        v2,
                                                    SimdDouble*        v3)
{
    __vector double t1, t2, t3, t4;

    t1 = *reinterpret_cast<const __vector double*>(base + align * offset[0]);
    t2 = *reinterpret_cast<const __vector double*>(base + align * offset[1]);
    t3 = *reinterpret_cast<const __vector double*>(base + align * offset[0] + 2);
    t4 = *reinterpret_cast<const __vector double*>(base + align * offset[1] + 2);

    v0->simdInternal_ = vec_mergeh(t1, t2);
    v1->simdInternal_ = vec_mergel(t1, t2);
    v2->simdInternal_ = vec_mergeh(t3, t4);
    v3->simdInternal_ = vec_mergel(t3, t4);
}
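
// Two-output overload of the transposed gather above: loads one pair of
// doubles from each of the two rows and transposes them into v0 and v1.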
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const double* base, const std::int32_t offset[], SimdDouble* v0, SimdDouble* v1)
{
    __vector double t1, t2;

    t1 = *reinterpret_cast<const __vector double*>(base + align * offset[0]);
    t2 = *reinterpret_cast<const __vector double*>(base + align * offset[1]);

    v0->simdInternal_ = vec_mergeh(t1, t2);
    v1->simdInternal_ = vec_mergel(t1, t2);
}
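
// Preferred alignment for pairwise loads: two doubles exactly fill one
// 128-bit VSX register, so an alignment of 2 keeps each pair load full-width.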
static const int c_simdBestPairAlignmentDouble = 2;
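
// Transposed gather of three doubles per row from two (possibly unaligned)
// rows. The first two elements of each row come from vector loads; the two
// third elements are picked up scalar-wise with vec_splats() and merged
// into v2.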
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const double*      base,
                                                     const std::int32_t offset[],
                                                     SimdDouble*        v0,
                                                     SimdDouble*        v1,
                                                     SimdDouble*        v2)
{
    SimdDouble t1, t2;

    t1 = simdLoad(base + align * offset[0]);
    t2 = simdLoad(base + align * offset[1]);

    v0->simdInternal_ = vec_mergeh(t1.simdInternal_, t2.simdInternal_);
    v1->simdInternal_ = vec_mergel(t1.simdInternal_, t2.simdInternal_);
    v2->simdInternal_ = vec_mergeh(vec_splats(*(base + align * offset[0] + 2)),
                                   vec_splats(*(base + align * offset[1] + 2)));
}
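
// Transposed scatter: writes the triplet {v0,v1,v2}[lane] to each of the two
// rows. The first two elements go out as one vector store per row; the third
// is extracted and stored scalar-wise. v2 is tagged gmx_unused only to work
// around the gcc-4.9 warning noted below.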
// gcc-4.9 fails to recognize that the argument to vec_extract() is used
template<int align>
static inline void gmx_simdcall transposeScatterStoreU(double*               base,
                                                       const std::int32_t    offset[],
                                                       SimdDouble            v0,
                                                       SimdDouble            v1,
                                                       SimdDouble gmx_unused v2)
{
    SimdDouble t1, t2;

    t1.simdInternal_ = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
    t2.simdInternal_ = vec_mergel(v0.simdInternal_, v1.simdInternal_);

    store(base + align * offset[0], t1);
    base[align * offset[0] + 2] = vec_extract(v2.simdInternal_, 0);
    store(base + align * offset[1], t2);
    base[align * offset[1] + 2] = vec_extract(v2.simdInternal_, 1);
}
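
// Adds the triplet {v0,v1,v2}[lane] to each of the two rows. When align is a
// multiple of 4 every row owns at least four doubles, so both halves can be
// updated with full vector read-add-write cycles (the fourth slot receives
// +0.0); otherwise the third element falls back to a scalar update.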
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    if (align % 4 == 0)
    {
        __vector double t1, t2, t3, t4;
        SimdDouble      t5, t6;

        t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);
        // Pad the third element with 0.0 so the fourth memory slot is unchanged
        t3 = vec_mergeh(v2.simdInternal_, vec_splats(0.0));
        t4 = vec_mergel(v2.simdInternal_, vec_splats(0.0));

        t5               = simdLoad(base + align * offset[0]);
        t6               = simdLoad(base + align * offset[0] + 2);
        t5.simdInternal_ = vec_add(t5.simdInternal_, t1);
        t6.simdInternal_ = vec_add(t6.simdInternal_, t3);
        store(base + align * offset[0], t5);
        store(base + align * offset[0] + 2, t6);

        t5               = simdLoad(base + align * offset[1]);
        t6               = simdLoad(base + align * offset[1] + 2);
        t5.simdInternal_ = vec_add(t5.simdInternal_, t2);
        t6.simdInternal_ = vec_add(t6.simdInternal_, t4);
        store(base + align * offset[1], t5);
        store(base + align * offset[1] + 2, t6);
    }
    else
    {
        __vector double t1, t2;
        SimdDouble      t3, t4;

        t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);

        t3               = simdLoad(base + align * offset[0]);
        t3.simdInternal_ = vec_add(t3.simdInternal_, t1);
        store(base + align * offset[0], t3);
        base[align * offset[0] + 2] += vec_extract(v2.simdInternal_, 0);

        t4               = simdLoad(base + align * offset[1]);
        t4.simdInternal_ = vec_add(t4.simdInternal_, t2);
        store(base + align * offset[1], t4);
        base[align * offset[1] + 2] += vec_extract(v2.simdInternal_, 1);
    }
}
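
// Mirror image of transposeScatterIncrU: subtracts the triplets instead,
// using the same full-vector path for align % 4 == 0 and the same scalar
// fallback for the third element otherwise.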
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(double* base, const std::int32_t offset[], SimdDouble v0, SimdDouble v1, SimdDouble v2)
{
    if (align % 4 == 0)
    {
        __vector double t1, t2, t3, t4;
        SimdDouble      t5, t6;

        t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);
        // Pad the third element with 0.0 so the fourth memory slot is unchanged
        t3 = vec_mergeh(v2.simdInternal_, vec_splats(0.0));
        t4 = vec_mergel(v2.simdInternal_, vec_splats(0.0));

        t5               = simdLoad(base + align * offset[0]);
        t6               = simdLoad(base + align * offset[0] + 2);
        t5.simdInternal_ = vec_sub(t5.simdInternal_, t1);
        t6.simdInternal_ = vec_sub(t6.simdInternal_, t3);
        store(base + align * offset[0], t5);
        store(base + align * offset[0] + 2, t6);

        t5               = simdLoad(base + align * offset[1]);
        t6               = simdLoad(base + align * offset[1] + 2);
        t5.simdInternal_ = vec_sub(t5.simdInternal_, t2);
        t6.simdInternal_ = vec_sub(t6.simdInternal_, t4);
        store(base + align * offset[1], t5);
        store(base + align * offset[1] + 2, t6);
    }
    else
    {
        __vector double t1, t2;
        SimdDouble      t3, t4;

        t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
        t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);

        t3               = simdLoad(base + align * offset[0]);
        t3.simdInternal_ = vec_sub(t3.simdInternal_, t1);
        store(base + align * offset[0], t3);
        base[align * offset[0] + 2] -= vec_extract(v2.simdInternal_, 0);

        t4               = simdLoad(base + align * offset[1]);
        t4.simdInternal_ = vec_sub(t4.simdInternal_, t2);
        store(base + align * offset[1], t4);
        base[align * offset[1] + 2] -= vec_extract(v2.simdInternal_, 1);
    }
}
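
// Expands the two scalars {s0, s1} into three registers laid out as
// {s0, s0}, {s0, s1}, {s1, s1}, i.e. each scalar repeated three times
// across consecutive lanes.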
static inline void gmx_simdcall expandScalarsToTriplets(SimdDouble  scalar,
                                                        SimdDouble* triplets0,
                                                        SimdDouble* triplets1,
                                                        SimdDouble* triplets2)
{
    triplets0->simdInternal_ = vec_mergeh(scalar.simdInternal_, scalar.simdInternal_);
    triplets1->simdInternal_ = scalar.simdInternal_;
    triplets2->simdInternal_ = vec_mergel(scalar.simdInternal_, scalar.simdInternal_);
}
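
// SimdDInt32-indexed variants: the vector of offsets is spilled to an
// aligned scalar array and the work is forwarded to the array-offset
// gathers above.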
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const double* base,
                                                             SimdDInt32    offset,
                                                             SimdDouble*   v0,
                                                             SimdDouble*   v1,
                                                             SimdDouble*   v2,
                                                             SimdDouble*   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}
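
// Unaligned counterpart: performs the pair loads with simdLoadU directly
// instead of forwarding, so rows need not be vector-aligned.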
template<int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const double* base, SimdDInt32 offset, SimdDouble* v0, SimdDouble* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_DINT32_WIDTH];

    store(ioffset, offset);

    SimdDouble t1 = simdLoadU(base + align * ioffset[0]);
    SimdDouble t2 = simdLoadU(base + align * ioffset[1]);

    v0->simdInternal_ = vec_mergeh(t1.simdInternal_, t2.simdInternal_);
    v1->simdInternal_ = vec_mergel(t1.simdInternal_, t2.simdInternal_);
}
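
// Transposes and horizontally sums the four inputs, adds the four per-input
// sums to m[0..3] (m is assumed suitably aligned for the vector
// read-modify-write), and returns the grand total of all four sums.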
static inline double gmx_simdcall
reduceIncr4ReturnSum(double* m, SimdDouble v0, SimdDouble v1, SimdDouble v2, SimdDouble v3)
{
    __vector double t1, t2, t3, t4;

    t1 = vec_mergeh(v0.simdInternal_, v1.simdInternal_);
    t2 = vec_mergel(v0.simdInternal_, v1.simdInternal_);
    t3 = vec_mergeh(v2.simdInternal_, v3.simdInternal_);
    t4 = vec_mergel(v2.simdInternal_, v3.simdInternal_);

    t1 = vec_add(t1, t2);
    t3 = vec_add(t3, t4);

    *reinterpret_cast<__vector double*>(m) += t1;
    *reinterpret_cast<__vector double*>(m + 2) += t3;

    t1 = vec_add(t1, t3);
    return reduce(SimdDouble(t1));
}
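
/* Illustrative usage sketch (hypothetical names, not part of this header):
 * a kernel would gather coordinate triplets for two particles, compute on
 * them, and scatter force increments back with the matching stride:
 *
 *     SimdDouble x, y, z, fx, fy, fz;
 *     gatherLoadUTranspose<3>(xyz, idx, &x, &y, &z);
 *     // ... compute fx, fy, fz from x, y, z ...
 *     transposeScatterIncrU<3>(force, idx, fx, fy, fz);
 */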
} // namespace gmx

#endif // GMX_SIMD_IMPLEMENTATION_IBM_VSX_UTIL_DOUBLE_H