/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H
#define GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H

#include "config.h"

#include <cstddef>
#include <cstdint>

#include "gromacs/utility/basedefinitions.h"

#include "impl_ibm_vmx_definitions.h"
#include "impl_ibm_vmx_simd_float.h"
namespace gmx
{
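
// Load four consecutive floats starting at each of the four addresses
// base + align * offset[i] and transpose, so that v0 receives element 0 of each
// row, v1 element 1, and so on. simdLoad() maps to the aligned vec_ld on VMX, so
// each source address must be 16-byte aligned.
//
// Usage sketch with hypothetical names (x/y/z/q data packed per atom, stride 4):
//
//     SimdFloat x, y, z, q;
//     gatherLoadTranspose<4>(xyzq, atomIndices, &x, &y, &z, &q);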
template<int align>
static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
                                                    const std::int32_t offset[],
                                                    SimdFloat*         v0,
                                                    SimdFloat*         v1,
                                                    SimdFloat*         v2,
                                                    SimdFloat*         v3)
{
    *v0 = simdLoad(base + align * offset[0]);
    *v1 = simdLoad(base + align * offset[1]);
    *v2 = simdLoad(base + align * offset[2]);
    *v3 = simdLoad(base + align * offset[3]);

    __vector float t0 = vec_mergeh(v0->simdInternal_, v2->simdInternal_);
    __vector float t1 = vec_mergel(v0->simdInternal_, v2->simdInternal_);
    __vector float t2 = vec_mergeh(v1->simdInternal_, v3->simdInternal_);
    __vector float t3 = vec_mergel(v1->simdInternal_, v3->simdInternal_);
    v0->simdInternal_ = vec_mergeh(t0, t2);
    v1->simdInternal_ = vec_mergel(t0, t2);
    v2->simdInternal_ = vec_mergeh(t1, t3);
    v3->simdInternal_ = vec_mergel(t1, t3);
}
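
// Two-vector version: gather and transpose only the first two floats per row.
// When align is a multiple of 4 the full-width transpose above is reused and the
// two extra vectors are discarded; otherwise elements must be loaded one at a
// time, since a full vec_ld could read past the end of the allocated memory.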
template<int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
{
    if (align % 4 == 0)
    {
        SimdFloat t2, t3;

        gatherLoadTranspose<align>(base, offset, v0, v1, &t2, &t3);
    }
    else
    {
        __vector float         t0, t1, t2, t3, t4, t5, t6, t7;
        __vector unsigned char p0, p1, p2, p3;

        // This is REALLY slow, since we have no choice but to load individual
        // elements when we cannot guarantee that we can access beyond the end of
        // the memory. Fortunately, 99% of the usage should be the aligned-to-4
        // case above instead.
        t0 = vec_lde(0, base + align * offset[0]);
        t1 = vec_lde(0, base + align * offset[1]);
        t2 = vec_lde(0, base + align * offset[2]);
        t3 = vec_lde(0, base + align * offset[3]);
        p0 = vec_lvsl(0, base + align * offset[0]);
        p1 = vec_lvsl(0, base + align * offset[1]);
        p2 = vec_lvsl(0, base + align * offset[2]);
        p3 = vec_lvsl(0, base + align * offset[3]);
        t0 = vec_perm(t0, t0, p0);
        t1 = vec_perm(t1, t1, p1);
        t2 = vec_perm(t2, t2, p2);
        t3 = vec_perm(t3, t3, p3);
        t0 = vec_mergeh(t0, t2);
        t1 = vec_mergeh(t1, t3);
        v0->simdInternal_ = vec_mergeh(t0, t1);

        t4 = vec_lde(0, base + align * offset[0] + 1);
        t5 = vec_lde(0, base + align * offset[1] + 1);
        t6 = vec_lde(0, base + align * offset[2] + 1);
        t7 = vec_lde(0, base + align * offset[3] + 1);
        p0 = vec_lvsl(0, base + align * offset[0] + 1);
        p1 = vec_lvsl(0, base + align * offset[1] + 1);
        p2 = vec_lvsl(0, base + align * offset[2] + 1);
        p3 = vec_lvsl(0, base + align * offset[3] + 1);
        t4 = vec_perm(t4, t4, p0);
        t5 = vec_perm(t5, t5, p1);
        t6 = vec_perm(t6, t6, p2);
        t7 = vec_perm(t7, t7, p3);
        t4 = vec_mergeh(t4, t6);
        t5 = vec_mergeh(t5, t7);
        v1->simdInternal_ = vec_mergeh(t4, t5);
    }
}
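
// Best alignment to use for aligned pairs of float data.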
static const int c_simdBestPairAlignmentFloat = 2;
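
// Gather three floats (e.g. x/y/z coordinates) from each of four addresses that
// need not be vector-aligned, transposed into v0/v1/v2. The align-multiple-of-4
// case can reuse the full-width transpose above; otherwise elements are loaded
// one at a time, as explained in the comment inside.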
template<int align>
static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
                                                     const std::int32_t offset[],
                                                     SimdFloat*         v0,
                                                     SimdFloat*         v1,
                                                     SimdFloat*         v2)
{
    if (align % 4 == 0)
    {
        SimdFloat t3;
        gatherLoadTranspose<align>(base, offset, v0, v1, v2, &t3);
    }
    else
    {
        __vector float         t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
        __vector unsigned char p0, p1, p2, p3;

        // This is REALLY slow, since we have no choice but to load individual
        // elements when we cannot guarantee that we can access beyond the end of
        // the memory. Unfortunately this is likely the most common case.
        t0 = vec_lde(0, base + align * offset[0]);
        t1 = vec_lde(0, base + align * offset[1]);
        t2 = vec_lde(0, base + align * offset[2]);
        t3 = vec_lde(0, base + align * offset[3]);
        p0 = vec_lvsl(0, base + align * offset[0]);
        p1 = vec_lvsl(0, base + align * offset[1]);
        p2 = vec_lvsl(0, base + align * offset[2]);
        p3 = vec_lvsl(0, base + align * offset[3]);
        t0 = vec_perm(t0, t0, p0);
        t1 = vec_perm(t1, t1, p1);
        t2 = vec_perm(t2, t2, p2);
        t3 = vec_perm(t3, t3, p3);
        t0 = vec_mergeh(t0, t2);
        t1 = vec_mergeh(t1, t3);
        v0->simdInternal_ = vec_mergeh(t0, t1);

        t4 = vec_lde(0, base + align * offset[0] + 1);
        t5 = vec_lde(0, base + align * offset[1] + 1);
        t6 = vec_lde(0, base + align * offset[2] + 1);
        t7 = vec_lde(0, base + align * offset[3] + 1);
        p0 = vec_lvsl(0, base + align * offset[0] + 1);
        p1 = vec_lvsl(0, base + align * offset[1] + 1);
        p2 = vec_lvsl(0, base + align * offset[2] + 1);
        p3 = vec_lvsl(0, base + align * offset[3] + 1);
        t4 = vec_perm(t4, t4, p0);
        t5 = vec_perm(t5, t5, p1);
        t6 = vec_perm(t6, t6, p2);
        t7 = vec_perm(t7, t7, p3);
        t4 = vec_mergeh(t4, t6);
        t5 = vec_mergeh(t5, t7);
        v1->simdInternal_ = vec_mergeh(t4, t5);

        t8  = vec_lde(0, base + align * offset[0] + 2);
        t9  = vec_lde(0, base + align * offset[1] + 2);
        t10 = vec_lde(0, base + align * offset[2] + 2);
        t11 = vec_lde(0, base + align * offset[3] + 2);
        p0  = vec_lvsl(0, base + align * offset[0] + 2);
        p1  = vec_lvsl(0, base + align * offset[1] + 2);
        p2  = vec_lvsl(0, base + align * offset[2] + 2);
        p3  = vec_lvsl(0, base + align * offset[3] + 2);
        t8  = vec_perm(t8, t8, p0);
        t9  = vec_perm(t9, t9, p1);
        t10 = vec_perm(t10, t10, p2);
        t11 = vec_perm(t11, t11, p3);
        t8  = vec_mergeh(t8, t10);
        t9  = vec_mergeh(t9, t11);
        v2->simdInternal_ = vec_mergeh(t8, t9);
    }
}
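
// Transpose three SIMD vectors into four triplets { v0[i], v1[i], v2[i] } and
// store each triplet at base + align * offset[i]. The vec_lvsr/vec_perm rotation
// followed by three vec_ste element stores writes exactly 12 bytes per target, so
// no memory beyond each triplet is touched and only natural float alignment is
// needed.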
template<int align>
static inline void gmx_simdcall
transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    __vector unsigned char p0, p1, p2, p3;

    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v2.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v2.simdInternal_);
    __vector float t4 = vec_mergeh(t0, t2);
    __vector float t5 = vec_mergel(t0, t2);
    __vector float t6 = vec_mergeh(t1, t3);
    __vector float t7 = vec_mergel(t1, t3);

    p0 = vec_lvsr(0, base + align * offset[0]);
    p1 = vec_lvsr(0, base + align * offset[1]);
    p2 = vec_lvsr(0, base + align * offset[2]);
    p3 = vec_lvsr(0, base + align * offset[3]);

    t4 = vec_perm(t4, t4, p0);
    t5 = vec_perm(t5, t5, p1);
    t6 = vec_perm(t6, t6, p2);
    t7 = vec_perm(t7, t7, p3);

    vec_ste(t4, 0, base + align * offset[0]);
    vec_ste(t4, 4, base + align * offset[0]);
    vec_ste(t4, 8, base + align * offset[0]);
    vec_ste(t5, 0, base + align * offset[1]);
    vec_ste(t5, 4, base + align * offset[1]);
    vec_ste(t5, 8, base + align * offset[1]);
    vec_ste(t6, 0, base + align * offset[2]);
    vec_ste(t6, 4, base + align * offset[2]);
    vec_ste(t6, 8, base + align * offset[2]);
    vec_ste(t7, 0, base + align * offset[3]);
    vec_ste(t7, 4, base + align * offset[3]);
    vec_ste(t7, 8, base + align * offset[3]);
}
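
// Read-modify-write: add the triplet { v0[i], v1[i], v2[i] } to the three floats
// at base + align * offset[i]. In the aligned path the fourth lane of each
// transposed vector is zeroed (v1 is merged with a zero vector), so the
// full-width load/add/store leaves the padding element unchanged; this assumes
// each target address is 16-byte aligned as vec_ld/vec_st require. The general
// path spills to aligned scratch arrays and updates memory scalar by scalar.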
template<int align>
static inline void gmx_simdcall
transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    if (align % 4 == 0)
    {
        __vector float zero = reinterpret_cast<__vector float>(vec_splat_u32(0));
        __vector float t0   = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1   = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2   = vec_mergeh(v1.simdInternal_, zero);
        __vector float t3   = vec_mergel(v1.simdInternal_, zero);
        __vector float t4   = vec_mergeh(t0, t2);
        __vector float t5   = vec_mergel(t0, t2);
        __vector float t6   = vec_mergeh(t1, t3);
        __vector float t7   = vec_mergel(t1, t3);

        vec_st(vec_add(vec_ld(0, base + align * offset[0]), t4), 0, base + align * offset[0]);
        vec_st(vec_add(vec_ld(0, base + align * offset[1]), t5), 0, base + align * offset[1]);
        vec_st(vec_add(vec_ld(0, base + align * offset[2]), t6), 0, base + align * offset[2]);
        vec_st(vec_add(vec_ld(0, base + align * offset[3]), t7), 0, base + align * offset[3]);
    }
    else
    {
        alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];

        vec_st(v0.simdInternal_, 0, rdata0);
        vec_st(v1.simdInternal_, 0, rdata1);
        vec_st(v2.simdInternal_, 0, rdata2);

        base[align * offset[0] + 0] += rdata0[0];
        base[align * offset[0] + 1] += rdata1[0];
        base[align * offset[0] + 2] += rdata2[0];
        base[align * offset[1] + 0] += rdata0[1];
        base[align * offset[1] + 1] += rdata1[1];
        base[align * offset[1] + 2] += rdata2[1];
        base[align * offset[2] + 0] += rdata0[2];
        base[align * offset[2] + 1] += rdata1[2];
        base[align * offset[2] + 2] += rdata2[2];
        base[align * offset[3] + 0] += rdata0[3];
        base[align * offset[3] + 1] += rdata1[3];
        base[align * offset[3] + 2] += rdata2[3];
    }
}
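
// Same as transposeScatterIncrU() above, but subtracting the triplets from
// memory instead of adding them.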
template<int align>
static inline void gmx_simdcall
transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
{
    if (align % 4 == 0)
    {
        __vector float zero = reinterpret_cast<__vector float>(vec_splat_u32(0));
        __vector float t0   = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
        __vector float t1   = vec_mergel(v0.simdInternal_, v2.simdInternal_);
        __vector float t2   = vec_mergeh(v1.simdInternal_, zero);
        __vector float t3   = vec_mergel(v1.simdInternal_, zero);
        __vector float t4   = vec_mergeh(t0, t2);
        __vector float t5   = vec_mergel(t0, t2);
        __vector float t6   = vec_mergeh(t1, t3);
        __vector float t7   = vec_mergel(t1, t3);

        vec_st(vec_sub(vec_ld(0, base + align * offset[0]), t4), 0, base + align * offset[0]);
        vec_st(vec_sub(vec_ld(0, base + align * offset[1]), t5), 0, base + align * offset[1]);
        vec_st(vec_sub(vec_ld(0, base + align * offset[2]), t6), 0, base + align * offset[2]);
        vec_st(vec_sub(vec_ld(0, base + align * offset[3]), t7), 0, base + align * offset[3]);
    }
    else
    {
        alignas(GMX_SIMD_ALIGNMENT) float rdata0[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata1[GMX_SIMD_FLOAT_WIDTH];
        alignas(GMX_SIMD_ALIGNMENT) float rdata2[GMX_SIMD_FLOAT_WIDTH];

        vec_st(v0.simdInternal_, 0, rdata0);
        vec_st(v1.simdInternal_, 0, rdata1);
        vec_st(v2.simdInternal_, 0, rdata2);

        base[align * offset[0] + 0] -= rdata0[0];
        base[align * offset[0] + 1] -= rdata1[0];
        base[align * offset[0] + 2] -= rdata2[0];
        base[align * offset[1] + 0] -= rdata0[1];
        base[align * offset[1] + 1] -= rdata1[1];
        base[align * offset[1] + 2] -= rdata2[1];
        base[align * offset[2] + 0] -= rdata0[2];
        base[align * offset[2] + 1] -= rdata1[2];
        base[align * offset[2] + 2] -= rdata2[2];
        base[align * offset[3] + 0] -= rdata0[3];
        base[align * offset[3] + 1] -= rdata1[3];
        base[align * offset[3] + 2] -= rdata2[3];
    }
}
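
// Expand { a, b, c, d } into { a, a, a, b }, { b, b, c, c } and { c, d, d, d },
// i.e. each scalar repeated three times across the outputs, e.g. for applying
// per-particle scalars to x/y/z triplets. Each 4-byte group in the permute masks
// selects one float: bytes 0-3 are element 0, bytes 4-7 element 1, and so on.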
static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
                                                        SimdFloat* triplets0,
                                                        SimdFloat* triplets1,
                                                        SimdFloat* triplets2)
{
    const __vector unsigned char perm0 = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 };
    const __vector unsigned char perm1 = { 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 8, 9, 10, 11 };
    const __vector unsigned char perm2 = { 8,  9,  10, 11, 12, 13, 14, 15,
                                           12, 13, 14, 15, 12, 13, 14, 15 };

    triplets0->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm0);
    triplets1->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm1);
    triplets2->simdInternal_ = vec_perm(scalar.simdInternal_, scalar.simdInternal_, perm2);
}
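
// Same as the four-vector gatherLoadTranspose(), but with the offsets supplied
// in a SIMD integer register: they are spilled to an aligned scratch array and
// forwarded to the pointer-based version above.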
template<int align>
static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
                                                             SimdFInt32   offset,
                                                             SimdFloat*   v0,
                                                             SimdFloat*   v1,
                                                             SimdFloat*   v2,
                                                             SimdFloat*   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    vec_st(offset.simdInternal_, 0, ioffset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}
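
// Pair variant of the SIMD-integer-offset gather above; forwards to the
// two-vector gatherLoadTranspose().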
template<int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    vec_st(offset.simdInternal_, 0, ioffset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}
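
// Reduce each of v0..v3 to a scalar sum, add the four sums to m[0..3], and
// return the total of the four sums. The vec_ld/vec_st behind simdLoad() and
// store() require m to be 16-byte aligned.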
static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
{
    __vector float t0 = vec_mergeh(v0.simdInternal_, v2.simdInternal_);
    __vector float t1 = vec_mergel(v0.simdInternal_, v2.simdInternal_);
    __vector float t2 = vec_mergeh(v1.simdInternal_, v3.simdInternal_);
    __vector float t3 = vec_mergel(v1.simdInternal_, v3.simdInternal_);
    v0.simdInternal_  = vec_mergeh(t0, t2);
    v1.simdInternal_  = vec_mergel(t0, t2);
    v2.simdInternal_  = vec_mergeh(t1, t3);
    v3.simdInternal_  = vec_mergel(t1, t3);

    v0 = v0 + v1;
    v2 = v2 + v3;
    v0 = v0 + v2;
    v2 = v0 + simdLoad(m);
    store(m, v2);

    return reduce(v0);
}

} // namespace gmx

#endif // GMX_SIMD_IMPL_IBM_VMX_UTIL_FLOAT_H