/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/* This include file has code between ifdef's to make sure
 * that this performance sensitive code is inlined
 * and to remove conditionals and variable loop bounds at compile time.
 */
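/* Usage sketch (illustrative, not literally part of this file): the including
 * source typically defines one of the case macros, plus PME_ORDER where the
 * aligned case needs it, immediately before including this file, e.g.
 *
 *     #define PME_SPREAD_SIMD4_ORDER4
 *     #include "pme_simd4.h"
 *
 * so each expansion is compiled with fixed loop bounds. The exact macro and
 * file names used by the caller are assumptions here.
 */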

#ifdef PME_SPREAD_SIMD4_ORDER4
/* Spread one charge with pme_order=4 with unaligned SIMD4 load+store.
 * This code does not assume any memory alignment for the grid.
 */
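/* For reference, this block accumulates, for one charge,
 * coefficient * thx[i] * thy[j] * thz[k] into the 4x4x4 grid neighbourhood
 * starting at (i0, j0, k0). A scalar sketch of the same update (variable
 * names as used below; the loop structure is an illustration, not the
 * literal scalar fallback):
 *
 *     for (ithx = 0; ithx < 4; ithx++)
 *     {
 *         for (ithy = 0; ithy < 4; ithy++)
 *         {
 *             for (ithz = 0; ithz < 4; ithz++)
 *             {
 *                 grid[(i0 + ithx) * pny * pnz + (j0 + ithy) * pnz + k0 + ithz]
 *                     += coefficient * thx[ithx] * thy[ithy] * thz[ithz];
 *             }
 *         }
 *     }
 *
 * The SIMD4 version below keeps the 4 z-points of one (x, y) line in one
 * SIMD4 register and unrolls the 4 y-lines.
 */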
{
    Simd4Real ty_S0(thy[0]);
    Simd4Real ty_S1(thy[1]);
    Simd4Real ty_S2(thy[2]);
    Simd4Real ty_S3(thy[3]);
    Simd4Real tz_S;
    Simd4Real vx_S;
    Simd4Real vx_tz_S;
    Simd4Real sum_S0, sum_S1, sum_S2, sum_S3;
    Simd4Real gri_S0, gri_S1, gri_S2, gri_S3;
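    /* Naming convention: each ty_Sn broadcasts one y-spline weight thy[n],
     * while a SIMD4 register spans 4 consecutive z grid points, so each
     * fma() in the loop below updates one (x, y) grid line at a time. */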

    /* With order 4 the z-spline is actually aligned */
    tz_S = load4(thz);

    for (ithx = 0; (ithx < 4); ithx++)
    {
        index_x = (i0 + ithx) * pny * pnz;
        valx    = coefficient * thx[ithx];

        vx_S = Simd4Real(valx);

        vx_tz_S = vx_S * tz_S;

        gri_S0 = load4U(grid + index_x + (j0 + 0) * pnz + k0);
        gri_S1 = load4U(grid + index_x + (j0 + 1) * pnz + k0);
        gri_S2 = load4U(grid + index_x + (j0 + 2) * pnz + k0);
        gri_S3 = load4U(grid + index_x + (j0 + 3) * pnz + k0);

        sum_S0 = fma(vx_tz_S, ty_S0, gri_S0);
        sum_S1 = fma(vx_tz_S, ty_S1, gri_S1);
        sum_S2 = fma(vx_tz_S, ty_S2, gri_S2);
        sum_S3 = fma(vx_tz_S, ty_S3, gri_S3);

        store4U(grid + index_x + (j0 + 0) * pnz + k0, sum_S0);
        store4U(grid + index_x + (j0 + 1) * pnz + k0, sum_S1);
        store4U(grid + index_x + (j0 + 2) * pnz + k0, sum_S2);
        store4U(grid + index_x + (j0 + 3) * pnz + k0, sum_S3);
    }
}
#    undef PME_SPREAD_SIMD4_ORDER4
#endif

#ifdef PME_SPREAD_SIMD4_ALIGNED
/* This code assumes that the grid is allocated 4-real aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
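/* Approach (as reconstructed from the code below): k0 is shifted down to the
 * previous multiple of 4 so that all grid accesses are 4-real aligned; two
 * SIMD4 registers then cover 8 consecutive z points per (x, y) line, and the
 * z-spline weights are masked so that only the PME_ORDER entries starting at
 * the shift (offset) contribute. Example: for k0 = 6, offset = 2, the loads
 * start at z index 4 and mask lanes 2 .. 2 + PME_ORDER - 1 are kept. */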
{
    int       offset;
    int       index;
    Simd4Real ty_S0(thy[0]);
    Simd4Real ty_S1(thy[1]);
    Simd4Real ty_S2(thy[2]);
    Simd4Real ty_S3(thy[3]);
    Simd4Real tz_S0;
    Simd4Real tz_S1;
    Simd4Real vx_S;
    Simd4Real vx_tz_S0;
    Simd4Real vx_tz_S1;
    Simd4Real sum_S00, sum_S01, sum_S02, sum_S03;
    Simd4Real sum_S10, sum_S11, sum_S12, sum_S13;
    Simd4Real gri_S00, gri_S01, gri_S02, gri_S03;
    Simd4Real gri_S10, gri_S11, gri_S12, gri_S13;
#    if PME_ORDER == 5
    Simd4Real ty_S4(thy[4]);
    Simd4Real sum_S04;
    Simd4Real sum_S14;
    Simd4Real gri_S04;
    Simd4Real gri_S14;
#    endif

    offset = k0 & 3;
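
    /* Load the 8 z-spline weights shifted by offset: either directly with
     * unaligned SIMD4 loads, or via a small aligned staging buffer when
     * unaligned SIMD4 loads are not available. */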
#    ifdef PME_SIMD4_UNALIGNED
    tz_S0 = load4U(thz - offset);
    tz_S1 = load4U(thz - offset + 4);
#    else
    {
        int i;
        /* Copy thz to an aligned buffer (unused buffer parts are masked) */
        for (i = 0; i < PME_ORDER; i++)
        {
            thz_aligned[offset + i] = thz[i];
        }
        tz_S0 = load4(thz_aligned);
        tz_S1 = load4(thz_aligned + 4);
    }
#    endif
    tz_S0 = selectByMask(tz_S0, work->mask_S0[offset]);
    tz_S1 = selectByMask(tz_S1, work->mask_S1[offset]);
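    /* mask_S0/mask_S1 zero the lanes outside [offset, offset + PME_ORDER);
     * they are assumed here to be precomputed per-offset masks stored in the
     * spline work structure pointed to by 'work'. */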

    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        index = (i0 + ithx) * pny * pnz + j0 * pnz + k0 - offset;
        valx  = coefficient * thx[ithx];

        vx_S = Simd4Real(valx);

        vx_tz_S0 = vx_S * tz_S0;
        vx_tz_S1 = vx_S * tz_S1;

        gri_S00 = load4(grid + index + 0 * pnz);
        gri_S01 = load4(grid + index + 1 * pnz);
        gri_S02 = load4(grid + index + 2 * pnz);
        gri_S03 = load4(grid + index + 3 * pnz);
#    if PME_ORDER == 5
        gri_S04 = load4(grid + index + 4 * pnz);
#    endif
        gri_S10 = load4(grid + index + 0 * pnz + 4);
        gri_S11 = load4(grid + index + 1 * pnz + 4);
        gri_S12 = load4(grid + index + 2 * pnz + 4);
        gri_S13 = load4(grid + index + 3 * pnz + 4);
#    if PME_ORDER == 5
        gri_S14 = load4(grid + index + 4 * pnz + 4);
#    endif

        sum_S00 = fma(vx_tz_S0, ty_S0, gri_S00);
        sum_S01 = fma(vx_tz_S0, ty_S1, gri_S01);
        sum_S02 = fma(vx_tz_S0, ty_S2, gri_S02);
        sum_S03 = fma(vx_tz_S0, ty_S3, gri_S03);
#    if PME_ORDER == 5
        sum_S04 = fma(vx_tz_S0, ty_S4, gri_S04);
#    endif
        sum_S10 = fma(vx_tz_S1, ty_S0, gri_S10);
        sum_S11 = fma(vx_tz_S1, ty_S1, gri_S11);
        sum_S12 = fma(vx_tz_S1, ty_S2, gri_S12);
        sum_S13 = fma(vx_tz_S1, ty_S3, gri_S13);
#    if PME_ORDER == 5
        sum_S14 = fma(vx_tz_S1, ty_S4, gri_S14);
#    endif

        store4(grid + index + 0 * pnz, sum_S00);
        store4(grid + index + 1 * pnz, sum_S01);
        store4(grid + index + 2 * pnz, sum_S02);
        store4(grid + index + 3 * pnz, sum_S03);
#    if PME_ORDER == 5
        store4(grid + index + 4 * pnz, sum_S04);
#    endif
        store4(grid + index + 0 * pnz + 4, sum_S10);
        store4(grid + index + 1 * pnz + 4, sum_S11);
        store4(grid + index + 2 * pnz + 4, sum_S12);
        store4(grid + index + 3 * pnz + 4, sum_S13);
#    if PME_ORDER == 5
        store4(grid + index + 4 * pnz + 4, sum_S14);
#    endif
    }
}
#    undef PME_SPREAD_SIMD4_ALIGNED
#endif