Update instructions in containers.rst
[gromacs.git] / src / gromacs / ewald / pme_simd4.h
bloba2c8e3b24db5f2d6220de037dbae8aaca9e2dc4d
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team.
7 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
8 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9 * and including many others, as listed in the AUTHORS file in the
10 * top-level source directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
/* This include file wraps each code fragment in its own #ifdef section,
 * so that this performance-sensitive code is inlined at the point of inclusion
 * and so that conditionals and variable loop bounds are resolved at compile time.
 */
#ifdef PME_SPREAD_SIMD4_ORDER4
/* Spreads a single charge for pme_order=4 using unaligned SIMD4 loads and
 * stores; no particular memory alignment of the grid is assumed.
 *
 * The including scope must provide grid, thx, thy, thz, coefficient,
 * i0, j0, k0, pny and pnz, plus the assignable variables ithx, index_x
 * and valx, which are written by this fragment.
 */
{
    using namespace gmx;

    /* One SIMD4 value per y-spline coefficient, each built from a scalar */
    const Simd4Real ySpline0(thy[0]);
    const Simd4Real ySpline1(thy[1]);
    const Simd4Real ySpline2(thy[2]);
    const Simd4Real ySpline3(thy[3]);

    /* With order 4 the z-spline is actually aligned */
    const Simd4Real zSpline = load4(thz);

    for (ithx = 0; ithx < 4; ithx++)
    {
        index_x = (i0 + ithx) * pny * pnz;
        valx    = coefficient * thx[ithx];

        /* Product of the x-weighted charge and the z-spline,
         * reused for all four y-rows of this x-plane
         */
        const Simd4Real xzWeight = Simd4Real(valx) * zSpline;

        /* Grid offset of the four z-points in y-row j0; consecutive
         * y-rows are pnz reals apart
         */
        const int rowStart = index_x + j0 * pnz + k0;

        /* Read the current grid values of all four y-rows */
        const Simd4Real gridRow0 = load4U(grid + rowStart + 0 * pnz);
        const Simd4Real gridRow1 = load4U(grid + rowStart + 1 * pnz);
        const Simd4Real gridRow2 = load4U(grid + rowStart + 2 * pnz);
        const Simd4Real gridRow3 = load4U(grid + rowStart + 3 * pnz);

        /* Accumulate xzWeight * y-spline onto each row */
        const Simd4Real newRow0 = fma(xzWeight, ySpline0, gridRow0);
        const Simd4Real newRow1 = fma(xzWeight, ySpline1, gridRow1);
        const Simd4Real newRow2 = fma(xzWeight, ySpline2, gridRow2);
        const Simd4Real newRow3 = fma(xzWeight, ySpline3, gridRow3);

        /* Write the updated rows back to the grid */
        store4U(grid + rowStart + 0 * pnz, newRow0);
        store4U(grid + rowStart + 1 * pnz, newRow1);
        store4U(grid + rowStart + 2 * pnz, newRow2);
        store4U(grid + rowStart + 3 * pnz, newRow3);
    }
}
#    undef PME_SPREAD_SIMD4_ORDER4
#endif
#ifdef PME_SPREAD_SIMD4_ALIGNED
/* Spreads one charge using aligned SIMD4 loads and stores on the grid.
 *
 * This code assumes that the grid is allocated 4-real aligned
 * and that pnz is a multiple of 4.
 *
 * Four y-rows are always processed, and a fifth one only when
 * PME_ORDER == 5, so only PME_ORDER 4 and 5 are handled correctly.
 * Any other value is rejected at compile time below.
 *
 * The includer must define PME_ORDER and provide grid, thx, thy, thz,
 * work, coefficient, i0, j0, k0, pny and pnz, the assignable variables
 * ithx and valx, and (unless PME_SIMD4_UNALIGNED is defined) the buffer
 * thz_aligned. PME_ORDER and PME_SPREAD_SIMD4_ALIGNED are undefined again
 * at the end of this section.
 */
#    if !defined(PME_ORDER) || (PME_ORDER != 4 && PME_ORDER != 5)
#        error "PME_SPREAD_SIMD4_ALIGNED requires PME_ORDER to be defined as 4 or 5"
#    endif
{
    using namespace gmx;

    /* One SIMD4 value per y-spline coefficient, each built from a scalar */
    Simd4Real ty_S0(thy[0]);
    Simd4Real ty_S1(thy[1]);
    Simd4Real ty_S2(thy[2]);
    Simd4Real ty_S3(thy[3]);
#    if PME_ORDER == 5
    Simd4Real ty_S4(thy[4]);
#    endif
    Simd4Real tz_S0;
    Simd4Real tz_S1;

    /* Distance of k0 from the preceding multiple of 4 along z; all grid
     * accesses below start at k0 - offset
     */
    const int offset = k0 & 3;

    /* Load the z-spline shifted by offset into two SIMD4 values,
     * so that it lines up with the grid accesses
     */
#    ifdef PME_SIMD4_UNALIGNED
    tz_S0 = load4U(thz - offset);
    tz_S1 = load4U(thz - offset + 4);
#    else
    {
        /* Copy thz to an aligned buffer (unused buffer parts are masked) */
        for (int i = 0; i < PME_ORDER; i++)
        {
            thz_aligned[offset + i] = thz[i];
        }
        tz_S0 = load4(thz_aligned);
        tz_S1 = load4(thz_aligned + 4);
    }
#    endif
    /* Apply the per-offset masks from the work data to both z-spline halves */
    tz_S0 = selectByMask(tz_S0, work->mask_S0[offset]);
    tz_S1 = selectByMask(tz_S1, work->mask_S1[offset]);

    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        /* Offset of the first SIMD4 group in y-row j0 of this x-plane */
        const int index = (i0 + ithx) * pny * pnz + j0 * pnz + k0 - offset;
        valx            = coefficient * thx[ithx];

        const Simd4Real vx_S = Simd4Real(valx);

        /* x-weighted charge times each half of the masked z-spline */
        const Simd4Real vx_tz_S0 = vx_S * tz_S0;
        const Simd4Real vx_tz_S1 = vx_S * tz_S1;

        /* Load the first SIMD4 group of every y-row ... */
        const Simd4Real gri_S00 = load4(grid + index + 0 * pnz);
        const Simd4Real gri_S01 = load4(grid + index + 1 * pnz);
        const Simd4Real gri_S02 = load4(grid + index + 2 * pnz);
        const Simd4Real gri_S03 = load4(grid + index + 3 * pnz);
#    if PME_ORDER == 5
        const Simd4Real gri_S04 = load4(grid + index + 4 * pnz);
#    endif
        /* ... and the second SIMD4 group, 4 reals further along z */
        const Simd4Real gri_S10 = load4(grid + index + 0 * pnz + 4);
        const Simd4Real gri_S11 = load4(grid + index + 1 * pnz + 4);
        const Simd4Real gri_S12 = load4(grid + index + 2 * pnz + 4);
        const Simd4Real gri_S13 = load4(grid + index + 3 * pnz + 4);
#    if PME_ORDER == 5
        const Simd4Real gri_S14 = load4(grid + index + 4 * pnz + 4);
#    endif

        /* Accumulate the spline product onto the loaded grid values */
        const Simd4Real sum_S00 = fma(vx_tz_S0, ty_S0, gri_S00);
        const Simd4Real sum_S01 = fma(vx_tz_S0, ty_S1, gri_S01);
        const Simd4Real sum_S02 = fma(vx_tz_S0, ty_S2, gri_S02);
        const Simd4Real sum_S03 = fma(vx_tz_S0, ty_S3, gri_S03);
#    if PME_ORDER == 5
        const Simd4Real sum_S04 = fma(vx_tz_S0, ty_S4, gri_S04);
#    endif
        const Simd4Real sum_S10 = fma(vx_tz_S1, ty_S0, gri_S10);
        const Simd4Real sum_S11 = fma(vx_tz_S1, ty_S1, gri_S11);
        const Simd4Real sum_S12 = fma(vx_tz_S1, ty_S2, gri_S12);
        const Simd4Real sum_S13 = fma(vx_tz_S1, ty_S3, gri_S13);
#    if PME_ORDER == 5
        const Simd4Real sum_S14 = fma(vx_tz_S1, ty_S4, gri_S14);
#    endif

        /* Write both SIMD4 groups of every y-row back to the grid */
        store4(grid + index + 0 * pnz, sum_S00);
        store4(grid + index + 1 * pnz, sum_S01);
        store4(grid + index + 2 * pnz, sum_S02);
        store4(grid + index + 3 * pnz, sum_S03);
#    if PME_ORDER == 5
        store4(grid + index + 4 * pnz, sum_S04);
#    endif
        store4(grid + index + 0 * pnz + 4, sum_S10);
        store4(grid + index + 1 * pnz + 4, sum_S11);
        store4(grid + index + 2 * pnz + 4, sum_S12);
        store4(grid + index + 3 * pnz + 4, sum_S13);
#    if PME_ORDER == 5
        store4(grid + index + 4 * pnz + 4, sum_S14);
#    endif
    }
}
#    undef PME_ORDER
#    undef PME_SPREAD_SIMD4_ALIGNED
#endif