Update instructions in containers.rst
[gromacs.git] / src / gromacs / ewald / pme_gather.cpp
blob2a7925cfe281335cd195176f2497f87aab10265c
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2017,2018 by the GROMACS development team.
7 * Copyright (c) 2019,2020, by the GROMACS development team, led by
8 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9 * and including many others, as listed in the AUTHORS file in the
10 * top-level source directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
39 #include "gmxpre.h"
41 #include "pme_gather.h"
43 #include "gromacs/math/vec.h"
44 #include "gromacs/simd/simd.h"
45 #include "gromacs/utility/basedefinitions.h"
46 #include "gromacs/utility/gmxassert.h"
47 #include "gromacs/utility/smalloc.h"
48 #include "gromacs/utility/typetraits.h"
50 #include "pme_internal.h"
51 #include "pme_simd.h"
52 #include "pme_spline_work.h"
54 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
56 /* Spline function. Goals: 1) Force compiler to instantiate function separately
57 for each compile-time value of order and once more for any possible (runtime)
58 value. 2) Allow overloading for specific compile-time values.
59 The Int template argument can be either int (runtime) or an object of type
60 integral_constant<int, N> (compile-time). Common runtime values can be
61 converted to compile-time values with a switch statement. For the compile
62 value the compiler is required to instantiate the function separately for
63 each value. The function can be overloaded for specific compile-time values
64 using integral_constant<int, N> where N is either a specific value or an
65 enable_if constrained non-type template parameter. The most specific overload
66 (specific value > int template parameter > general function) is called. Inside
67 the function the order argument can be used as regular int because
68 integral_constant has a proper conversion.
70 SIMD do_fspline() template funtions will be used for PME order 4 and 5
71 when the SIMD module has support for SIMD4 for the architecture used.
72 For SIMD4 without unaligned load/store support:
73 order 4 and 5 use the order 4+5 aligned SIMD template
74 For SIMD4 with unaligned load/store support:
75 order 4 uses the order 4 unaligned SIMD template
76 order 5 uses the order 4+5 aligned SIMD template
78 struct do_fspline
80 do_fspline(const gmx_pme_t* pme,
81 const real* gmx_restrict grid,
82 const PmeAtomComm* gmx_restrict atc,
83 const splinedata_t* gmx_restrict spline,
84 int nn) :
85 pme(pme),
86 grid(grid),
87 atc(atc),
88 spline(spline),
89 nn(nn)
93 template<typename Int>
94 RVec operator()(Int order) const
96 static_assert(isIntegralConstant<Int, int>::value || std::is_same_v<Int, int>,
97 "'order' needs to be either of type integral_constant<int,N> or int.");
99 const int norder = nn * order;
101 /* Pointer arithmetic alert, next six statements */
102 const real* const gmx_restrict thx = spline->theta.coefficients[XX] + norder;
103 const real* const gmx_restrict thy = spline->theta.coefficients[YY] + norder;
104 const real* const gmx_restrict thz = spline->theta.coefficients[ZZ] + norder;
105 const real* const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
106 const real* const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
107 const real* const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
109 RVec f(0, 0, 0);
111 for (int ithx = 0; (ithx < order); ithx++)
113 const int index_x = (idxX + ithx) * gridNY * gridNZ;
114 const real tx = thx[ithx];
115 const real dx = dthx[ithx];
117 for (int ithy = 0; (ithy < order); ithy++)
119 const int index_xy = index_x + (idxY + ithy) * gridNZ;
120 const real ty = thy[ithy];
121 const real dy = dthy[ithy];
122 real fxy1 = 0, fz1 = 0;
124 for (int ithz = 0; (ithz < order); ithz++)
126 const real gval = grid[index_xy + (idxZ + ithz)];
127 fxy1 += thz[ithz] * gval;
128 fz1 += dthz[ithz] * gval;
130 f[XX] += dx * ty * fxy1;
131 f[YY] += tx * dy * fxy1;
132 f[ZZ] += tx * ty * fz1;
136 return f;
139 // TODO: Consider always have at least a dummy implementation of Simd (enough for first phase of two-phase lookup) and then use enable_if instead of #ifdef
140 #if PME_4NSIMD_GATHER
141 /* Gather for one charge with pme_order=4 with unaligned SIMD4 load+store.
142 * Uses 4N SIMD where N is SIMD_WIDTH/4 to operate on all of z and N of y.
143 * This code does not assume any memory alignment for the grid.
145 RVec operator()(std::integral_constant<int, 4> /*unused*/) const
147 const int norder = nn * 4;
148 /* Pointer arithmetic alert, next six statements */
149 const real* const gmx_restrict thx = spline->theta.coefficients[XX] + norder;
150 const real* const gmx_restrict thy = spline->theta.coefficients[YY] + norder;
151 const real* const gmx_restrict thz = spline->theta.coefficients[ZZ] + norder;
152 const real* const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
153 const real* const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
154 const real* const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
156 Simd4NReal fx_S = setZero();
157 Simd4NReal fy_S = setZero();
158 Simd4NReal fz_S = setZero();
160 /* With order 4 the z-spline is actually aligned */
161 const Simd4NReal tz_S = load4DuplicateN(thz);
162 const Simd4NReal dz_S = load4DuplicateN(dthz);
164 for (int ithx = 0; ithx < 4; ithx++)
166 const int index_x = (idxX + ithx) * gridNY * gridNZ;
167 const Simd4NReal tx_S = Simd4NReal(thx[ithx]);
168 const Simd4NReal dx_S = Simd4NReal(dthx[ithx]);
170 for (int ithy = 0; ithy < 4; ithy += GMX_SIMD4N_REAL_WIDTH / 4)
172 const int index_xy = index_x + (idxY + ithy) * gridNZ;
174 const Simd4NReal ty_S = loadUNDuplicate4(thy + ithy);
175 const Simd4NReal dy_S = loadUNDuplicate4(dthy + ithy);
177 const Simd4NReal gval_S = loadU4NOffset(grid + index_xy + idxZ, gridNZ);
180 const Simd4NReal fxy1_S = tz_S * gval_S;
181 const Simd4NReal fz1_S = dz_S * gval_S;
183 fx_S = fma(dx_S * ty_S, fxy1_S, fx_S);
184 fy_S = fma(tx_S * dy_S, fxy1_S, fy_S);
185 fz_S = fma(tx_S * ty_S, fz1_S, fz_S);
189 return { reduce(fx_S), reduce(fy_S), reduce(fz_S) };
191 #endif
193 #ifdef PME_SIMD4_SPREAD_GATHER
194 /* Load order elements from unaligned memory into two 4-wide SIMD */
195 template<int order>
196 static inline void loadOrderU(const real* data,
197 std::integral_constant<int, order> /*unused*/,
198 int offset,
199 Simd4Real* S0,
200 Simd4Real* S1)
202 # ifdef PME_SIMD4_UNALIGNED
203 *S0 = load4U(data - offset);
204 *S1 = load4U(data - offset + 4);
205 # else
206 alignas(GMX_SIMD_ALIGNMENT) real buf_aligned[GMX_SIMD4_WIDTH * 2];
207 /* Copy data to an aligned buffer */
208 for (int i = 0; i < order; i++)
210 buf_aligned[offset + i] = data[i];
212 *S0 = load4(buf_aligned);
213 *S1 = load4(buf_aligned + 4);
214 # endif
216 #endif
218 #ifdef PME_SIMD4_SPREAD_GATHER
219 /* This code assumes that the grid is allocated 4-real aligned
220 * and that pme->pmegrid_nz is a multiple of 4.
221 * This code supports pme_order <= 5.
223 template<int Order>
224 std::enable_if_t<Order == 4 || Order == 5, RVec> operator()(std::integral_constant<int, Order> order) const
226 const int norder = nn * order;
227 GMX_ASSERT(gridNZ % 4 == 0,
228 "For aligned SIMD4 operations the grid size has to be padded up to a multiple "
229 "of 4");
230 /* Pointer arithmetic alert, next six statements */
231 const real* const gmx_restrict thx = spline->theta.coefficients[XX] + norder;
232 const real* const gmx_restrict thy = spline->theta.coefficients[YY] + norder;
233 const real* const gmx_restrict thz = spline->theta.coefficients[ZZ] + norder;
234 const real* const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
235 const real* const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
236 const real* const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
238 struct pme_spline_work* const work = pme->spline_work;
240 const int offset = idxZ & 3;
242 Simd4Real fx_S = setZero();
243 Simd4Real fy_S = setZero();
244 Simd4Real fz_S = setZero();
246 Simd4Real tz_S0, tz_S1, dz_S0, dz_S1;
247 loadOrderU(thz, order, offset, &tz_S0, &tz_S1);
248 loadOrderU(dthz, order, offset, &dz_S0, &dz_S1);
250 tz_S0 = selectByMask(tz_S0, work->mask_S0[offset]);
251 dz_S0 = selectByMask(dz_S0, work->mask_S0[offset]);
252 tz_S1 = selectByMask(tz_S1, work->mask_S1[offset]);
253 dz_S1 = selectByMask(dz_S1, work->mask_S1[offset]);
255 for (int ithx = 0; (ithx < order); ithx++)
257 const int index_x = (idxX + ithx) * gridNY * gridNZ;
258 const Simd4Real tx_S = Simd4Real(thx[ithx]);
259 const Simd4Real dx_S = Simd4Real(dthx[ithx]);
261 for (int ithy = 0; (ithy < order); ithy++)
263 const int index_xy = index_x + (idxY + ithy) * gridNZ;
264 const Simd4Real ty_S = Simd4Real(thy[ithy]);
265 const Simd4Real dy_S = Simd4Real(dthy[ithy]);
267 const Simd4Real gval_S0 = load4(grid + index_xy + idxZ - offset);
268 const Simd4Real gval_S1 = load4(grid + index_xy + idxZ - offset + 4);
270 const Simd4Real fxy1_S0 = tz_S0 * gval_S0;
271 const Simd4Real fz1_S0 = dz_S0 * gval_S0;
272 const Simd4Real fxy1_S1 = tz_S1 * gval_S1;
273 const Simd4Real fz1_S1 = dz_S1 * gval_S1;
275 const Simd4Real fxy1_S = fxy1_S0 + fxy1_S1;
276 const Simd4Real fz1_S = fz1_S0 + fz1_S1;
278 fx_S = fma(dx_S * ty_S, fxy1_S, fx_S);
279 fy_S = fma(tx_S * dy_S, fxy1_S, fy_S);
280 fz_S = fma(tx_S * ty_S, fz1_S, fz_S);
284 return { reduce(fx_S), reduce(fy_S), reduce(fz_S) };
286 #endif
287 private:
288 const gmx_pme_t* const pme;
289 const real* const gmx_restrict grid;
290 const PmeAtomComm* const gmx_restrict atc;
291 const splinedata_t* const gmx_restrict spline;
292 const int nn;
294 const int gridNY = pme->pmegrid_ny;
295 const int gridNZ = pme->pmegrid_nz;
297 const int* const idxptr = atc->idx[spline->ind[nn]];
298 const int idxX = idxptr[XX];
299 const int idxY = idxptr[YY];
300 const int idxZ = idxptr[ZZ];
304 void gather_f_bsplines(const gmx_pme_t* pme,
305 const real* grid,
306 gmx_bool bClearF,
307 const PmeAtomComm* atc,
308 const splinedata_t* spline,
309 real scale)
311 /* sum forces for local particles */
313 const int order = pme->pme_order;
314 const int nx = pme->nkx;
315 const int ny = pme->nky;
316 const int nz = pme->nkz;
318 const real rxx = pme->recipbox[XX][XX];
319 const real ryx = pme->recipbox[YY][XX];
320 const real ryy = pme->recipbox[YY][YY];
321 const real rzx = pme->recipbox[ZZ][XX];
322 const real rzy = pme->recipbox[ZZ][YY];
323 const real rzz = pme->recipbox[ZZ][ZZ];
325 /* Extract the buffer for force output */
326 rvec* gmx_restrict force = as_rvec_array(atc->f.data());
328 /* Note that unrolling this loop by templating this function on order
329 * deteriorates performance significantly with gcc5/6/7.
331 for (int nn = 0; nn < spline->n; nn++)
333 const int n = spline->ind[nn];
334 const real coefficient = scale * atc->coefficient[n];
336 if (bClearF)
338 force[n][XX] = 0;
339 force[n][YY] = 0;
340 force[n][ZZ] = 0;
342 if (coefficient != 0)
344 RVec f;
345 const auto spline_func = do_fspline(pme, grid, atc, spline, nn);
347 switch (order)
349 case 4: f = spline_func(std::integral_constant<int, 4>()); break;
350 case 5: f = spline_func(std::integral_constant<int, 5>()); break;
351 default: f = spline_func(order); break;
354 force[n][XX] += -coefficient * (f[XX] * nx * rxx);
355 force[n][YY] += -coefficient * (f[XX] * nx * ryx + f[YY] * ny * ryy);
356 force[n][ZZ] += -coefficient * (f[XX] * nx * rzx + f[YY] * ny * rzy + f[ZZ] * nz * rzz);
359 /* Since the energy and not forces are interpolated
360 * the net force might not be exactly zero.
361 * This can be solved by also interpolating F, but
362 * that comes at a cost.
363 * A better hack is to remove the net force every
364 * step, but that must be done at a higher level
365 * since this routine doesn't see all atoms if running
366 * in parallel. Don't know how important it is? EL 990726
371 real gather_energy_bsplines(gmx_pme_t* pme, const real* grid, PmeAtomComm* atc)
373 splinedata_t* spline;
374 int ithx, ithy, ithz, i0, j0, k0;
375 int index_x, index_xy;
376 int* idxptr;
377 real energy, pot, tx, ty, coefficient, gval;
378 real * thx, *thy, *thz;
379 int norder;
380 int order;
382 spline = &atc->spline[0];
384 order = pme->pme_order;
386 energy = 0;
387 for (int n = 0; n < atc->numAtoms(); n++)
389 coefficient = atc->coefficient[n];
391 if (coefficient != 0)
393 idxptr = atc->idx[n];
394 norder = n * order;
396 i0 = idxptr[XX];
397 j0 = idxptr[YY];
398 k0 = idxptr[ZZ];
400 /* Pointer arithmetic alert, next three statements */
401 thx = spline->theta.coefficients[XX] + norder;
402 thy = spline->theta.coefficients[YY] + norder;
403 thz = spline->theta.coefficients[ZZ] + norder;
405 pot = 0;
406 for (ithx = 0; (ithx < order); ithx++)
408 index_x = (i0 + ithx) * pme->pmegrid_ny * pme->pmegrid_nz;
409 tx = thx[ithx];
411 for (ithy = 0; (ithy < order); ithy++)
413 index_xy = index_x + (j0 + ithy) * pme->pmegrid_nz;
414 ty = thy[ithy];
416 for (ithz = 0; (ithz < order); ithz++)
418 gval = grid[index_xy + (k0 + ithz)];
419 pot += tx * ty * thz[ithz] * gval;
424 energy += pot * coefficient;
428 return energy;