/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2017,2018 by the GROMACS development team.
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "pme_gather.h"

#include "gromacs/math/vec.h"
#include "gromacs/simd/simd.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/typetraits.h"

#include "pme_internal.h"
#include "pme_spline_work.h"
using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
/* Spline function. Goals: 1) Force the compiler to instantiate the function
   separately for each compile-time value of order, and once more for any
   possible (run-time) value. 2) Allow overloading for specific compile-time
   values.
   The Int template argument can be either int (run-time) or an object of
   type integral_constant<int, N> (compile-time). Common run-time values can
   be converted to compile-time values with a switch statement. For the
   compile-time values the compiler is required to instantiate the function
   separately for each value. The function can be overloaded for specific
   compile-time values using integral_constant<int, N>, where N is either a
   specific value or an enable_if-constrained non-type template parameter.
   The most specific overload (specific value > Int template parameter >
   general function) is called. Inside the function the order argument can
   be used as a regular int, because integral_constant has an implicit
   conversion to its underlying type.
   SIMD do_fspline() template functions are used for PME order 4 and 5
   when the SIMD module has SIMD4 support for the architecture in use.
   For SIMD4 without unaligned load/store support:
     order 4 and 5 use the order 4+5 aligned SIMD template
   For SIMD4 with unaligned load/store support:
     order 4 uses the order 4 unaligned SIMD template
     order 5 uses the order 4+5 aligned SIMD template
 */
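/* Editorial illustration (not part of GROMACS): a minimal, self-contained
 * sketch of the dispatch pattern described above, using hypothetical names
 * (exampleKernel, exampleDispatch). The switch converts common run-time
 * orders to compile-time constants, the general template handles the rest,
 * and an overload can target one specific value. Kept in "#if 0" so it is
 * never compiled.
 */
#if 0
// General version: accepts both int and std::integral_constant<int, N>.
template<typename Int>
int exampleKernel(Int order)
{
    int sum = 0;
    for (int i = 0; i < order; i++) // integral_constant converts to int here
    {
        sum += i;
    }
    return sum;
}

// Overload picked only for the compile-time value 4; the compiler can
// fully unroll (or constant-fold) this instantiation.
inline int exampleKernel(std::integral_constant<int, 4> /*order*/)
{
    return 0 + 1 + 2 + 3;
}

// A switch converts common run-time values to compile-time values.
inline int exampleDispatch(int order)
{
    switch (order)
    {
        case 4: return exampleKernel(std::integral_constant<int, 4>());
        default: return exampleKernel(order);
    }
}
#endif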
struct do_fspline
{
    do_fspline(const gmx_pme_t*                 pme,
               const real* gmx_restrict         grid,
               const PmeAtomComm* gmx_restrict  atc,
               const splinedata_t* gmx_restrict spline,
               const int                        nn) :
        pme(pme), grid(grid), atc(atc), spline(spline), nn(nn)
    {
    }
    template<typename Int>
    RVec operator()(Int order) const
    {
        static_assert(isIntegralConstant<Int, int>::value || std::is_same_v<Int, int>,
                      "'order' needs to be either of type integral_constant<int,N> or int.");
        const int norder = nn * order;

        /* Pointer arithmetic alert, next six statements */
        const real* const gmx_restrict thx  = spline->theta.coefficients[XX] + norder;
        const real* const gmx_restrict thy  = spline->theta.coefficients[YY] + norder;
        const real* const gmx_restrict thz  = spline->theta.coefficients[ZZ] + norder;
        const real* const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
        const real* const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
        const real* const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
        RVec f(0, 0, 0);

        for (int ithx = 0; (ithx < order); ithx++)
        {
            const int  index_x = (idxX + ithx) * gridNY * gridNZ;
            const real tx      = thx[ithx];
            const real dx      = dthx[ithx];

            for (int ithy = 0; (ithy < order); ithy++)
            {
                const int  index_xy = index_x + (idxY + ithy) * gridNZ;
                const real ty       = thy[ithy];
                const real dy       = dthy[ithy];
                real       fxy1 = 0, fz1 = 0;

                for (int ithz = 0; (ithz < order); ithz++)
                {
                    const real gval = grid[index_xy + (idxZ + ithz)];
                    fxy1 += thz[ithz] * gval;
                    fz1 += dthz[ithz] * gval;
                }

                f[XX] += dx * ty * fxy1;
                f[YY] += tx * dy * fxy1;
                f[ZZ] += tx * ty * fz1;
            }
        }

        return f;
    }
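    /* Editorial note: in formula form, the loops above accumulate
     *   f[XX] = sum_{x,y,z} dthx[x] *  thy[y] *  thz[z] * grid(x,y,z)
     *   f[YY] = sum_{x,y,z}  thx[x] * dthy[y] *  thz[z] * grid(x,y,z)
     *   f[ZZ] = sum_{x,y,z}  thx[x] *  thy[y] * dthz[z] * grid(x,y,z)
     * i.e. the derivative spline along one dimension and the plain splines
     * along the other two; the caller applies the charge and box factors.
     */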
    // TODO: Consider always having at least a dummy implementation of Simd
    // (enough for the first phase of two-phase lookup) and then using
    // enable_if instead of #ifdef.
#if PME_4NSIMD_GATHER
    /* Gather for one charge with pme_order=4 with unaligned SIMD4 load+store.
     * Uses 4N SIMD, where N is SIMD_WIDTH/4, to operate on all of z and N of y.
     * This code does not assume any memory alignment for the grid.
     */
    RVec operator()(std::integral_constant<int, 4> /*unused*/) const
    {
        const int norder = nn * 4;

        /* Pointer arithmetic alert, next six statements */
        const real* const gmx_restrict thx  = spline->theta.coefficients[XX] + norder;
        const real* const gmx_restrict thy  = spline->theta.coefficients[YY] + norder;
        const real* const gmx_restrict thz  = spline->theta.coefficients[ZZ] + norder;
        const real* const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
        const real* const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
        const real* const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
        Simd4NReal fx_S = setZero();
        Simd4NReal fy_S = setZero();
        Simd4NReal fz_S = setZero();

        /* With order 4 the z-spline is actually aligned */
        const Simd4NReal tz_S = load4DuplicateN(thz);
        const Simd4NReal dz_S = load4DuplicateN(dthz);
        for (int ithx = 0; ithx < 4; ithx++)
        {
            const int        index_x = (idxX + ithx) * gridNY * gridNZ;
            const Simd4NReal tx_S    = Simd4NReal(thx[ithx]);
            const Simd4NReal dx_S    = Simd4NReal(dthx[ithx]);

            for (int ithy = 0; ithy < 4; ithy += GMX_SIMD4N_REAL_WIDTH / 4)
            {
                const int index_xy = index_x + (idxY + ithy) * gridNZ;

                const Simd4NReal ty_S = loadUNDuplicate4(thy + ithy);
                const Simd4NReal dy_S = loadUNDuplicate4(dthy + ithy);

                const Simd4NReal gval_S = loadU4NOffset(grid + index_xy + idxZ, gridNZ);

                const Simd4NReal fxy1_S = tz_S * gval_S;
                const Simd4NReal fz1_S  = dz_S * gval_S;

                fx_S = fma(dx_S * ty_S, fxy1_S, fx_S);
                fy_S = fma(tx_S * dy_S, fxy1_S, fy_S);
                fz_S = fma(tx_S * ty_S, fz1_S, fz_S);
            }
        }

        return { reduce(fx_S), reduce(fy_S), reduce(fz_S) };
    }
#endif
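    /* Editorial illustration: with, e.g., GMX_SIMD4N_REAL_WIDTH == 8 (N == 2),
     * each loadU4NOffset(grid + index_xy + idxZ, gridNZ) call above loads the
     * four z-values of two consecutive y-rows (the second row offset by
     * gridNZ), so one fma triple covers 8 grid points and the y-loop runs
     * twice instead of four times. reduce() then sums all lanes to a scalar.
     */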
#ifdef PME_SIMD4_SPREAD_GATHER
    /* Load order elements from unaligned memory into two 4-wide SIMD */
    template<int order>
    static inline void loadOrderU(const real* data,
                                  std::integral_constant<int, order> /*unused*/,
                                  int        offset,
                                  Simd4Real* S0,
                                  Simd4Real* S1)
    {
#    ifdef PME_SIMD4_UNALIGNED
        *S0 = load4U(data - offset);
        *S1 = load4U(data - offset + 4);
#    else
        alignas(GMX_SIMD_ALIGNMENT) real buf_aligned[GMX_SIMD4_WIDTH * 2];
        /* Copy data to an aligned buffer */
        for (int i = 0; i < order; i++)
        {
            buf_aligned[offset + i] = data[i];
        }
        *S0 = load4(buf_aligned);
        *S1 = load4(buf_aligned + 4);
#    endif
    }
#endif
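    /* Editorial note on usage (see the aligned operator() below): the caller
     * passes offset = idxZ & 3, so the spline data is loaded shifted to line
     * up with 4-aligned grid loads. The lanes outside [offset, offset + order)
     * contain garbage and are zeroed afterwards with precomputed masks.
     */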
#ifdef PME_SIMD4_SPREAD_GATHER
    /* This code assumes that the grid is allocated 4-real aligned
     * and that pme->pmegrid_nz is a multiple of 4.
     * This code supports pme_order <= 5.
     */
    template<int Order>
    std::enable_if_t<Order == 4 || Order == 5, RVec> operator()(std::integral_constant<int, Order> order) const
    {
        const int norder = nn * order;
        GMX_ASSERT(gridNZ % 4 == 0,
                   "For aligned SIMD4 operations the grid size has to be padded up to a multiple "
                   "of 4");
        /* Pointer arithmetic alert, next six statements */
        const real* const gmx_restrict thx  = spline->theta.coefficients[XX] + norder;
        const real* const gmx_restrict thy  = spline->theta.coefficients[YY] + norder;
        const real* const gmx_restrict thz  = spline->theta.coefficients[ZZ] + norder;
        const real* const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
        const real* const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
        const real* const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
        struct pme_spline_work* const work = pme->spline_work;

        const int offset = idxZ & 3;
        Simd4Real fx_S = setZero();
        Simd4Real fy_S = setZero();
        Simd4Real fz_S = setZero();

        Simd4Real tz_S0, tz_S1, dz_S0, dz_S1;
        loadOrderU(thz, order, offset, &tz_S0, &tz_S1);
        loadOrderU(dthz, order, offset, &dz_S0, &dz_S1);

        tz_S0 = selectByMask(tz_S0, work->mask_S0[offset]);
        dz_S0 = selectByMask(dz_S0, work->mask_S0[offset]);
        tz_S1 = selectByMask(tz_S1, work->mask_S1[offset]);
        dz_S1 = selectByMask(dz_S1, work->mask_S1[offset]);
        for (int ithx = 0; (ithx < order); ithx++)
        {
            const int       index_x = (idxX + ithx) * gridNY * gridNZ;
            const Simd4Real tx_S    = Simd4Real(thx[ithx]);
            const Simd4Real dx_S    = Simd4Real(dthx[ithx]);

            for (int ithy = 0; (ithy < order); ithy++)
            {
                const int       index_xy = index_x + (idxY + ithy) * gridNZ;
                const Simd4Real ty_S     = Simd4Real(thy[ithy]);
                const Simd4Real dy_S     = Simd4Real(dthy[ithy]);

                const Simd4Real gval_S0 = load4(grid + index_xy + idxZ - offset);
                const Simd4Real gval_S1 = load4(grid + index_xy + idxZ - offset + 4);

                const Simd4Real fxy1_S0 = tz_S0 * gval_S0;
                const Simd4Real fz1_S0  = dz_S0 * gval_S0;
                const Simd4Real fxy1_S1 = tz_S1 * gval_S1;
                const Simd4Real fz1_S1  = dz_S1 * gval_S1;

                const Simd4Real fxy1_S = fxy1_S0 + fxy1_S1;
                const Simd4Real fz1_S  = fz1_S0 + fz1_S1;

                fx_S = fma(dx_S * ty_S, fxy1_S, fx_S);
                fy_S = fma(tx_S * dy_S, fxy1_S, fy_S);
                fz_S = fma(tx_S * ty_S, fz1_S, fz_S);
            }
        }

        return { reduce(fx_S), reduce(fy_S), reduce(fz_S) };
    }
#endif
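    /* Editorial note: this is why pme_order <= 5 is supported here. With
     * offset = idxZ & 3 in [0, 3], the masked z-span offset + order is at
     * most 3 + 5 = 8 reals, which always fits in the two 4-wide registers
     * S0 and S1.
     */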
    const gmx_pme_t* const                 pme;
    const real* const gmx_restrict         grid;
    const PmeAtomComm* const gmx_restrict  atc;
    const splinedata_t* const gmx_restrict spline;
    const int                              nn;

    const int gridNY = pme->pmegrid_ny;
    const int gridNZ = pme->pmegrid_nz;

    const int* const idxptr = atc->idx[spline->ind[nn]];
    const int        idxX   = idxptr[XX];
    const int        idxY   = idxptr[YY];
    const int        idxZ   = idxptr[ZZ];
};
void gather_f_bsplines(const gmx_pme_t*    pme,
                       const real*         grid,
                       gmx_bool            bClearF,
                       const PmeAtomComm*  atc,
                       const splinedata_t* spline,
                       real                scale)
{
    /* Sum forces for local particles */

    const int order = pme->pme_order;
    const int nx    = pme->nkx;
    const int ny    = pme->nky;
    const int nz    = pme->nkz;

    const real rxx = pme->recipbox[XX][XX];
    const real ryx = pme->recipbox[YY][XX];
    const real ryy = pme->recipbox[YY][YY];
    const real rzx = pme->recipbox[ZZ][XX];
    const real rzy = pme->recipbox[ZZ][YY];
    const real rzz = pme->recipbox[ZZ][ZZ];

    /* Extract the buffer for force output */
    rvec* gmx_restrict force = as_rvec_array(atc->f.data());
    /* Note that unrolling this loop by templating this function on order
     * deteriorates performance significantly with gcc5/6/7.
     */
    for (int nn = 0; nn < spline->n; nn++)
    {
        const int  n           = spline->ind[nn];
        const real coefficient = scale * atc->coefficient[n];

        if (bClearF)
        {
            force[n][XX] = 0;
            force[n][YY] = 0;
            force[n][ZZ] = 0;
        }

        if (coefficient != 0)
        {
            RVec       f;
            const auto spline_func = do_fspline(pme, grid, atc, spline, nn);

            /* Convert common run-time orders to compile-time constants */
            switch (order)
            {
                case 4: f = spline_func(std::integral_constant<int, 4>()); break;
                case 5: f = spline_func(std::integral_constant<int, 5>()); break;
                default: f = spline_func(order); break;
            }
            force[n][XX] += -coefficient * (f[XX] * nx * rxx);
            force[n][YY] += -coefficient * (f[XX] * nx * ryx + f[YY] * ny * ryy);
            force[n][ZZ] += -coefficient * (f[XX] * nx * rzx + f[YY] * ny * rzy + f[ZZ] * nz * rzz);
        }
    }
}
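/* Editorial note: the final three force statements in gather_f_bsplines
 * convert f from fractional grid coordinates to Cartesian: each term is
 * scaled by a grid dimension (nx, ny, nz) and a reciprocal-box element,
 * and negated because the force is minus the gradient of the potential.
 */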
/* Since the energy and not the force is interpolated,
 * the net force might not be exactly zero.
 * This can be solved by also interpolating F, but
 * that comes at a cost.
 * A better hack is to remove the net force every
 * step, but that must be done at a higher level,
 * since this routine doesn't see all atoms when running
 * in parallel. How important this is remains unclear. EL 990726
 */
real gather_energy_bsplines(gmx_pme_t* pme, const real* grid, PmeAtomComm* atc)
{
    splinedata_t* spline;
    int           ithx, ithy, ithz, i0, j0, k0;
    int           index_x, index_xy;
    int*          idxptr;
    real          energy, pot, tx, ty, coefficient, gval;
    real *        thx, *thy, *thz;
    int           norder;
    int           order;

    spline = &atc->spline[0];

    order = pme->pme_order;

    energy = 0;
    for (int n = 0; n < atc->numAtoms(); n++)
    {
        coefficient = atc->coefficient[n];

        if (coefficient != 0)
        {
            idxptr = atc->idx[n];
            norder = n * order;

            i0 = idxptr[XX];
            j0 = idxptr[YY];
            k0 = idxptr[ZZ];
            /* Pointer arithmetic alert, next three statements */
            thx = spline->theta.coefficients[XX] + norder;
            thy = spline->theta.coefficients[YY] + norder;
            thz = spline->theta.coefficients[ZZ] + norder;
            pot = 0;
            for (ithx = 0; (ithx < order); ithx++)
            {
                index_x = (i0 + ithx) * pme->pmegrid_ny * pme->pmegrid_nz;
                tx      = thx[ithx];

                for (ithy = 0; (ithy < order); ithy++)
                {
                    index_xy = index_x + (j0 + ithy) * pme->pmegrid_nz;
                    ty       = thy[ithy];

                    for (ithz = 0; (ithz < order); ithz++)
                    {
                        gval = grid[index_xy + (k0 + ithz)];
                        pot += tx * ty * thz[ithz] * gval;
                    }
                }
            }

            energy += pot * coefficient;
        }
    }

    return energy;
}