 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief
 *  Utility constants and function declarations for the CUDA non-bonded kernels.
 *  This header should be included once at the top level, just before the
 *  kernels are included (it has to be preceded by nbnxn_cuda_types.h).
 *
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \ingroup module_nbnxm
 */
/* Note that floating-point constants in CUDA code should be suffixed
 * with f (e.g. 0.5f), to stop the compiler producing intermediate
 * code that is in double precision.
 */
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"

#include "nbnxm_cuda_types.h"
#ifndef NBNXM_CUDA_KERNEL_UTILS_CUH
#    define NBNXM_CUDA_KERNEL_UTILS_CUH
/*! \brief Log of the i and j cluster size.
 *  Change this together with c_clSize! */
static const int __device__ c_clSizeLog2 = 3;
/*! \brief Square of cluster size. */
static const int __device__ c_clSizeSq = c_clSize * c_clSize;
/*! \brief j-cluster size after split (4 in the current implementation). */
static const int __device__ c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit;
/*! \brief Stride in the force accumulation buffer */
static const int __device__ c_fbufStride = c_clSizeSq;
/*! \brief i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster (= 8) bits set */
static const unsigned __device__ superClInteractionMask =
        ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);

static const float __device__ c_oneSixth    = 0.16666667f;
static const float __device__ c_oneTwelveth = 0.08333333f;
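
/* Worked example of the cluster-size arithmetic above (values as implied by
 * c_clSizeLog2 = 3 and the "4 in the current implementation" note, i.e. assuming
 * c_clSize = 8 and c_nbnxnGpuClusterpairSplit = 2 from nbnxm_cuda_types.h):
 *
 *   c_clSize      = 1 << c_clSizeLog2 = 8
 *   c_clSizeSq    = 8 * 8             = 64
 *   c_splitClSize = 8 / 2             = 4
 *   c_fbufStride  = 64   (one float per thread of an 8x8 cluster pair, per component)
 *
 * so each warp of 32 threads covers one half (one split) of an 8x8 i-j cluster pair.
 */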
/*! Convert LJ sigma,epsilon parameters to C6,C12. */
static __forceinline__ __device__ void
convert_sigma_epsilon_to_c6_c12(const float sigma, const float epsilon, float* c6, float* c12)
{
    float sigma2, sigma6;

    sigma2 = sigma * sigma;
    sigma6 = sigma2 * sigma2 * sigma2;
    *c6    = epsilon * sigma6;
    *c12   = *c6 * sigma6;
}
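
/* For reference, the textbook Lennard-Jones relations are C6 = 4*eps*sigma^6 and
 * C12 = 4*eps*sigma^12 = C6*sigma^6. The conversion above computes C6 = eps*sigma^6,
 * i.e. it assumes that any prefactor such as the factor 4 is already folded into the
 * epsilon value passed in by the caller.
 */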
/*! Apply force switch, force-only version. */
static __forceinline__ __device__ void
calculate_force_switch_F(const cu_nbparam_t nbparam, float c6, float c12, float inv_r, float r2, float* F_invr)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam.dispersion_shift.c2;
    float disp_shift_V3 = nbparam.dispersion_shift.c3;
    float repu_shift_V2 = nbparam.repulsion_shift.c2;
    float repu_shift_V3 = nbparam.repulsion_shift.c3;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
}
/*! Apply force switch, force + energy version. */
static __forceinline__ __device__ void calculate_force_switch_F_E(const cu_nbparam_t nbparam,
                                                                  float              c6,
                                                                  float              c12,
                                                                  float              inv_r,
                                                                  float              r2,
                                                                  float*             F_invr,
                                                                  float*             E_lj)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam.dispersion_shift.c2;
    float disp_shift_V3 = nbparam.dispersion_shift.c3;
    float repu_shift_V2 = nbparam.repulsion_shift.c2;
    float repu_shift_V3 = nbparam.repulsion_shift.c3;

    float disp_shift_F2 = nbparam.dispersion_shift.c2 / 3;
    float disp_shift_F3 = nbparam.dispersion_shift.c3 / 4;
    float repu_shift_F2 = nbparam.repulsion_shift.c2 / 3;
    float repu_shift_F3 = nbparam.repulsion_shift.c3 / 4;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
    *E_lj += c6 * (disp_shift_F2 + disp_shift_F3 * r_switch) * r_switch * r_switch * r_switch
             - c12 * (repu_shift_F2 + repu_shift_F3 * r_switch) * r_switch * r_switch * r_switch;
}
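
/* Sketch of the force-switch math implemented by the two functions above. With
 * s = r - rvdw_switch clamped to s >= 0, and A = shift.c2, B = shift.c3 of either the
 * dispersion (c6) or repulsion (c12) term, each term contributes
 *
 *   to the force:   -/+ c * (A + B*s) * s^2        (times inv_r, since F/r is stored)
 *   to the energy:  +/- c * (A/3 * s^3 + B/4 * s^4)
 *
 * The 1/3 and 1/4 factors in the F+E variant make the energy term the integral of the
 * force term, so F = -dE/dr is preserved by the switch.
 */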
/*! Apply potential switch, force-only version. */
static __forceinline__ __device__ void
calculate_potential_switch_F(const cu_nbparam_t nbparam, float inv_r, float r2, float* F_invr, float* E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam.vdw_switch.c3;
    float switch_V4 = nbparam.vdw_switch.c4;
    float switch_V5 = nbparam.vdw_switch.c5;
    float switch_F2 = 3 * nbparam.vdw_switch.c3;
    float switch_F3 = 4 * nbparam.vdw_switch.c4;
    float switch_F4 = 5 * nbparam.vdw_switch.c5;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
    if (r_switch > 0.0f)
    {
        sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
        dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

        *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
    }
}
/*! Apply potential switch, force + energy version. */
static __forceinline__ __device__ void
calculate_potential_switch_F_E(const cu_nbparam_t nbparam, float inv_r, float r2, float* F_invr, float* E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam.vdw_switch.c3;
    float switch_V4 = nbparam.vdw_switch.c4;
    float switch_V5 = nbparam.vdw_switch.c5;
    float switch_F2 = 3 * nbparam.vdw_switch.c3;
    float switch_F3 = 4 * nbparam.vdw_switch.c4;
    float switch_F4 = 5 * nbparam.vdw_switch.c5;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    /* Unlike in the F-only kernel, masking is faster here */
    sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
    dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

    *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
    *E_lj *= sw;
}
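
/* Sketch of the potential-switch math above, with x = r - rvdw_switch >= 0:
 *
 *   sw(x)  = 1 + c3*x^3 + c4*x^4 + c5*x^5
 *   dsw(x) = 3*c3*x^2 + 4*c4*x^3 + 5*c5*x^4 = d(sw)/dx
 *
 * The switched pair energy is E*sw and, by the product rule, the switched force is
 * F*sw - E*dsw. Since the kernels accumulate F/r rather than F, the E*dsw term is
 * multiplied by inv_r: F_invr = F_invr*sw - inv_r*E*dsw.
 */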
/*! \brief Fetch C6 grid contribution coefficients and return the product of these.
 *
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
static __forceinline__ __device__ float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam, int typei, int typej)
{
#    if DISABLE_CUDA_TEXTURES
    return LDG(&nbparam.nbfp_comb[2 * typei]) * LDG(&nbparam.nbfp_comb[2 * typej]);
#    else
    return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * typei)
           * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * typej);
#    endif /* DISABLE_CUDA_TEXTURES */
}
/*! Calculate LJ-PME grid force contribution with
 *  geometric combination rule.
 */
static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
                                                                      int                typei,
                                                                      int                typej,
                                                                      float              r2,
                                                                      float              inv_r2,
                                                                      float              lje_coeff2,
                                                                      float              lje_coeff6_6,
                                                                      float*             F_invr)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;

    c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
}
/*! Calculate LJ-PME grid force + energy contribution with
 *  geometric combination rule.
 */
static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
                                                                        int                typei,
                                                                        int                typej,
                                                                        float              r2,
                                                                        float              inv_r2,
                                                                        float              lje_coeff2,
                                                                        float              lje_coeff6_6,
                                                                        float              int_bit,
                                                                        float*             F_invr,
                                                                        float*             E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;

    c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    /* Shift should be applied only to real LJ pairs */
    sh_mask = nbparam.sh_lj_ewald * int_bit;
    *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
}
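
/* The two LJ-PME (geometric rule) helpers above remove the long-range grid part of the
 * dispersion interaction from the real-space term. With b the LJ-Ewald coefficient and
 * poly = 1 + b^2 r^2 + (b^2 r^2)^2 / 2 (cr2/expmcr2/poly in the code), the subtracted
 * force term per pair is proportional to
 *
 *   c6grid * ( 1/r^6 - exp(-b^2 r^2) * (poly/r^6 + b^6/6) ) / r^2
 *
 * and the energy term to c6grid/6 * ( 1/r^6 * (1 - exp(-b^2 r^2)*poly) + shift ),
 * where lje_coeff6_6 is assumed to hold b^6/6 and the shift is masked by int_bit so
 * that it only applies to non-excluded pairs.
 */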
/*! Fetch per-type LJ parameters.
 *
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
static __forceinline__ __device__ float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam, int type)
{
    float2 c6c12;
#    if DISABLE_CUDA_TEXTURES
    /* Force an 8-byte fetch to save a memory instruction. */
    float2* nbfp_comb = (float2*)nbparam.nbfp_comb;
    c6c12             = LDG(&nbfp_comb[type]);
#    else
    /* NOTE: as we always do 8-byte aligned loads, we could
       fetch float2 here too just as above. */
    c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * type);
    c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * type + 1);
#    endif /* DISABLE_CUDA_TEXTURES */

    return c6c12;
}
/*! Calculate LJ-PME grid force + energy contribution (if E_lj != nullptr) with
 *  Lorentz-Berthelot combination rule.
 *  We use a single F+E kernel with conditional because the performance impact
 *  of this is pretty small and LB on the CPU is anyway very slow.
 */
static __forceinline__ __device__ void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
                                                                      int                typei,
                                                                      int                typej,
                                                                      float              r2,
                                                                      float              inv_r2,
                                                                      float              lje_coeff2,
                                                                      float              lje_coeff6_6,
                                                                      float              int_bit,
                                                                      float*             F_invr,
                                                                      float*             E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
    float sigma, sigma2, epsilon;

    /* sigma and epsilon are scaled to give 6*C6 */
    float2 c6c12_i = fetch_nbfp_comb_c6_c12(nbparam, typei);
    float2 c6c12_j = fetch_nbfp_comb_c6_c12(nbparam, typej);

    sigma   = c6c12_i.x + c6c12_j.x;
    epsilon = c6c12_i.y * c6c12_j.y;

    sigma2 = sigma * sigma;
    c6grid = epsilon * sigma2 * sigma2 * sigma2;

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    if (E_lj != nullptr)
    {
        float sh_mask;

        /* Shift should be applied only to real LJ pairs */
        sh_mask = nbparam.sh_lj_ewald * int_bit;
        *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
    }
}
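
/* Note on the combination rule above: textbook Lorentz-Berthelot combines parameters as
 * sigma_ij = (sigma_i + sigma_j)/2 and eps_ij = sqrt(eps_i * eps_j), whereas this helper
 * simply adds the stored x components and multiplies the y components. That is only
 * consistent if the per-type table already stores suitably pre-scaled values (absorbing
 * the 1/2 and the square root) so that eps*sigma^6 yields the intended 6*C6, as the
 * "scaled to give 6*C6" comment indicates; the exact scaling is defined by the table
 * setup in the host code, not by this kernel helper.
 */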
/*! Fetch two consecutive values from the Ewald correction F*r table.
 *
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
static __forceinline__ __device__ float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam, int index)
{
    float2 d;

#    if DISABLE_CUDA_TEXTURES
    /* Can't do 8-byte fetch because some of the addresses will be misaligned. */
    d.x = LDG(&nbparam.coulomb_tab[index]);
    d.y = LDG(&nbparam.coulomb_tab[index + 1]);
#    else
    d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
    d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
#    endif // DISABLE_CUDA_TEXTURES

    return d;
}
/*! Linear interpolation using exactly two FMA operations.
 *
 *  Implements numeric equivalent of: (1-t)*d0 + t*d1
 *  Note that CUDA does not have fnms, otherwise we'd use
 *  fma(t, d1, fnms(t, d0, d0)),
 *  but input modifiers are designed for this and are fast.
 */
template<typename T>
__forceinline__ __host__ __device__ T lerp(T d0, T d1, T t)
{
    return fma(t, d1, fma(-t, d0, d0));
}
/*! Interpolate Ewald coulomb force correction using the F*r table. */
static __forceinline__ __device__ float interpolate_coulomb_force_r(const cu_nbparam_t nbparam, float r)
{
    float normalized = nbparam.coulomb_tab_scale * r;
    int   index      = (int)normalized;
    float fraction   = normalized - index;

    float2 d01 = fetch_coulomb_force_r(nbparam, index);

    return lerp(d01.x, d01.y, fraction);
}
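
/* Worked example of the table lookup above (the numbers are illustrative only): with
 * coulomb_tab_scale = 100 1/nm and r = 0.2537 nm, normalized = 25.37, so index = 25 and
 * fraction = 0.37; the return value is then
 *
 *   lerp(tab[25], tab[26], 0.37) = fma(0.37, tab[26], fma(-0.37, tab[25], tab[25]))
 *                                = 0.63*tab[25] + 0.37*tab[26]
 *
 * i.e. plain linear interpolation between two consecutive F*r table entries, evaluated
 * with the two FMAs of lerp().
 */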
/*! Fetch C6 and C12 from the parameter table.
 *
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12, const cu_nbparam_t nbparam, int baseIndex)
{
#    if DISABLE_CUDA_TEXTURES
    /* Force an 8-byte fetch to save a memory instruction. */
    float2* nbfp = (float2*)nbparam.nbfp;
    float2  c6c12;

    c6c12 = LDG(&nbfp[baseIndex]);
    c6    = c6c12.x;
    c12   = c6c12.y;
#    else
    /* NOTE: as we always do 8-byte aligned loads, we could
       fetch float2 here too just as above. */
    c6  = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * baseIndex);
    c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * baseIndex + 1);
#    endif // DISABLE_CUDA_TEXTURES
}
/*! Calculate analytical Ewald correction term. */
static __forceinline__ __device__ float pmecorrF(float z2)
{
    const float FN6 = -1.7357322914161492954e-8f;
    const float FN5 = 1.4703624142580877519e-6f;
    const float FN4 = -0.000053401640219807709149f;
    const float FN3 = 0.0010054721316683106153f;
    const float FN2 = -0.019278317264888380590f;
    const float FN1 = 0.069670166153766424023f;
    const float FN0 = -0.75225204789749321333f;

    const float FD4 = 0.0011193462567257629232f;
    const float FD3 = 0.014866955030185295499f;
    const float FD2 = 0.11583842382862377919f;
    const float FD1 = 0.50736591960530292870f;
    const float FD0 = 1.0f;

    float z4;
    float polyFN0, polyFN1, polyFD0, polyFD1;

    z4 = z2 * z2;

    polyFD0 = FD4 * z4 + FD2;
    polyFD1 = FD3 * z4 + FD1;
    polyFD0 = polyFD0 * z4 + FD0;
    polyFD0 = polyFD1 * z2 + polyFD0;

    polyFD0 = 1.0f / polyFD0;

    polyFN0 = FN6 * z4 + FN4;
    polyFN1 = FN5 * z4 + FN3;
    polyFN0 = polyFN0 * z4 + FN2;
    polyFN1 = polyFN1 * z4 + FN1;
    polyFN0 = polyFN0 * z4 + FN0;
    polyFN0 = polyFN1 * z2 + polyFN0;

    return polyFN0 * polyFD0;
}
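
/* Evaluation scheme used by pmecorrF() above: both the numerator (FN6..FN0) and the
 * denominator (FD4..FD0) are polynomials in z2, but each is evaluated as two
 * interleaved Horner chains in z4 (even and odd coefficients) that are combined with a
 * final step in z2. This halves the length of each dependency chain and exposes
 * instruction-level parallelism; the returned value is the rational approximation
 * numerator(z2) / denominator(z2) used as the analytical Ewald force-correction factor.
 */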
/*! Final j-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static __forceinline__ __device__ void
reduce_force_j_generic(float* f_buf, float3* fout, int tidxi, int tidxj, int aidx)
{
    if (tidxi < 3)
    {
        float f = 0.0f;
        for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
        {
            f += f_buf[c_fbufStride * tidxi + j];
        }
        atomicAdd((&fout[aidx].x) + tidxi, f);
    }
}
/*! Final j-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__ void
reduce_force_j_warp_shfl(float3 f, float3* fout, int tidxi, int aidx, const unsigned int activemask)
{
    f.x += __shfl_down_sync(activemask, f.x, 1);
    f.y += __shfl_up_sync(activemask, f.y, 1);
    f.z += __shfl_down_sync(activemask, f.z, 1);
    if (tidxi & 1)
    {
        f.x = f.y;
    }
    f.x += __shfl_down_sync(activemask, f.x, 2);
    f.z += __shfl_up_sync(activemask, f.z, 2);
    if (tidxi & 2)
    {
        f.x = f.z;
    }
    f.x += __shfl_down_sync(activemask, f.x, 4);
    if (tidxi < 3)
    {
        atomicAdd((&fout[aidx].x) + tidxi, f.x);
    }
}
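
/* How the shuffle reduction above works: the three force components of a j atom are
 * reduced simultaneously over one row of c_clSize (assumed 8) lanes. After the stride-1
 * shuffles, even lanes hold pairwise x sums and odd lanes pairwise y sums, so odd lanes
 * move y into f.x; after the stride-2 shuffles, lanes with (tidxi & 2) pick up z; the
 * final stride-4 shuffle completes the 8-lane sum, leaving the x, y and z totals in the
 * first three lanes of the row (tidxi = 0, 1, 2), which are the only lanes that issue
 * an atomicAdd.
 */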
/*! Final i-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 *  TODO: add the tidxi < 3 trick
 */
static __forceinline__ __device__ void reduce_force_i_generic(
        float* f_buf, float3* fout, float* fshift_buf, bool bCalcFshift, int tidxi, int tidxj, int aidx)
{
    if (tidxj < 3)
    {
        float f = 0.0f;
        for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
        {
            f += f_buf[tidxj * c_fbufStride + j];
        }
        atomicAdd(&fout[aidx].x + tidxj, f);
        if (bCalcFshift)
        {
            *fshift_buf += f;
        }
    }
}
/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__ void reduce_force_i_pow2(
        volatile float* f_buf, float3* fout, float* fshift_buf, bool bCalcFshift, int tidxi, int tidxj, int aidx)
{
    int   i, j;
    float f;

    assert(c_clSize == 1 << c_clSizeLog2);

    /* Reduce the initial c_clSize values for each i atom to half
     * every step by using c_clSize * i threads.
     * Can't just use i as loop variable because then nvcc refuses to unroll.
     */
    i = c_clSize / 2;
    for (j = c_clSizeLog2 - 1; j > 0; j--)
    {
        if (tidxj < i)
        {
            f_buf[tidxj * c_clSize + tidxi] += f_buf[(tidxj + i) * c_clSize + tidxi];
            f_buf[c_fbufStride + tidxj * c_clSize + tidxi] +=
                    f_buf[c_fbufStride + (tidxj + i) * c_clSize + tidxi];
            f_buf[2 * c_fbufStride + tidxj * c_clSize + tidxi] +=
                    f_buf[2 * c_fbufStride + (tidxj + i) * c_clSize + tidxi];
        }
        i >>= 1;
    }

    /* i == 1, last reduction step, writing to global mem */
    if (tidxj < 3)
    {
        /* tidxj*c_fbufStride selects x, y or z */
        f = f_buf[tidxj * c_fbufStride + tidxi] + f_buf[tidxj * c_fbufStride + i * c_clSize + tidxi];
        atomicAdd(&(fout[aidx].x) + tidxj, f);
        if (bCalcFshift)
        {
            *fshift_buf += f;
        }
    }
}
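
/* Shape of the shared-memory tree reduction above, assuming c_clSize = 8: each of the
 * three components occupies c_fbufStride = 64 floats of f_buf, and the 8 partial forces
 * per i atom are halved every loop iteration (8 -> 4 -> 2); the remaining pair is summed
 * in the final step, where threads with tidxj = 0, 1, 2 write the x, y and z totals with
 * one atomicAdd each. The same halving pattern is reused for the energy reduction
 * further below.
 */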
/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 *  on whether the size of the array to be reduced is power of two or not.
 */
static __forceinline__ __device__ void
reduce_force_i(float* f_buf, float3* f, float* fshift_buf, bool bCalcFshift, int tidxi, int tidxj, int ai)
{
    if ((c_clSize & (c_clSize - 1)))
    {
        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
    else
    {
        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
}
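
/* The dispatch test above relies on the usual bit trick: (n & (n - 1)) is zero exactly
 * when n is a power of two (e.g. 8 & 7 = 0, while 6 & 5 = 4), so with the current
 * c_clSize = 8 the pow2 path is always taken and the generic path is only a fallback.
 */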
/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__ void reduce_force_i_warp_shfl(float3             fin,
                                                                float3*            fout,
                                                                float*             fshift_buf,
                                                                bool               bCalcFshift,
                                                                int                tidxj,
                                                                int                aidx,
                                                                const unsigned int activemask)
{
    fin.x += __shfl_down_sync(activemask, fin.x, c_clSize);
    fin.y += __shfl_up_sync(activemask, fin.y, c_clSize);
    fin.z += __shfl_down_sync(activemask, fin.z, c_clSize);
    if (tidxj & 1)
    {
        fin.x = fin.y;
    }
    fin.x += __shfl_down_sync(activemask, fin.x, 2 * c_clSize);
    fin.z += __shfl_up_sync(activemask, fin.z, 2 * c_clSize);
    if (tidxj & 2)
    {
        fin.x = fin.z;
    }
    /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
    if ((tidxj & 3) < 3)
    {
        atomicAdd(&fout[aidx].x + (tidxj & 3), fin.x);
        if (bCalcFshift)
        {
            *fshift_buf += fin.x;
        }
    }
}
/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__ void
reduce_energy_pow2(volatile float* buf, float* e_lj, float* e_el, unsigned int tidx)
{
    float e1, e2;

    unsigned int i = warp_size / 2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
    for (int j = warp_size_log2 - 1; j > 0; j--)
    {
        if (tidx < i)
        {
            buf[tidx] += buf[tidx + i];
            buf[c_fbufStride + tidx] += buf[c_fbufStride + tidx + i];
        }
        i >>= 1;
    }

    // TODO do this on two threads - requires e_lj and e_el to be stored on adjacent
    // memory locations to make sense
    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        e1 = buf[tidx] + buf[tidx + i];
        e2 = buf[c_fbufStride + tidx] + buf[c_fbufStride + tidx + i];

        atomicAdd(e_lj, e1);
        atomicAdd(e_el, e2);
    }
}
/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__ void
reduce_energy_warp_shfl(float E_lj, float E_el, float* e_lj, float* e_el, int tidx, const unsigned int activemask)
{
    int i, sh;

    sh = 1;
    for (i = 0; i < 5; i++)
    {
        E_lj += __shfl_down_sync(activemask, E_lj, sh);
        E_el += __shfl_down_sync(activemask, E_el, sh);
        sh += sh;
    }

    /* The first thread in the warp writes the reduced energies */
    if (tidx == 0 || tidx == warp_size)
    {
        atomicAdd(e_lj, E_lj);
        atomicAdd(e_el, E_el);
    }
}
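
/* In the warp-level energy reduction above, sh doubles through 1, 2, 4, 8, 16, so the
 * five __shfl_down_sync steps fold all 32 lanes of a warp into its first lane. With the
 * assumed 64-thread (two-warp) cluster-pair layout, those first lanes are tidx == 0 and
 * tidx == warp_size, which is why exactly those two threads perform the atomicAdd of
 * the LJ and electrostatic totals to global memory.
 */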
#endif /* NBNXM_CUDA_KERNEL_UTILS_CUH */