 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \brief
 *  Utility constants and function declarations for the CUDA non-bonded kernels.
 *  This header should be included once at the top level, just before the
 *  kernels are included (has to be preceded by nbnxn_cuda_types.h).
 *
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \ingroup module_mdlib
 */
/* Note that floating-point constants in CUDA code should be suffixed
 * with f (e.g. 0.5f), to stop the compiler producing intermediate
 * code that is in double precision.
 */
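
/* Illustrative example (not from the original file): "r2 * 0.5f" keeps the
 * multiply in single precision, whereas "r2 * 0.5" promotes the operand and
 * computes the intermediate in double precision on the device. */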

#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"

#include "nbnxn_cuda_types.h"

#ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
#define NBNXN_CUDA_KERNEL_UTILS_CUH

/* Use texture objects if supported by the target hardware (and in host pass). */
#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
/* Note: convenience macro, needs to be undef-ed at the end of the file. */
#define USE_TEXOBJ
#endif

/*! \brief Log of the i and j cluster size.
 *  Change this together with c_clSize! */
static const int          c_clSizeLog2  = 3;
/*! \brief Square of cluster size. */
static const int          c_clSizeSq    = c_clSize*c_clSize;
/*! \brief j-cluster size after split (4 in the current implementation). */
static const int          c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
/*! \brief Stride in the force accumulation buffer */
static const int          c_fbufStride  = c_clSizeSq;

static const float        c_oneSixth    = 0.16666667f;
static const float        c_oneTwelveth = 0.08333333f;
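
/* Note: these are 1/6 and 1/12 in single precision. The kernels work with
 * pre-scaled Lennard-Jones coefficients (e.g. 6*C6, as noted for the LB
 * combination path below), so multiplying by these constants presumably
 * recovers the plain C6/C12 contributions when energies are accumulated. */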

/*! Convert LJ sigma,epsilon parameters to C6,C12. */
static __forceinline__ __device__
void convert_sigma_epsilon_to_c6_c12(const float sigma,
    sigma2 = sigma * sigma;
    sigma6 = sigma2 * sigma2 * sigma2;
    *c6    = epsilon * sigma6;
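
    /* With this scaled parameter representation, c6 = epsilon*sigma^6, and the
     * matching repulsion coefficient follows as c12 = c6*sigma^6
     * (= epsilon*sigma^12). */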

/*! Apply force switch, force-only version. */
static __forceinline__ __device__
void calculate_force_switch_F(const cu_nbparam_t nbparam,
    /* force switch constants */
    float disp_shift_V2 = nbparam.dispersion_shift.c2;
    float disp_shift_V3 = nbparam.dispersion_shift.c3;
    float repu_shift_V2 = nbparam.repulsion_shift.c2;
    float repu_shift_V3 = nbparam.repulsion_shift.c3;

    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr +=
        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;

/*! Apply force switch, force + energy version. */
static __forceinline__ __device__
void calculate_force_switch_F_E(const cu_nbparam_t nbparam,
    /* force switch constants */
    float disp_shift_V2 = nbparam.dispersion_shift.c2;
    float disp_shift_V3 = nbparam.dispersion_shift.c3;
    float repu_shift_V2 = nbparam.repulsion_shift.c2;
    float repu_shift_V3 = nbparam.repulsion_shift.c3;

    float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
    float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
    float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
    float repu_shift_F3 = nbparam.repulsion_shift.c3/4;

    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr +=
        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
    *E_lj   +=
        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
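
/* Note: the F2/F3 constants above are simply the V2/V3 force-switch
 * coefficients divided by 3 and 4. Integrating a force correction of the form
 * A*r_switch^2 + B*r_switch^3 over r yields a potential correction of
 * (A/3)*r_switch^3 + (B/4)*r_switch^4, which is what the energy update
 * accumulates (up to the sign convention used above). */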

/*! Apply potential switch, force-only version. */
static __forceinline__ __device__
void calculate_potential_switch_F(const cu_nbparam_t nbparam,
    /* potential switch constants */
    float switch_V3 = nbparam.vdw_switch.c3;
    float switch_V4 = nbparam.vdw_switch.c4;
    float switch_V5 = nbparam.vdw_switch.c5;
    float switch_F2 = 3*nbparam.vdw_switch.c3;
    float switch_F3 = 4*nbparam.vdw_switch.c4;
    float switch_F4 = 5*nbparam.vdw_switch.c5;

    r_switch = r - nbparam.rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
    sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
    dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;

    *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;

/*! Apply potential switch, force + energy version. */
static __forceinline__ __device__
void calculate_potential_switch_F_E(const cu_nbparam_t nbparam,
    /* potential switch constants */
    float switch_V3 = nbparam.vdw_switch.c3;
    float switch_V4 = nbparam.vdw_switch.c4;
    float switch_V5 = nbparam.vdw_switch.c5;
    float switch_F2 = 3*nbparam.vdw_switch.c3;
    float switch_F3 = 4*nbparam.vdw_switch.c4;
    float switch_F4 = 5*nbparam.vdw_switch.c5;

    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    /* Unlike in the F-only kernel, masking is faster here */
    sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
    dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;

    *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
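
/* Note: sw is the switching polynomial
 *     sw(x) = 1 + c3*x^3 + c4*x^4 + c5*x^5,  with x = r - rvdw_switch,
 * and dsw is its derivative with respect to r. Because F_invr holds F/r, the
 * product rule applied to V(r)*sw(r) gives the update used in both kernels:
 *     F_invr <- F_invr*sw - (E_lj/r)*dsw. */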

/*! \brief Fetch C6 grid contribution coefficients and return the product of these.
 *
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
static __forceinline__ __device__
float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam,
#if DISABLE_CUDA_TEXTURES
    return LDG(&nbparam.nbfp_comb[2*typei]) * LDG(&nbparam.nbfp_comb[2*typej]);
#else
#ifdef USE_TEXOBJ
    return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
#else
    return tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
#endif /* USE_TEXOBJ */
#endif /* DISABLE_CUDA_TEXTURES */

/*! Calculate LJ-PME grid force contribution with
 *  geometric combination rule.
 */
static __forceinline__ __device__
void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;

    c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;

/*! Calculate LJ-PME grid force + energy contribution with
 *  geometric combination rule.
 */
static __forceinline__ __device__
void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;

    c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;

    /* Shift should be applied only to real LJ pairs */
    sh_mask   = nbparam.sh_lj_ewald*int_bit;
    *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
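
/* Note: with x = cr2 (presumably (beta_LJ*r)^2, given the lje_coeff naming),
 * poly is the 2nd-order Taylor expansion of exp(x), so expmcr2*poly equals
 * exp(-x)*(1 + x + x^2/2), the standard LJ-PME real-space damping factor.
 * The energy term therefore amounts to
 *     (c6grid/6) * [ (1 - exp(-x)*(1 + x + x^2/2))/r^6 + shift ],
 * with c6grid presumably carrying the same 6*C6 pre-scaling noted for the LB
 * path below. */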

/*! Fetch per-type LJ parameters.
 *
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
static __forceinline__ __device__
float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam,
#if DISABLE_CUDA_TEXTURES
    /* Force an 8-byte fetch to save a memory instruction. */
    float2 *nbfp_comb = (float2 *)nbparam.nbfp_comb;
    c6c12 = LDG(&nbfp_comb[type]);
#else
    /* NOTE: as we always do 8-byte aligned loads, we could
       fetch float2 here too just as above. */
#ifdef USE_TEXOBJ
    c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type);
    c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type + 1);
#else
    c6c12.x = tex1Dfetch(nbfp_comb_texref, 2*type);
    c6c12.y = tex1Dfetch(nbfp_comb_texref, 2*type + 1);
#endif /* USE_TEXOBJ */
#endif /* DISABLE_CUDA_TEXTURES */

/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 *  Lorentz-Berthelot combination rule.
 *  We use a single F+E kernel with conditional because the performance impact
 *  of this is pretty small and LB on the CPU is anyway very slow.
 */
static __forceinline__ __device__
void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
    float sigma, sigma2, epsilon;

    /* sigma and epsilon are scaled to give 6*C6 */
    float2 c6c12_i = fetch_nbfp_comb_c6_c12(nbparam, typei);
    float2 c6c12_j = fetch_nbfp_comb_c6_c12(nbparam, typej);

    sigma   = c6c12_i.x + c6c12_j.x;
    epsilon = c6c12_i.y * c6c12_j.y;

    sigma2  = sigma*sigma;
    c6grid  = epsilon*sigma2*sigma2*sigma2;
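
    /* The plain sum and product above can implement the Lorentz-Berthelot rules
     * (sigma_ij = (sigma_i + sigma_j)/2, eps_ij = sqrt(eps_i*eps_j)) without a
     * division or sqrt in the inner loop, provided the per-type values stored
     * in nbfp_comb are pre-scaled accordingly (e.g. sigma_i/2 and sqrt(eps_i),
     * consistent with the 6*C6 scaling noted above). */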

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;

    /* Shift should be applied only to real LJ pairs */
    sh_mask   = nbparam.sh_lj_ewald*int_bit;
    *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);

/*! Fetch two consecutive values from the Ewald correction F*r table.
 *
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
static __forceinline__ __device__
float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam,
#if DISABLE_CUDA_TEXTURES
    /* Can't do 8-byte fetch because some of the addresses will be misaligned. */
    d.x = LDG(&nbparam.coulomb_tab[index]);
    d.y = LDG(&nbparam.coulomb_tab[index + 1]);
#else
#ifdef USE_TEXOBJ
    d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
    d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
#else
    d.x = tex1Dfetch(coulomb_tab_texref, index);
    d.y = tex1Dfetch(coulomb_tab_texref, index + 1);
#endif // USE_TEXOBJ
#endif // DISABLE_CUDA_TEXTURES

/*! Linear interpolation using exactly two FMA operations.
 *
 *  Implements numeric equivalent of: (1-t)*d0 + t*d1.
 *  Note that CUDA does not have fnms, otherwise we'd use
 *  fma(t, d1, fnms(t, d0, d0)),
 *  but input modifiers are designed for this and are fast.
 */
template <typename T>
__forceinline__ __host__ __device__
T lerp(T d0, T d1, T t)
    return fma(t, d1, fma(-t, d0, d0));

/*! Interpolate Ewald coulomb force correction using the F*r table. */
static __forceinline__ __device__
float interpolate_coulomb_force_r(const cu_nbparam_t nbparam,
    float normalized = nbparam.coulomb_tab_scale * r;
    int   index      = (int) normalized;
    float fraction   = normalized - index;

    float2 d01 = fetch_coulomb_force_r(nbparam, index);

    return lerp(d01.x, d01.y, fraction);
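
/* Note: the table holds the Ewald F*r correction sampled at r = k/coulomb_tab_scale,
 * so the value returned above is the piecewise-linear estimate
 *     F*r(r) ~= (1 - fraction)*tab[index] + fraction*tab[index + 1],
 * with index = floor(r*coulomb_tab_scale) and fraction its fractional part. */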

/*! Fetch C6 and C12 from the parameter table.
 *
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
static __forceinline__ __device__
void fetch_nbfp_c6_c12(float               &c6,
                       const cu_nbparam_t   nbparam,
#if DISABLE_CUDA_TEXTURES
    /* Force an 8-byte fetch to save a memory instruction. */
    float2 *nbfp = (float2 *)nbparam.nbfp;
    c6c12 = LDG(&nbfp[baseIndex]);
#else
    /* NOTE: as we always do 8-byte aligned loads, we could
       fetch float2 here too just as above. */
#ifdef USE_TEXOBJ
    c6  = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex);
    c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex + 1);
#else
    c6  = tex1Dfetch(nbfp_texref, 2*baseIndex);
    c12 = tex1Dfetch(nbfp_texref, 2*baseIndex + 1);
#endif // USE_TEXOBJ
#endif // DISABLE_CUDA_TEXTURES

/*! Calculate analytical Ewald correction term. */
static __forceinline__ __device__
float pmecorrF(float z2)
    const float FN6 = -1.7357322914161492954e-8f;
    const float FN5 = 1.4703624142580877519e-6f;
    const float FN4 = -0.000053401640219807709149f;
    const float FN3 = 0.0010054721316683106153f;
    const float FN2 = -0.019278317264888380590f;
    const float FN1 = 0.069670166153766424023f;
    const float FN0 = -0.75225204789749321333f;

    const float FD4 = 0.0011193462567257629232f;
    const float FD3 = 0.014866955030185295499f;
    const float FD2 = 0.11583842382862377919f;
    const float FD1 = 0.50736591960530292870f;
    const float FD0 = 1.0f;

    float       polyFN0, polyFN1, polyFD0, polyFD1;

    polyFD0 = FD4*z4 + FD2;
    polyFD1 = FD3*z4 + FD1;
    polyFD0 = polyFD0*z4 + FD0;
    polyFD0 = polyFD1*z2 + polyFD0;

    polyFD0 = 1.0f/polyFD0;

    polyFN0 = FN6*z4 + FN4;
    polyFN1 = FN5*z4 + FN3;
    polyFN0 = polyFN0*z4 + FN2;
    polyFN1 = polyFN1*z4 + FN1;
    polyFN0 = polyFN0*z4 + FN0;
    polyFN0 = polyFN1*z2 + polyFN0;

    return polyFN0*polyFD0;
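
/* Note: the ratio returned above is a rational approximation (degree 6 over
 * degree 4 in z^2, split into even/odd parts for instruction-level
 * parallelism). Judging from how the kernels use it, it appears to approximate
 * the analytical Ewald force-correction factor
 *     f(z) = 2*exp(-z^2)/(sqrt(pi)*z^2) - erf(z)/z^3,  with z^2 = (beta*r)^2,
 * whose z -> 0 limit -4/(3*sqrt(pi)) ~= -0.7523 matches FN0/FD0 above. */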

/*! Final j-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static __forceinline__ __device__
void reduce_force_j_generic(float *f_buf, float3 *fout,
                            int tidxi, int tidxj, int aidx)
    for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
        f += f_buf[c_fbufStride * tidxi + j];

    atomicAdd((&fout[aidx].x) + tidxi, f);

/*! Final j-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0.
 */
#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
static __forceinline__ __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
    f.x += __shfl_down(f.x, 1);
    f.y += __shfl_up  (f.y, 1);
    f.z += __shfl_down(f.z, 1);

    f.x += __shfl_down(f.x, 2);
    f.z += __shfl_up  (f.z, 2);

    f.x += __shfl_down(f.x, 4);

    atomicAdd((&fout[aidx].x) + tidxi, f.x);
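
/* Note on the shuffle reduction above: adding x/z shifted down while adding y
 * shifted up at strides 1, 2 and 4, presumably combined with lane-dependent
 * component swaps (e.g. f.x = f.y on odd lanes), packs the summed x, y and z
 * components into f.x of different lanes, so only a few lanes per j-cluster
 * need to issue the final atomicAdd. */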

/*! Final i-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 *  TODO: add the tidxi < 3 trick
 */
static __forceinline__ __device__
void reduce_force_i_generic(float *f_buf, float3 *fout,
                            float *fshift_buf, bool bCalcFshift,
                            int tidxi, int tidxj, int aidx)
    for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
        f += f_buf[tidxj * c_fbufStride + j];

    atomicAdd(&fout[aidx].x + tidxj, f);

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__
void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
                         float *fshift_buf, bool bCalcFshift,
                         int tidxi, int tidxj, int aidx)
    assert(c_clSize == 1 << c_clSizeLog2);

    /* Reduce the initial c_clSize values for each i atom to half
     * every step by using c_clSize * i threads.
     * Can't just use i as loop variable because then nvcc refuses to unroll.
     */
    for (j = c_clSizeLog2 - 1; j > 0; j--)
        f_buf[                  tidxj * c_clSize + tidxi] += f_buf[                  (tidxj + i) * c_clSize + tidxi];
        f_buf[    c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[    c_fbufStride + (tidxj + i) * c_clSize + tidxi];
        f_buf[2 * c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[2 * c_fbufStride + (tidxj + i) * c_clSize + tidxi];

    /* i == 1, last reduction step, writing to global mem */

    /* tidxj*c_fbufStride selects x, y or z */
    f = f_buf[tidxj * c_fbufStride + tidxi] +
        f_buf[tidxj * c_fbufStride + i * c_clSize + tidxi];

    atomicAdd(&(fout[aidx].x) + tidxj, f);

/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 *  on whether the size of the array to be reduced is power of two or not.
 */
static __forceinline__ __device__
void reduce_force_i(float *f_buf, float3 *f,
                    float *fshift_buf, bool bCalcFshift,
                    int tidxi, int tidxj, int ai)
    if ((c_clSize & (c_clSize - 1)))
        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    else
        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0.
 */
#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
static __forceinline__ __device__
void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                              float *fshift_buf, bool bCalcFshift,
    fin.x += __shfl_down(fin.x, c_clSize);
    fin.y += __shfl_up  (fin.y, c_clSize);
    fin.z += __shfl_down(fin.z, c_clSize);

    fin.x += __shfl_down(fin.x, 2*c_clSize);
    fin.z += __shfl_up  (fin.z, 2*c_clSize);

    /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
    atomicAdd(&fout[aidx].x + (tidxj & 3), fin.x);

        *fshift_buf += fin.x;

/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__
void reduce_energy_pow2(volatile float *buf,
                        float *e_lj, float *e_el,
    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
    for (j = warp_size_log2 - 1; j > 0; j--)
        buf[               tidx] += buf[               tidx + i];
        buf[c_fbufStride + tidx] += buf[c_fbufStride + tidx + i];

    // TODO do this on two threads - requires e_lj and e_el to be stored on adjacent
    //      memory locations to make sense
    /* last reduction step, writing to global mem */
    e1 = buf[               tidx] + buf[               tidx + i];
    e2 = buf[c_fbufStride + tidx] + buf[c_fbufStride + tidx + i];

/*! Energy reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0.
 */
#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
static __forceinline__ __device__
void reduce_energy_warp_shfl(float E_lj, float E_el,
                             float *e_lj, float *e_el,
    for (i = 0; i < 5; i++)
        E_lj += __shfl_down(E_lj, sh);
        E_el += __shfl_down(E_el, sh);

    /* The first thread in the warp writes the reduced energies */
    if (tidx == 0 || tidx == warp_size)
        atomicAdd(e_lj, E_lj);
        atomicAdd(e_el, E_el);
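
/* Note: the five shuffle iterations above form a warp-level tree reduction; sh
 * presumably doubles each iteration (1, 2, 4, 8, 16, i.e. log2(warp_size)
 * steps), after which lane 0 of each warp holds that warp's total and is the
 * thread that issues the atomicAdd. */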

#endif /* GMX_PTX_ARCH */

#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */