/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "kernel_gpu_ref.h"

#include <cmath>

#include <algorithm>

#include "gromacs/math/functions.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/pairlist.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/fatalerror.h"
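
/* Convenience constants for the GPU pair-list layout: a supercluster groups
 * c_nbnxnGpuNumClusterPerSupercluster i-clusters of c_nbnxnGpuClusterSize
 * atoms each, and the loops below walk this hierarchy explicitly.
 */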
static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
static const int c_clSize          = c_nbnxnGpuClusterSize;
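
/* Plain-C++ reference implementation of the nonbonded kernel that uses the
 * GPU-style (supercluster/cluster) pair list; it mirrors the traversal of the
 * CUDA/OpenCL kernels and is mainly intended as a reference and check for them.
 */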
void
nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu    *nbl,
                     const nbnxn_atomdata_t    *nbat,
                     const interaction_const_t *iconst,
                     rvec                      *shift_vec,
                     int                        force_flags,
                     int                        clearF,
                     gmx::ArrayRef<real>        f,
                     real                      *fshift,
                     real                      *Vc,
                     real                      *Vvdw)
{
    gmx_bool            bEner;
    gmx_bool            bEwald;
    const real         *Ftab = nullptr;
    real                rcut2, rvdw2, rlist2;
    int                 ntype;
    real                facel;
    int                 ish3;
    int                 sci;
    int                 cj4_ind0, cj4_ind1, cj4_ind;
    int                 ci, cj;
    int                 ic, jc, ia, ja, is, ifs, js, jfs, im, jm;
    int                 n0;
    int                 ggid;
    real                shX, shY, shZ;
    real                fscal, tx, ty, tz;
    real                rinvsq;
    real                iq;
    real                qq, vcoul = 0, krsq, vctot;
    int                 nti;
    int                 tj;
    real                rt, r, eps;
    real                rinvsix;
    real                Vvdwtot;
    real                Vvdw_rep, Vvdw_disp;
    real                ix, iy, iz, fix, fiy, fiz;
    real                jx, jy, jz;
    real                dx, dy, dz, rsq, rinv;
    real                int_bit, fexcl;
    real                c6, c12;
    const nbnxn_excl_t *excl[2];

    int                 npair_tot, npair;
    int                 nhwu, nhwu_pruned;
    if (nbl->na_ci != c_clSize)
    {
        gmx_fatal(FARGS, "The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d",
                  nbl->na_ci, c_clSize);
    }
    if (clearF == enbvClearFYes)
    {
        clear_f(nbat, 0, f.data());
    }
    bEner = ((force_flags & GMX_FORCE_ENERGY) != 0);

    bEwald = EEL_FULL(iconst->eeltype);
    if (bEwald)
    {
        Ftab = iconst->coulombEwaldTables->tableF.data();
    }
    rcut2  = iconst->rcoulomb*iconst->rcoulomb;
    rvdw2  = iconst->rvdw*iconst->rvdw;

    rlist2 = nbl->rlist*nbl->rlist;
    const int  *type     = nbat->params().type.data();
    facel                = iconst->epsfac;
    const real *shiftvec = shift_vec[0];
    const real *vdwparam = nbat->params().nbfp.data();
    ntype                = nbat->params().numTypes;

    const real *x = nbat->x().data();

    npair_tot   = 0;
    nhwu        = 0;
    nhwu_pruned = 0;
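
    /* nbat stores coordinates as xyzq per atom (nbat->xstride reals per atom,
     * with element 3 holding the charge) and forces with nbat->fstride reals
     * per atom; the index arithmetic below relies on this layout.
     */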
    for (const nbnxn_sci_t &nbln : nbl->sci)
    {
        ish3     = 3*nbln.shift;
        shX      = shiftvec[ish3];
        shY      = shiftvec[ish3+1];
        shZ      = shiftvec[ish3+2];
        cj4_ind0 = nbln.cj4_ind_start;
        cj4_ind1 = nbln.cj4_ind_end;
        sci      = nbln.sci;
        vctot    = 0;
        Vvdwtot  = 0;
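
        /* For the supercluster pair on the diagonal we first add the charge
         * self-interaction correction: both reaction-field and Ewald
         * electrostatics contain a q_i*q_i term for every charge, scaled below
         * by -facel*c_rf/2 or -facel*ewaldcoeff_q/sqrt(pi), respectively.
         */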
        if (nbln.shift == CENTRAL &&
            nbl->cj4[cj4_ind0].cj[0] == sci*c_numClPerSupercl)
        {
            /* we have the diagonal:
             * add the charge self interaction energy term
             */
            for (im = 0; im < c_numClPerSupercl; im++)
            {
                ci = sci*c_numClPerSupercl + im;
                for (ic = 0; ic < c_clSize; ic++)
                {
                    ia     = ci*c_clSize + ic;
                    iq     = x[ia*nbat->xstride+3];
                    vctot += iq*iq;
                }
            }
            if (!bEwald)
            {
                vctot *= -facel*0.5*iconst->c_rf;
            }
            else
            {
                /* last factor 1/sqrt(pi) */
                vctot *= -facel*iconst->ewaldcoeff_q*M_1_SQRTPI;
            }
        }
        for (cj4_ind = cj4_ind0; (cj4_ind < cj4_ind1); cj4_ind++)
        {
            excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
            excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
            for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
            {
                cj = nbl->cj4[cj4_ind].cj[jm];

                for (im = 0; im < c_numClPerSupercl; im++)
                {
                    /* We're only using the first imask,
                     * but here imei[1].imask is identical.
                     */
                    if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm*c_numClPerSupercl + im)) & 1)
                    {
                        gmx_bool within_rlist;

                        ci           = sci*c_numClPerSupercl + im;

                        within_rlist = FALSE;
                        npair        = 0;
                        for (ic = 0; ic < c_clSize; ic++)
                        {
                            ia  = ci*c_clSize + ic;

                            is  = ia*nbat->xstride;
                            ifs = ia*nbat->fstride;
                            ix  = shX + x[is+0];
                            iy  = shY + x[is+1];
                            iz  = shZ + x[is+2];
                            iq  = facel*x[is+3];
                            nti = ntype*2*type[ia];

                            fix = 0;
                            fiy = 0;
                            fiz = 0;
                            for (jc = 0; jc < c_clSize; jc++)
                            {
                                ja = cj*c_clSize + jc;

                                /* Skip the lower triangle within the diagonal cluster pair */
                                if (nbln.shift == CENTRAL &&
                                    ci == cj && ja <= ia)
                                {
                                    continue;
                                }
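
                                /* Exclusion bits are stored separately for each of the
                                 * c_nbnxnGpuClusterpairSplit halves of the j-cluster,
                                 * mirroring the warp split of the CUDA kernel. int_bit is
                                 * 1 for an included pair and 0 for an excluded one, so it
                                 * can simply scale the interaction terms below. For example,
                                 * with c_numClPerSupercl == 8 (the usual GPU setup), the
                                 * bit tested for a given (jm, im) is bit jm*8 + im.
                                 */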
                                constexpr int clusterPerSplit = c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;
                                int_bit = ((excl[jc/clusterPerSplit]->pair[(jc & (clusterPerSplit - 1))*c_clSize + ic]
                                            >> (jm*c_numClPerSupercl + im)) & 1);
                                js  = ja*nbat->xstride;
                                jfs = ja*nbat->fstride;
                                jx  = x[js+0];
                                jy  = x[js+1];
                                jz  = x[js+2];
                                dx  = ix - jx;
                                dy  = iy - jy;
                                dz  = iz - jz;
                                rsq = dx*dx + dy*dy + dz*dz;
                                if (rsq < rlist2)
                                {
                                    within_rlist = TRUE;
                                }
                                if (rsq >= rcut2)
                                {
                                    continue;
                                }

                                if (type[ia] != ntype-1 && type[ja] != ntype-1)
                                {
                                    npair++;
                                }

                                // Ensure the distance does not become so small that r^-12 overflows
                                rsq    = std::max(rsq, NBNXN_MIN_RSQ);

                                rinv   = gmx::invsqrt(rsq);
                                rinvsq = rinv*rinv;
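
                                /* Coulomb interaction: either reaction-field, or Ewald with a
                                 * tabulated short-range force and an erf()-based energy.
                                 */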
                                qq = iq*x[js+3];
                                if (!bEwald)
                                {
                                    krsq  = iconst->k_rf*rsq;
                                    fscal = qq*(int_bit*rinv - 2*krsq)*rinvsq;
                                    if (bEner)
                                    {
                                        vcoul = qq*(int_bit*rinv + krsq - iconst->c_rf);
                                    }
                                }
                                else
                                {
                                    r     = rsq*rinv;
                                    rt    = r*iconst->coulombEwaldTables->scale;
                                    n0    = static_cast<int>(rt);
                                    eps   = rt - n0;

                                    fexcl = (1 - eps)*Ftab[n0] + eps*Ftab[n0+1];

                                    fscal = qq*(int_bit*rinvsq - fexcl)*rinv;

                                    if (bEner)
                                    {
                                        vcoul = qq*((int_bit - std::erf(iconst->ewaldcoeff_q*r))*rinv - int_bit*iconst->sh_ewald);
                                    }
                                }
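
                                /* Lennard-Jones with a plain cut-off; the energies are
                                 * potential-shifted using the precomputed shift constants.
                                 */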
                                if (rsq < rvdw2)
                                {
                                    tj        = nti + 2*type[ja];

                                    /* Vanilla Lennard-Jones cutoff */
                                    c6        = vdwparam[tj];
                                    c12       = vdwparam[tj+1];

                                    rinvsix   = int_bit*rinvsq*rinvsq*rinvsq;
                                    Vvdw_disp = c6*rinvsix;
                                    Vvdw_rep  = c12*rinvsix*rinvsix;
                                    fscal    += (Vvdw_rep - Vvdw_disp)*rinvsq;

                                    if (bEner)
                                    {
                                        vctot   += vcoul;

                                        Vvdwtot +=
                                            (Vvdw_rep  + int_bit*c12*iconst->repulsion_shift.cpot)/12 -
                                            (Vvdw_disp + int_bit*c6*iconst->dispersion_shift.cpot)/6;
                                    }
                                }
                                /* Accumulate the force on atom i and subtract it from atom j */
                                tx        = fscal*dx;
                                ty        = fscal*dy;
                                tz        = fscal*dz;
                                fix       = fix + tx;
                                fiy       = fiy + ty;
                                fiz       = fiz + tz;
                                f[jfs+0] -= tx;
                                f[jfs+1] -= ty;
                                f[jfs+2] -= tz;
                            }

                            f[ifs+0]      += fix;
                            f[ifs+1]      += fiy;
                            f[ifs+2]      += fiz;
                            fshift[ish3]   = fshift[ish3]   + fix;
                            fshift[ish3+1] = fshift[ish3+1] + fiy;
                            fshift[ish3+2] = fshift[ish3+2] + fiz;
                            /* Count in half work-units.
                             * In CUDA one work-unit is 2 warps.
                             */
                            if ((ic+1) % (c_clSize/c_nbnxnGpuClusterpairSplit) == 0)
                            {
                                npair_tot += npair;

                                nhwu++;
                                if (within_rlist)
                                {
                                    nhwu_pruned++;
                                }

                                within_rlist = FALSE;
                                npair        = 0;
                            }
                        }
                    }
                }
            }
        }
        if (bEner)
        {
            ggid       = 0;
            Vc[ggid]   = Vc[ggid]   + vctot;
            Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
        }
    }
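
    /* Debug statistics: how many half work-units and pairs were processed,
     * and how much of that work dynamic pruning would remove.
     */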
    if (debug)
    {
        fprintf(debug, "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
                nbl->na_ci, nbl->na_ci,
                nhwu, nhwu_pruned, nhwu_pruned/static_cast<double>(nhwu));
        fprintf(debug, "generic kernel pair interactions: %d\n",
                nhwu*nbl->na_ci/2*nbl->na_ci);
        fprintf(debug, "generic kernel post-prune pair interactions: %d\n",
                nhwu_pruned*nbl->na_ci/2*nbl->na_ci);
        fprintf(debug, "generic kernel non-zero pair interactions: %d\n",
                npair_tot);
        fprintf(debug, "ratio non-zero/post-prune pair interactions: %4.2f\n",
                npair_tot/static_cast<double>(nhwu_pruned*gmx::exactDiv(nbl->na_ci, 2)*nbl->na_ci));
    }
}