 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/mdlib/enerdata_utils.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/mdatom.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/nbnxm_simd.h"
#include "gromacs/nbnxm/kernels_reference/kernel_gpu_ref.h"
#include "gromacs/simd/simd.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/real.h"

#include "kernel_common.h"
#include "pairlistset.h"
#include "pairlistsets.h"

#define INCLUDE_KERNELFUNCTION_TABLES
#include "gromacs/nbnxm/kernels_reference/kernel_ref.h"
#ifdef GMX_NBNXN_SIMD_2XNN
#include "gromacs/nbnxm/kernels_simd_2xmm/kernels.h"
#endif
#ifdef GMX_NBNXN_SIMD_4XN
#include "gromacs/nbnxm/kernels_simd_4xm/kernels.h"
#endif
#undef INCLUDE_KERNELFUNCTION_TABLES
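
/* With INCLUDE_KERNELFUNCTION_TABLES defined, the kernel headers above also
 * provide the function-pointer lookup tables (e.g. nbnxn_kernel_noener_ref),
 * indexed by Coulomb and VdW kernel type, that the dispatch code below uses.
 */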

/*! \brief Clears the energy group output buffers
 *
 * \param[in,out] out  nbnxn kernel output struct
 */
static void clearGroupEnergies(nbnxn_atomdata_output_t *out)
{
    std::fill(out->Vvdw.begin(), out->Vvdw.end(), 0.0_real);
    std::fill(out->Vc.begin(), out->Vc.end(), 0.0_real);
    std::fill(out->VSvdw.begin(), out->VSvdw.end(), 0.0_real);
    std::fill(out->VSc.begin(), out->VSc.end(), 0.0_real);
}
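
/* Vvdw and Vc hold the per group-pair energies, while VSvdw and VSc are the
 * packed buffers that the SIMD kernels accumulate into; the latter are folded
 * back into Vvdw/Vc by reduceGroupEnergySimdBuffers() below.
 */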

/*! \brief Reduce the group-pair energy buffers produced by a SIMD kernel
 * to single terms in the output buffers.
 *
 * The SIMD kernels produce a large number of energy buffers in SIMD registers
 * to avoid scattered reads and writes.
 *
 * \tparam        unrollj         The unroll size for j-particles in the SIMD kernel
 * \param[in]     numGroups       The number of energy groups
 * \param[in]     numGroups_2log  Log2 of numGroups, rounded up
 * \param[in,out] out             Struct with energy buffers
 */
template <int unrollj> static void
reduceGroupEnergySimdBuffers(int                      numGroups,
                             int                      numGroups_2log,
                             nbnxn_atomdata_output_t *out)
{
    const int unrollj_half = unrollj/2;
    /* Energies are stored in SIMD registers with size 2^numGroups_2log */
    const int numGroupsStorage = (1 << numGroups_2log);

    const real * gmx_restrict vVdwSimd     = out->VSvdw.data();
    const real * gmx_restrict vCoulombSimd = out->VSc.data();
    real * gmx_restrict       vVdw         = out->Vvdw.data();
    real * gmx_restrict       vCoulomb     = out->Vc.data();

    /* The size of the SIMD energy group buffer array is:
     * numGroups*numGroups*numGroupsStorage*unrollj_half*simd_width
     */
    for (int i = 0; i < numGroups; i++)
    {
        for (int j1 = 0; j1 < numGroups; j1++)
        {
            for (int j0 = 0; j0 < numGroups; j0++)
            {
                int c = ((i*numGroups + j1)*numGroupsStorage + j0)*unrollj_half*unrollj;
                for (int s = 0; s < unrollj_half; s++)
                {
                    vVdw    [i*numGroups + j0] += vVdwSimd    [c + 0];
                    vVdw    [i*numGroups + j1] += vVdwSimd    [c + 1];
                    vCoulomb[i*numGroups + j0] += vCoulombSimd[c + 0];
                    vCoulomb[i*numGroups + j1] += vCoulombSimd[c + 1];
                    c                          += unrollj + 2;
                }
            }
        }
    }
}
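
/* Worked example of the indexing above (illustrative numbers only): with
 * numGroups = 2, numGroups_2log = 1 and unrollj = 4, numGroupsStorage = 2 and
 * the block for (i = 0, j1 = 1, j0 = 0) starts at
 * c = ((0*2 + 1)*2 + 0)*2*4 = 16; each of the unrollj_half = 2 iterations then
 * adds one element for group pair (0,0) and one for group pair (0,1).
 */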

/*! \brief Dispatches the non-bonded N versus M atom cluster CPU kernels.
 *
 * OpenMP parallelization is performed within this function.
 * Energy reduction, but not force and shift force reduction, is performed
 * within this function.
 *
 * \param[in]     pairlistSet   Pairlists with local or non-local interactions to compute
 * \param[in]     kernelSetup   The non-bonded kernel setup
 * \param[in,out] nbat          The atomdata for the interactions
 * \param[in]     ic            Non-bonded interaction constants
 * \param[in]     shiftVectors  The PBC shift vectors
 * \param[in]     forceFlags    Flags that tell what to compute
 * \param[in]     clearF        Enum that tells whether to clear the force output buffer
 * \param[out]    vCoulomb      Output buffer for Coulomb energies
 * \param[out]    vVdw          Output buffer for Van der Waals energies
 * \param[in]     wcycle        Pointer to cycle counting data structure.
 */
static void
nbnxn_kernel_cpu(const PairlistSet         &pairlistSet,
                 const Nbnxm::KernelSetup  &kernelSetup,
                 nbnxn_atomdata_t          *nbat,
                 const interaction_const_t &ic,
                 rvec                      *shiftVectors,
                 const int                  forceFlags,
                 int                        clearF,
                 real                      *vCoulomb,
                 real                      *vVdw,
                 gmx_wallcycle             *wcycle)
{
    int coulkt;
    if (EEL_RF(ic.eeltype) || ic.eeltype == eelCUT)
    {
        coulkt = coulktRF;
    }
    else
    {
        if (kernelSetup.ewaldExclusionType == Nbnxm::EwaldExclusionType::Table)
        {
            if (ic.rcoulomb == ic.rvdw)
            {
                coulkt = coulktTAB;
            }
            else
            {
                coulkt = coulktTAB_TWIN;
            }
        }
        else
        {
            if (ic.rcoulomb == ic.rvdw)
            {
                coulkt = coulktEWALD;
            }
            else
            {
                coulkt = coulktEWALD_TWIN;
            }
        }
    }
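
    /* coulkt and vdwkt (set below) index the kernel function-pointer tables
     * included at the top of this file; together they select the specialized
     * kernel for the requested electrostatics and Van der Waals treatment.
     */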
    const nbnxn_atomdata_t::Params &nbatParams = nbat->params();

    int vdwkt = 0;
    if (ic.vdwtype == evdwCUT)
    {
        switch (ic.vdw_modifier)
        {
            case eintmodPOTSHIFT:
                switch (nbatParams.comb_rule)
                {
                    case ljcrGEOM: vdwkt = vdwktLJCUT_COMBGEOM; break;
                    case ljcrLB:   vdwkt = vdwktLJCUT_COMBLB;   break;
                    case ljcrNONE: vdwkt = vdwktLJCUT_COMBNONE; break;
                    default:
                        GMX_RELEASE_ASSERT(false, "Unknown combination rule");
                }
                break;
            case eintmodFORCESWITCH:
                vdwkt = vdwktLJFORCESWITCH;
                break;
            case eintmodPOTSWITCH:
                vdwkt = vdwktLJPOTSWITCH;
                break;
            default:
                GMX_RELEASE_ASSERT(false, "Unsupported VdW interaction modifier");
        }
    }
    else if (ic.vdwtype == evdwPME)
    {
        if (ic.ljpme_comb_rule == eljpmeGEOM)
        {
            vdwkt = vdwktLJEWALDCOMBGEOM;
        }
        else
        {
            vdwkt = vdwktLJEWALDCOMBLB;
            /* At setup we (should have) selected the C reference kernel */
            GMX_RELEASE_ASSERT(kernelSetup.kernelType == Nbnxm::KernelType::Cpu4x4_PlainC,
                               "Only the C reference nbnxn SIMD kernel supports LJ-PME with LB combination rules");
        }
    }
    else
    {
        GMX_RELEASE_ASSERT(false, "Unsupported VdW interaction type");
    }

    gmx::ArrayRef<const NbnxnPairlistCpu> pairlists = pairlistSet.cpuLists();

    int gmx_unused nthreads = gmx_omp_nthreads_get(emntNonbonded);
    wallcycle_sub_start(wcycle, ewcsNONBONDED_CLEAR);
#pragma omp parallel for schedule(static) num_threads(nthreads)
    for (int nb = 0; nb < pairlists.ssize(); nb++)
    {
        // Presently, the kernels do not call C++ code that can throw,
        // so no need for a try/catch pair in this OpenMP region.
        nbnxn_atomdata_output_t *out = &nbat->out[nb];

        if (clearF == enbvClearFYes)
        {
            clearForceBuffer(nbat, nb);

            clear_fshift(out->fshift.data());
        }

        if (nb == 0)
        {
            wallcycle_sub_stop(wcycle, ewcsNONBONDED_CLEAR);
            wallcycle_sub_start(wcycle, ewcsNONBONDED_KERNEL);
        }

        // TODO: Change to reference
        const NbnxnPairlistCpu *pairlist = &pairlists[nb];

        if (!(forceFlags & GMX_FORCE_ENERGY))
        {
            /* Don't calculate energies */
            switch (kernelSetup.kernelType)
            {
                case Nbnxm::KernelType::Cpu4x4_PlainC:
                    nbnxn_kernel_noener_ref[coulkt][vdwkt](pairlist, nbat,
                                                           &ic,
                                                           shiftVectors,
                                                           out->f.data(),
                                                           out->fshift.data());
                    break;
#ifdef GMX_NBNXN_SIMD_2XNN
                case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
                    nbnxm_kernel_noener_simd_2xmm[coulkt][vdwkt](pairlist, nbat,
                                                                 &ic,
                                                                 shiftVectors,
                                                                 out->f.data(),
                                                                 out->fshift.data());
                    break;
#endif
#ifdef GMX_NBNXN_SIMD_4XN
                case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
                    nbnxm_kernel_noener_simd_4xm[coulkt][vdwkt](pairlist, nbat,
                                                                &ic,
                                                                shiftVectors,
                                                                out->f.data(),
                                                                out->fshift.data());
                    break;
#endif
                default:
                    GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
            }
        }
        else if (out->Vvdw.size() == 1)
        {
            /* A single energy group (pair) */
            out->Vvdw[0] = 0;
            out->Vc[0]   = 0;

            switch (kernelSetup.kernelType)
            {
                case Nbnxm::KernelType::Cpu4x4_PlainC:
                    nbnxn_kernel_ener_ref[coulkt][vdwkt](pairlist, nbat,
                                                         &ic,
                                                         shiftVectors,
                                                         out->f.data(),
                                                         out->fshift.data(),
                                                         out->Vvdw.data(),
                                                         out->Vc.data());
                    break;
#ifdef GMX_NBNXN_SIMD_2XNN
                case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
                    nbnxm_kernel_ener_simd_2xmm[coulkt][vdwkt](pairlist, nbat,
                                                               &ic,
                                                               shiftVectors,
                                                               out->f.data(),
                                                               out->fshift.data(),
                                                               out->Vvdw.data(),
                                                               out->Vc.data());
                    break;
#endif
#ifdef GMX_NBNXN_SIMD_4XN
                case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
                    nbnxm_kernel_ener_simd_4xm[coulkt][vdwkt](pairlist, nbat,
                                                              &ic,
                                                              shiftVectors,
                                                              out->f.data(),
                                                              out->fshift.data(),
                                                              out->Vvdw.data(),
                                                              out->Vc.data());
                    break;
#endif
                default:
                    GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
            }
        }
        else
        {
            /* Calculate energy group contributions */
            clearGroupEnergies(out);

            int unrollj = 0;

            switch (kernelSetup.kernelType)
            {
                case Nbnxm::KernelType::Cpu4x4_PlainC:
                    unrollj = c_nbnxnCpuIClusterSize;
                    nbnxn_kernel_energrp_ref[coulkt][vdwkt](pairlist, nbat,
                                                            &ic,
                                                            shiftVectors,
                                                            out->f.data(),
                                                            out->fshift.data(),
                                                            out->Vvdw.data(),
                                                            out->Vc.data());
                    break;
#ifdef GMX_NBNXN_SIMD_2XNN
                case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
                    unrollj = GMX_SIMD_REAL_WIDTH/2;
                    nbnxm_kernel_energrp_simd_2xmm[coulkt][vdwkt](pairlist, nbat,
                                                                  &ic,
                                                                  shiftVectors,
                                                                  out->f.data(),
                                                                  out->fshift.data(),
                                                                  out->VSvdw.data(),
                                                                  out->VSc.data());
                    break;
#endif
#ifdef GMX_NBNXN_SIMD_4XN
                case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
                    unrollj = GMX_SIMD_REAL_WIDTH;
                    nbnxm_kernel_energrp_simd_4xm[coulkt][vdwkt](pairlist, nbat,
                                                                 &ic,
                                                                 shiftVectors,
                                                                 out->f.data(),
                                                                 out->fshift.data(),
                                                                 out->VSvdw.data(),
                                                                 out->VSc.data());
                    break;
#endif
                default:
                    GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
            }
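
            /* The plain C energy-group kernel accumulates directly into
             * out->Vvdw/out->Vc, so only the SIMD kernels, which wrote into
             * the packed VSvdw/VSc buffers, need the reduction below.
             */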
            if (kernelSetup.kernelType != Nbnxm::KernelType::Cpu4x4_PlainC)
            {
                switch (unrollj)
                {
                    case 2:
                        reduceGroupEnergySimdBuffers<2>(nbatParams.nenergrp,
                                                        nbatParams.neg_2log,
                                                        out);
                        break;
                    case 4:
                        reduceGroupEnergySimdBuffers<4>(nbatParams.nenergrp,
                                                        nbatParams.neg_2log,
                                                        out);
                        break;
                    case 8:
                        reduceGroupEnergySimdBuffers<8>(nbatParams.nenergrp,
                                                        nbatParams.neg_2log,
                                                        out);
                        break;
                    default:
                        GMX_RELEASE_ASSERT(false, "Unsupported j-unroll size");
                }
            }
        }
    }
    wallcycle_sub_stop(wcycle, ewcsNONBONDED_KERNEL);

    if (forceFlags & GMX_FORCE_ENERGY)
    {
        reduce_energies_over_lists(nbat, pairlists.ssize(), vVdw, vCoulomb);
    }
}

static void accountFlops(t_nrnb                    *nrnb,
                         const PairlistSet         &pairlistSet,
                         const nonbonded_verlet_t  &nbv,
                         const interaction_const_t &ic,
                         const int                  forceFlags)
{
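    /* Pick the flop counter that matches the electrostatics treatment used by
     * the kernels, then charge the atom-pair counts stored on the pair lists.
     */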
    const bool usingGpuKernels = nbv.useGpu();

    int enr_nbnxn_kernel_ljc;
    if (EEL_RF(ic.eeltype) || ic.eeltype == eelCUT)
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
    }
    else if ((!usingGpuKernels && nbv.kernelSetup().ewaldExclusionType == Nbnxm::EwaldExclusionType::Analytical) ||
             (usingGpuKernels && Nbnxm::gpu_is_kernel_ewald_analytical(nbv.gpu_nbv)))
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
    }
    else
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
    }

    int enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
    if (forceFlags & GMX_FORCE_ENERGY)
    {
        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
        enr_nbnxn_kernel_ljc += 1;
        enr_nbnxn_kernel_lj  += 1;
    }

    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
             pairlistSet.natpair_ljq_);
    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
             pairlistSet.natpair_lj_);
    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc - eNR_NBNXN_LJ_RF + eNR_NBNXN_RF,
             pairlistSet.natpair_q_);

    const bool calcEnergy = ((forceFlags & GMX_FORCE_ENERGY) != 0);
    if (ic.vdw_modifier == eintmodFORCESWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW + (calcEnergy ? 1 : 0),
                 pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
    }
    if (ic.vdw_modifier == eintmodPOTSWITCH)
    {
        /* We add up the switch cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW + (calcEnergy ? 1 : 0),
                 pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
    }
    if (ic.vdwtype == evdwPME)
    {
        /* We add up the LJ Ewald cost separately */
        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD + (calcEnergy ? 1 : 0),
                 pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
    }
}

void
nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality  iLocality,
                                            const interaction_const_t  &ic,
                                            const int                   forceFlags,
                                            int                         clearF,
                                            const t_forcerec           &fr,
                                            gmx_enerdata_t             *enerd,
                                            t_nrnb                     *nrnb)
{
    const PairlistSet &pairlistSet = pairlistSets().pairlistSet(iLocality);

    switch (kernelSetup().kernelType)
    {
        case Nbnxm::KernelType::Cpu4x4_PlainC:
        case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
        case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
            nbnxn_kernel_cpu(pairlistSet,
                             kernelSetup(),
                             nbat.get(),
                             ic,
                             fr.shift_vec,
                             forceFlags,
                             clearF,
                             enerd->grpp.ener[egCOULSR].data(),
                             fr.bBHAM ?
                             enerd->grpp.ener[egBHAMSR].data() :
                             enerd->grpp.ener[egLJSR].data(),
                             wcycle_);
            break;

        case Nbnxm::KernelType::Gpu8x8x8:
            Nbnxm::gpu_launch_kernel(gpu_nbv, forceFlags, iLocality);
            break;
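
        /* The GPU kernel launch above is asynchronous; forces and energies are
         * reduced only after the results have been copied back from the device.
         */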
        case Nbnxm::KernelType::Cpu8x8x8_PlainC:
            nbnxn_kernel_gpu_ref(pairlistSet.gpuList(),
                                 nbat.get(), &ic,
                                 fr.shift_vec,
                                 forceFlags,
                                 clearF,
                                 nbat->out[0].f,
                                 nbat->out[0].fshift.data(),
                                 enerd->grpp.ener[egCOULSR].data(),
                                 fr.bBHAM ?
                                 enerd->grpp.ener[egBHAMSR].data() :
                                 enerd->grpp.ener[egLJSR].data());
            break;

        default:
            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
    }

    accountFlops(nrnb, pairlistSet, *this, ic, forceFlags);
}

void
nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality  iLocality,
                                             const t_forcerec           *fr,
                                             rvec                        x[],
                                             rvec                        f[],
                                             const t_mdatoms            &mdatoms,
                                             t_lambda                   *fepvals,
                                             real                       *lambda,
                                             gmx_enerdata_t             *enerd,
                                             const int                   forceFlags,
                                             t_nrnb                     *nrnb)
{
    const auto nbl_fep = pairlistSets().pairlistSet(iLocality).fepLists();

    /* When the first list is empty, all are empty and there is nothing to do */
    if (!pairlistSets().params().haveFep || nbl_fep[0]->nrj == 0)
    {
        return;
    }

    int donb_flags = 0;
    /* Add short-range interactions */
    donb_flags |= GMX_NONBONDED_DO_SR;

    /* Currently all group scheme kernels always calculate (shift-)forces */
    if (forceFlags & GMX_FORCE_FORCES)
    {
        donb_flags |= GMX_NONBONDED_DO_FORCE;
    }
    if (forceFlags & GMX_FORCE_VIRIAL)
    {
        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
    }
    if (forceFlags & GMX_FORCE_ENERGY)
    {
        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
    }

    nb_kernel_data_t kernel_data;
    real             dvdl_nb[efptNR] = { 0 };
    kernel_data.flags  = donb_flags;
    kernel_data.lambda = lambda;
    kernel_data.dvdl   = dvdl_nb;

    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR].data();
    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR].data();
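
    /* kernel_data bundles the per-call parameters for the perturbed-pair
     * kernel: the task flags, the current lambda values, and the buffers that
     * receive dH/dlambda and the per-group energies.
     */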

    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_fep.ssize(),
               "Number of lists should be same as number of NB threads");

    wallcycle_sub_start(wcycle_, ewcsNONBONDED_FEP);
#pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
    for (int th = 0; th < nbl_fep.ssize(); th++)
    {
        try
        {
            gmx_nb_free_energy_kernel(nbl_fep[th].get(),
                                      x, f, fr, &mdatoms, &kernel_data, nrnb);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (fepvals->sc_alpha != 0)
    {
        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
    }
    else
    {
        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
    }
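
    /* With soft-core (sc_alpha != 0) the potential depends non-linearly on
     * lambda, so the kernel's dH/dlambda contribution is accumulated in
     * dvdl_nonlin; without soft-core the dependence is linear and it goes
     * into dvdl_lin.
     */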

    /* If we do foreign lambda and we have soft-core interactions
     * we have to recalculate the (non-linear) energies contributions.
     */
    if (fepvals->n_lambda > 0 && (forceFlags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
    {
        real lam_i[efptNR];
        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
        kernel_data.lambda         = lam_i;
        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR].data();
        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR].data();
        /* Note that we add to kernel_data.dvdl, but ignore the result */

        for (size_t i = 0; i < enerd->enerpart_lambda.size(); i++)
        {
            for (int j = 0; j < efptNR; j++)
            {
                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
            }
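            /* Index 0 of enerpart_lambda corresponds to the current lambda
             * state; the remaining entries are the foreign lambdas taken from
             * all_lambda, hence the i-1 offset above.
             */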
            reset_foreign_enerdata(enerd);
#pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
            for (int th = 0; th < nbl_fep.ssize(); th++)
            {
                try
                {
                    gmx_nb_free_energy_kernel(nbl_fep[th].get(),
                                              x, f, fr, &mdatoms, &kernel_data, nrnb);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }

            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
        }
    }
    wallcycle_sub_stop(wcycle_, ewcsNONBONDED_FEP);
}