2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Define common implementation of nbnxm_gpu_data_mgmt.h
39 * \author Anca Hamuraru <anca@streamcomputing.eu>
40 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41 * \author Teemu Virolainen <teemu@streamcomputing.eu>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \author Artem Zhmurov <zhmurov@gmail.com>
45 * \ingroup module_nbnxm
52 # include "cuda/nbnxm_cuda_types.h"
56 # include "opencl/nbnxm_ocl_types.h"
59 #include "nbnxm_gpu_data_mgmt.h"
61 #include "gromacs/mdtypes/interaction_const.h"
62 #include "gromacs/nbnxm/gpu_data_mgmt.h"
63 #include "gromacs/timing/gpu_timing.h"
64 #include "gromacs/utility/cstringutil.h"
66 #include "nbnxm_gpu.h"
67 #include "pairlistsets.h"
72 void init_ewald_coulomb_force_table(const EwaldCorrectionTables
& tables
,
74 const DeviceContext
& deviceContext
)
78 destroyParamLookupTable(&nbp
->coulomb_tab
, nbp
->coulomb_tab_texobj
);
81 nbp
->coulomb_tab_scale
= tables
.scale
;
82 initParamLookupTable(&nbp
->coulomb_tab
, &nbp
->coulomb_tab_texobj
, tables
.tableF
.data(),
83 tables
.tableF
.size(), deviceContext
);
/* Prints a deprecation warning for a legacy GPU environment variable.
 *
 * isEnvironmentVariableSet   when false the function does nothing
 * environmentVariableSuffix  the part of the variable name after the
 *                            GMX_CUDA_ / GMX_OCL_ / GMX_GPU_ prefix
 *
 * The warning goes to stderr and is newline-terminated, so that several
 * warnings issued back to back each appear on their own line.
 */
inline void printEnvironmentVariableDeprecationMessage(bool               isEnvironmentVariableSet,
                                                       const std::string& environmentVariableSuffix)
{
    if (!isEnvironmentVariableSet)
    {
        return;
    }

    const char* suffix = environmentVariableSuffix.c_str();
    fprintf(stderr,
            "Environment variables GMX_CUDA_%s and GMX_OCL_%s are deprecated and will be\n"
            "removed in release 2022, please use GMX_GPU_%s instead.\n",
            suffix, suffix, suffix);
}
99 enum ElecType
nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t
& ic
)
101 bool bTwinCut
= (ic
.rcoulomb
!= ic
.rvdw
);
103 /* Benchmarking/development environment variables to force the use of
104 analytical or tabulated Ewald kernel. */
106 // Remove these when old environment variables are deprecated
107 const bool forceAnalyticalEwaldLegacy
= (getenv("GMX_CUDA_NB_ANA_EWALD") != nullptr)
108 || (getenv("GMX_OCL_NB_ANA_EWALD") != nullptr);
109 const bool forceTabulatedEwaldLegacy
= (getenv("GMX_CUDA_NB_TAB_EWALD") != nullptr)
110 || (getenv("GMX_OCL_NB_TAB_EWALD") != nullptr);
111 const bool forceTwinCutoffEwaldLegacy
= (getenv("GMX_CUDA_NB_EWALD_TWINCUT") != nullptr)
112 || (getenv("GMX_OCL_NB_EWALD_TWINCUT") != nullptr);
114 printEnvironmentVariableDeprecationMessage(forceAnalyticalEwaldLegacy
, "NB_ANA_EWALD");
115 printEnvironmentVariableDeprecationMessage(forceTabulatedEwaldLegacy
, "NB_TAB_EWALD");
116 printEnvironmentVariableDeprecationMessage(forceTwinCutoffEwaldLegacy
, "NB_EWALD_TWINCUT");
118 const bool forceAnalyticalEwald
=
119 (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr) || forceAnalyticalEwaldLegacy
;
120 const bool forceTabulatedEwald
=
121 (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr) || forceTabulatedEwaldLegacy
;
122 const bool forceTwinCutoffEwald
=
123 (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr) || forceTwinCutoffEwaldLegacy
;
125 if (forceAnalyticalEwald
&& forceTabulatedEwald
)
128 "Both analytical and tabulated Ewald GPU non-bonded kernels "
129 "requested through environment variables.");
132 /* By default, use analytical Ewald
133 * TODO: tabulated does not work in OpenCL, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
136 bool bUseAnalyticalEwald
= true;
137 if (forceAnalyticalEwald
)
141 fprintf(debug
, "Using analytical Ewald GPU kernels\n");
144 else if (forceTabulatedEwald
)
146 bUseAnalyticalEwald
= false;
150 fprintf(debug
, "Using tabulated Ewald GPU kernels\n");
154 /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
155 forces it (use it for debugging/benchmarking only). */
156 if (!bTwinCut
&& !forceTwinCutoffEwald
)
158 return bUseAnalyticalEwald
? ElecType::EwaldAna
: ElecType::EwaldTab
;
162 return bUseAnalyticalEwald
? ElecType::EwaldAnaTwin
: ElecType::EwaldTabTwin
;
166 void set_cutoff_parameters(NBParamGpu
* nbp
, const interaction_const_t
* ic
, const PairlistParams
& listParams
)
168 nbp
->ewald_beta
= ic
->ewaldcoeff_q
;
169 nbp
->sh_ewald
= ic
->sh_ewald
;
170 nbp
->epsfac
= ic
->epsfac
;
171 nbp
->two_k_rf
= 2.0 * ic
->k_rf
;
172 nbp
->c_rf
= ic
->c_rf
;
173 nbp
->rvdw_sq
= ic
->rvdw
* ic
->rvdw
;
174 nbp
->rcoulomb_sq
= ic
->rcoulomb
* ic
->rcoulomb
;
175 nbp
->rlistOuter_sq
= listParams
.rlistOuter
* listParams
.rlistOuter
;
176 nbp
->rlistInner_sq
= listParams
.rlistInner
* listParams
.rlistInner
;
177 nbp
->useDynamicPruning
= listParams
.useDynamicPruning
;
179 nbp
->sh_lj_ewald
= ic
->sh_lj_ewald
;
180 nbp
->ewaldcoeff_lj
= ic
->ewaldcoeff_lj
;
182 nbp
->rvdw_switch
= ic
->rvdw_switch
;
183 nbp
->dispersion_shift
= ic
->dispersion_shift
;
184 nbp
->repulsion_shift
= ic
->repulsion_shift
;
185 nbp
->vdw_switch
= ic
->vdw_switch
;
188 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t
* nbv
, const interaction_const_t
* ic
)
190 if (!nbv
|| !nbv
->useGpu())
194 NbnxmGpu
* nb
= nbv
->gpu_nbv
;
195 NBParamGpu
* nbp
= nb
->nbparam
;
197 set_cutoff_parameters(nbp
, ic
, nbv
->pairlistSets().params());
199 nbp
->elecType
= nbnxn_gpu_pick_ewald_kernel_type(*ic
);
201 GMX_RELEASE_ASSERT(ic
->coulombEwaldTables
, "Need valid Coulomb Ewald correction tables");
202 init_ewald_coulomb_force_table(*ic
->coulombEwaldTables
, nbp
, *nb
->deviceContext_
);
205 void init_plist(gpu_plist
* pl
)
207 /* initialize to nullptr pointers to data that is not allocated here and will
208 need reallocation in nbnxn_gpu_init_pairlist */
214 /* size -1 indicates that the respective array hasn't been initialized yet */
221 pl
->imask_nalloc
= -1;
223 pl
->excl_nalloc
= -1;
224 pl
->haveFreshList
= false;
227 void init_timings(gmx_wallclock_gpu_nbnxn_t
* t
)
236 for (i
= 0; i
< 2; i
++)
238 for (j
= 0; j
< 2; j
++)
240 t
->ktime
[i
][j
].t
= 0.0;
241 t
->ktime
[i
][j
].c
= 0;
245 t
->pruneTime
.t
= 0.0;
246 t
->dynamicPruneTime
.c
= 0;
247 t
->dynamicPruneTime
.t
= 0.0;
250 //! This function is documented in the header file
251 void gpu_init_pairlist(NbnxmGpu
* nb
, const NbnxnPairlistGpu
* h_plist
, const InteractionLocality iloc
)
254 // Timing accumulation should happen only if there was work to do
255 // because getLastRangeTime() gets skipped with empty lists later
256 // which leads to the counter not being reset.
257 bool bDoTime
= (nb
->bDoTime
&& !h_plist
->sci
.empty());
258 const DeviceStream
& deviceStream
= *nb
->deviceStreams
[iloc
];
259 gpu_plist
* d_plist
= nb
->plist
[iloc
];
261 if (d_plist
->na_c
< 0)
263 d_plist
->na_c
= h_plist
->na_ci
;
267 if (d_plist
->na_c
!= h_plist
->na_ci
)
269 sprintf(sbuf
, "In init_plist: the #atoms per cell has changed (from %d to %d)",
270 d_plist
->na_c
, h_plist
->na_ci
);
275 gpu_timers_t::Interaction
& iTimers
= nb
->timers
->interaction
[iloc
];
279 iTimers
.pl_h2d
.openTimingRegion(deviceStream
);
280 iTimers
.didPairlistH2D
= true;
283 // TODO most of this function is same in CUDA and OpenCL, move into the header
284 const DeviceContext
& deviceContext
= *nb
->deviceContext_
;
286 reallocateDeviceBuffer(&d_plist
->sci
, h_plist
->sci
.size(), &d_plist
->nsci
, &d_plist
->sci_nalloc
,
288 copyToDeviceBuffer(&d_plist
->sci
, h_plist
->sci
.data(), 0, h_plist
->sci
.size(), deviceStream
,
289 GpuApiCallBehavior::Async
, bDoTime
? iTimers
.pl_h2d
.fetchNextEvent() : nullptr);
291 reallocateDeviceBuffer(&d_plist
->cj4
, h_plist
->cj4
.size(), &d_plist
->ncj4
, &d_plist
->cj4_nalloc
,
293 copyToDeviceBuffer(&d_plist
->cj4
, h_plist
->cj4
.data(), 0, h_plist
->cj4
.size(), deviceStream
,
294 GpuApiCallBehavior::Async
, bDoTime
? iTimers
.pl_h2d
.fetchNextEvent() : nullptr);
296 reallocateDeviceBuffer(&d_plist
->imask
, h_plist
->cj4
.size() * c_nbnxnGpuClusterpairSplit
,
297 &d_plist
->nimask
, &d_plist
->imask_nalloc
, deviceContext
);
299 reallocateDeviceBuffer(&d_plist
->excl
, h_plist
->excl
.size(), &d_plist
->nexcl
,
300 &d_plist
->excl_nalloc
, deviceContext
);
301 copyToDeviceBuffer(&d_plist
->excl
, h_plist
->excl
.data(), 0, h_plist
->excl
.size(), deviceStream
,
302 GpuApiCallBehavior::Async
, bDoTime
? iTimers
.pl_h2d
.fetchNextEvent() : nullptr);
306 iTimers
.pl_h2d
.closeTimingRegion(deviceStream
);
309 /* need to prune the pair list during the next step */
310 d_plist
->haveFreshList
= true;
313 //! This function is documented in the header file
314 gmx_wallclock_gpu_nbnxn_t
* gpu_get_timings(NbnxmGpu
* nb
)
316 return (nb
!= nullptr && nb
->bDoTime
) ? nb
->timings
: nullptr;
319 //! This function is documented in the header file
320 void gpu_reset_timings(nonbonded_verlet_t
* nbv
)
322 if (nbv
->gpu_nbv
&& nbv
->gpu_nbv
->bDoTime
)
324 init_timings(nbv
->gpu_nbv
->timings
);
328 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu
* nb
)
330 return ((nb
->nbparam
->elecType
== ElecType::EwaldAna
)
331 || (nb
->nbparam
->elecType
== ElecType::EwaldAnaTwin
));
334 enum ElecType
nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t
* ic
)
336 if (ic
->eeltype
== eelCUT
)
338 return ElecType::Cut
;
340 else if (EEL_RF(ic
->eeltype
))
344 else if ((EEL_PME(ic
->eeltype
) || ic
->eeltype
== eelEWALD
))
346 return nbnxn_gpu_pick_ewald_kernel_type(*ic
);
350 /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
351 GMX_THROW(gmx::InconsistentInputError(
352 gmx::formatString("The requested electrostatics type %s (%d) is not implemented in "
353 "the GPU accelerated kernels!",
354 EELTYPE(ic
->eeltype
), ic
->eeltype
)));
359 enum VdwType
nbnxmGpuPickVdwKernelType(const interaction_const_t
* ic
, int combRule
)
361 if (ic
->vdwtype
== evdwCUT
)
363 switch (ic
->vdw_modifier
)
366 case eintmodPOTSHIFT
:
369 case ljcrNONE
: return VdwType::Cut
;
370 case ljcrGEOM
: return VdwType::CutCombGeom
;
371 case ljcrLB
: return VdwType::CutCombLB
;
373 GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
374 "The requested LJ combination rule %s (%d) is not implemented in "
375 "the GPU accelerated kernels!",
376 enum_name(combRule
, ljcrNR
, c_ljcrNames
), combRule
)));
378 case eintmodFORCESWITCH
: return VdwType::FSwitch
;
379 case eintmodPOTSWITCH
: return VdwType::PSwitch
;
381 GMX_THROW(gmx::InconsistentInputError(
382 gmx::formatString("The requested VdW interaction modifier %s (%d) is not "
383 "implemented in the GPU accelerated kernels!",
384 INTMODIFIER(ic
->vdw_modifier
), ic
->vdw_modifier
)));
387 else if (ic
->vdwtype
== evdwPME
)
389 if (ic
->ljpme_comb_rule
== ljcrGEOM
)
391 assert(combRule
== ljcrGEOM
);
392 return VdwType::EwaldGeom
;
396 assert(combRule
== ljcrLB
);
397 return VdwType::EwaldLB
;
402 GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
403 "The requested VdW type %s (%d) is not implemented in the GPU accelerated kernels!",
404 EVDWTYPE(ic
->vdwtype
), ic
->vdwtype
)));