2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Common functions for the different NBNXN GPU implementations.
38 * \author Szilard Pall <pall.szilard@gmail.com>
40 * \ingroup module_nbnxm
43 #ifndef GMX_NBNXM_GPU_COMMON_H
44 #define GMX_NBNXM_GPU_COMMON_H
50 #if GMX_GPU == GMX_GPU_CUDA
51 #include "cuda/nbnxm_cuda_types.h"
54 #if GMX_GPU == GMX_GPU_OPENCL
55 #include "opencl/nbnxm_ocl_types.h"
58 #include "gromacs/gpu_utils/gpu_utils.h"
59 #include "gromacs/listed_forces/gpubonded.h"
60 #include "gromacs/math/vec.h"
61 #include "gromacs/mdlib/force_flags.h"
62 #include "gromacs/nbnxm/nbnxm.h"
63 #include "gromacs/pbcutil/ishift.h"
64 #include "gromacs/timing/gpu_timing.h"
65 #include "gromacs/utility/fatalerror.h"
66 #include "gromacs/utility/stringutil.h"
68 #include "gpu_common_utils.h"
69 #include "nbnxm_gpu.h"
79 /*! \brief Check that atom locality values are valid for the GPU module.
81 * In the GPU module atom locality "all" is not supported, the local and
82 * non-local ranges are treated separately.
84 * \param[in] atomLocality atom locality specifier
87 validateGpuAtomLocality(const AtomLocality atomLocality
)
89 std::string str
= gmx::formatString("Invalid atom locality passed (%d); valid here is only "
90 "local (%d) or nonlocal (%d)",
91 static_cast<int>(atomLocality
),
92 static_cast<int>(AtomLocality::Local
),
93 static_cast<int>(AtomLocality::NonLocal
));
95 GMX_ASSERT(atomLocality
== AtomLocality::Local
|| atomLocality
== AtomLocality::NonLocal
, str
.c_str());
98 /*! \brief Convert atom locality to interaction locality.
100 * In the current implementation the this is straightforward conversion:
101 * local to local, non-local to non-local.
103 * \param[in] atomLocality Atom locality specifier
104 * \returns Interaction locality corresponding to the atom locality passed.
106 static inline InteractionLocality
107 gpuAtomToInteractionLocality(const AtomLocality atomLocality
)
109 validateGpuAtomLocality(atomLocality
);
111 /* determine interaction locality from atom locality */
112 if (atomLocality
== AtomLocality::Local
)
114 return InteractionLocality::Local
;
116 else if (atomLocality
== AtomLocality::NonLocal
)
118 return InteractionLocality::NonLocal
;
122 gmx_incons("Wrong locality");
128 setupGpuShortRangeWork(gmx_nbnxn_gpu_t
*nb
,
129 const gmx::GpuBonded
*gpuBonded
,
130 const Nbnxm::InteractionLocality iLocality
)
132 GMX_ASSERT(nb
, "Need a valid nbnxn_gpu object");
134 // There is short-range work if the pair list for the provided
135 // interaction locality contains entries or if there is any
136 // bonded work (as this is not split into local/nonlocal).
137 nb
->haveWork
[iLocality
] =
138 ((nb
->plist
[iLocality
]->nsci
!= 0) ||
139 (gpuBonded
!= nullptr && gpuBonded
->haveInteractions()));
142 /*! \brief Returns true if there is GPU short-range work for the given interaction locality.
144 * Note that as, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal,
145 * and therefore if there are GPU offloaded bonded interactions, this function will return
146 * true for all interaction localities.
148 * \param[inout] nb Pointer to the nonbonded GPU data structure
149 * \param[in] iLocality Interaction locality identifier
152 haveGpuShortRangeWork(const gmx_nbnxn_gpu_t
&nb
,
153 const Nbnxm::InteractionLocality iLocality
)
155 return nb
.haveWork
[iLocality
];
159 haveGpuShortRangeWork(const gmx_nbnxn_gpu_t
*nb
,
160 const Nbnxm::AtomLocality aLocality
)
162 GMX_ASSERT(nb
, "Need a valid nbnxn_gpu object");
164 return haveGpuShortRangeWork(*nb
, gpuAtomToInteractionLocality(aLocality
));
169 /*! \brief Calculate atom range and return start index and length.
171 * \param[in] atomData Atom descriptor data structure
172 * \param[in] atomLocality Atom locality specifier
173 * \param[out] atomRangeBegin Starting index of the atom range in the atom data array.
174 * \param[out] atomRangeLen Atom range length in the atom data array.
176 template <typename AtomDataT
>
178 getGpuAtomRange(const AtomDataT
*atomData
,
179 const AtomLocality atomLocality
,
184 validateGpuAtomLocality(atomLocality
);
186 /* calculate the atom data index range based on locality */
187 if (atomLocality
== AtomLocality::Local
)
190 *atomRangeLen
= atomData
->natoms_local
;
194 *atomRangeBegin
= atomData
->natoms_local
;
195 *atomRangeLen
= atomData
->natoms
- atomData
->natoms_local
;
200 /*! \brief Count pruning kernel time if either kernel has been triggered
202 * We do the accounting for either of the two pruning kernel flavors:
203 * - 1st pass prune: ran during the current step (prior to the force kernel);
204 * - rolling prune: ran at the end of the previous step (prior to the current step H2D xq);
206 * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
207 * after calling this function.
209 * \param[in] timers structs with GPU timer objects
210 * \param[inout] timings GPU task timing data
211 * \param[in] iloc interaction locality
213 template <typename GpuTimers
>
214 static void countPruneKernelTime(GpuTimers
*timers
,
215 gmx_wallclock_gpu_nbnxn_t
*timings
,
216 const InteractionLocality iloc
)
218 gpu_timers_t::Interaction
&iTimers
= timers
->interaction
[iloc
];
220 // We might have not done any pruning (e.g. if we skipped with empty domains).
221 if (!iTimers
.didPrune
&&
222 !iTimers
.didRollingPrune
)
227 if (iTimers
.didPrune
)
229 timings
->pruneTime
.c
++;
230 timings
->pruneTime
.t
+= iTimers
.prune_k
.getLastRangeTime();
233 if (iTimers
.didRollingPrune
)
235 timings
->dynamicPruneTime
.c
++;
236 timings
->dynamicPruneTime
.t
+= iTimers
.rollingPrune_k
.getLastRangeTime();
240 /*! \brief Reduce data staged internally in the nbnxn module.
242 * Shift forces and electrostatic/LJ energies copied from the GPU into
243 * a module-internal staging area are immediately reduced (CPU-side buffers passed)
244 * after having waited for the transfers' completion.
246 * Note that this function should always be called after the transfers into the
247 * staging buffers has completed.
249 * \tparam StagingData Type of staging data
250 * \param[in] nbst Nonbonded staging data
251 * \param[in] iLocality Interaction locality specifier
252 * \param[in] reduceEnergies True if energy reduction should be done
253 * \param[in] reduceFshift True if shift force reduction should be done
254 * \param[out] e_lj Variable to accumulate LJ energy into
255 * \param[out] e_el Variable to accumulate electrostatic energy into
256 * \param[out] fshift Pointer to the array of shift forces to accumulate into
258 template <typename StagingData
>
260 gpu_reduce_staged_outputs(const StagingData
&nbst
,
261 const InteractionLocality iLocality
,
262 const bool reduceEnergies
,
263 const bool reduceFshift
,
268 /* add up energies and shift forces (only once at local F wait) */
269 if (iLocality
== InteractionLocality::Local
)
279 for (int i
= 0; i
< SHIFTS
; i
++)
281 rvec_inc(fshift
[i
], nbst
.fshift
[i
]);
287 /*! \brief Do the per-step timing accounting of the nonbonded tasks.
289 * Does timing accumulation and call-count increments for the nonbonded kernels.
290 * Note that this function should be called after the current step's nonbonded
291 * nonbonded tasks have completed with the exception of the rolling pruning kernels
292 * that are accounted for during the following step.
294 * NOTE: if timing with multiple GPUs (streams) becomes possible, the
295 * counters could end up being inconsistent due to not being incremented
296 * on some of the node when this is skipped on empty local domains!
298 * \tparam GpuTimers GPU timers type
299 * \tparam GpuPairlist Pair list type
300 * \param[out] timings Pointer to the NB GPU timings data
301 * \param[in] timers Pointer to GPU timers data
302 * \param[in] plist Pointer to the pair list data
303 * \param[in] atomLocality Atom locality specifier
304 * \param[in] didEnergyKernels True if energy kernels have been called in the current step
305 * \param[in] doTiming True if timing is enabled.
308 template <typename GpuTimers
, typename GpuPairlist
>
310 gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t
*timings
,
312 const GpuPairlist
*plist
,
313 AtomLocality atomLocality
,
314 bool didEnergyKernels
,
317 /* timing data accumulation */
323 /* determine interaction locality from atom locality */
324 const InteractionLocality iLocality
= gpuAtomToInteractionLocality(atomLocality
);
326 /* only increase counter once (at local F wait) */
327 if (iLocality
== InteractionLocality::Local
)
330 timings
->ktime
[plist
->haveFreshList
? 1 : 0][didEnergyKernels
? 1 : 0].c
+= 1;
334 timings
->ktime
[plist
->haveFreshList
? 1 : 0][didEnergyKernels
? 1 : 0].t
+=
335 timers
->interaction
[iLocality
].nb_k
.getLastRangeTime();
337 /* X/q H2D and F D2H timings */
338 timings
->nb_h2d_t
+= timers
->xf
[atomLocality
].nb_h2d
.getLastRangeTime();
339 timings
->nb_d2h_t
+= timers
->xf
[atomLocality
].nb_d2h
.getLastRangeTime();
341 /* Count the pruning kernel times for both cases:1st pass (at search step)
342 and rolling pruning (if called at the previous step).
343 We do the accounting here as this is the only sync point where we
344 know (without checking or additional sync-ing) that prune tasks in
345 in the current stream have completed (having just blocking-waited
346 for the force D2H). */
347 countPruneKernelTime(timers
, timings
, iLocality
);
349 /* only count atdat and pair-list H2D at pair-search step */
350 if (timers
->interaction
[iLocality
].didPairlistH2D
)
352 /* atdat transfer timing (add only once, at local F wait) */
353 if (atomLocality
== AtomLocality::Local
)
356 timings
->pl_h2d_t
+= timers
->atdat
.getLastRangeTime();
359 timings
->pl_h2d_t
+= timers
->interaction
[iLocality
].pl_h2d
.getLastRangeTime();
361 /* Clear the timing flag for the next step */
362 timers
->interaction
[iLocality
].didPairlistH2D
= false;
366 //TODO: move into shared source file with gmx_compile_cpp_as_cuda
367 //NOLINTNEXTLINE(misc-definitions-in-headers)
368 bool gpu_try_finish_task(gmx_nbnxn_gpu_t
*nb
,
370 const AtomLocality aloc
,
374 GpuTaskCompletion completionKind
)
376 GMX_ASSERT(nb
, "Need a valid nbnxn_gpu object");
378 /* determine interaction locality from atom locality */
379 const InteractionLocality iLocality
= gpuAtomToInteractionLocality(aloc
);
381 // We skip when during the non-local phase there was actually no work to do.
382 // This is consistent with nbnxn_gpu_launch_kernel but it also considers possible
384 if ((iLocality
== InteractionLocality::Local
) || haveGpuShortRangeWork(*nb
, iLocality
))
386 // Query the state of the GPU stream and return early if we're not done
387 if (completionKind
== GpuTaskCompletion::Check
)
389 if (!haveStreamTasksCompleted(nb
->stream
[iLocality
]))
391 // Early return to skip the steps below that we have to do only
392 // after the NB task completed
398 gpuStreamSynchronize(nb
->stream
[iLocality
]);
401 bool calcEner
= (flags
& GMX_FORCE_ENERGY
) != 0;
402 bool calcFshift
= (flags
& GMX_FORCE_VIRIAL
) != 0;
404 gpu_accumulate_timings(nb
->timings
, nb
->timers
, nb
->plist
[iLocality
], aloc
, calcEner
,
407 gpu_reduce_staged_outputs(nb
->nbst
, iLocality
, calcEner
, calcFshift
, e_lj
, e_el
, fshift
);
410 /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
411 nb
->timers
->interaction
[iLocality
].didPrune
= nb
->timers
->interaction
[iLocality
].didRollingPrune
= false;
413 /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
414 nb
->plist
[iLocality
]->haveFreshList
= false;
420 * Wait for the asynchronously launched nonbonded tasks and data
421 * transfers to finish.
423 * Also does timing accounting and reduction of the internal staging buffers.
424 * As this is called at the end of the step, it also resets the pair list and
427 * \param[in] nb The nonbonded data GPU structure
428 * \param[in] flags Force flags
429 * \param[in] aloc Atom locality identifier
430 * \param[out] e_lj Pointer to the LJ energy output to accumulate into
431 * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
432 * \param[out] fshift Pointer to the shift force buffer to accumulate into
434 //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
435 void gpu_wait_finish_task(gmx_nbnxn_gpu_t
*nb
,
442 gpu_try_finish_task(nb
, flags
, aloc
, e_lj
, e_el
, fshift
,
443 GpuTaskCompletion::Wait
);