src/gromacs/nbnxm/gpu_common.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief Common functions for the different NBNXN GPU implementations.
  37  *
  38  * \author Szilard Pall <pall.szilard@gmail.com>
  39  *
  40  * \ingroup module_nbnxm
  41  */
  42
  43 #ifndef GMX_NBNXM_GPU_COMMON_H
  44 #define GMX_NBNXM_GPU_COMMON_H
  45
  46 #include "config.h"
  47
  48 #include <string>
  49
  50 #if GMX_GPU == GMX_GPU_CUDA
  51 #include "cuda/nbnxm_cuda_types.h"
  52 #endif
  53
  54 #if GMX_GPU == GMX_GPU_OPENCL
  55 #include "opencl/nbnxm_ocl_types.h"
  56 #endif
  57
  58 #include "gromacs/gpu_utils/gpu_utils.h"
  59 #include "gromacs/listed_forces/gpubonded.h"
  60 #include "gromacs/math/vec.h"
  61 #include "gromacs/mdlib/force_flags.h"
  62 #include "gromacs/nbnxm/nbnxm.h"
  63 #include "gromacs/pbcutil/ishift.h"
  64 #include "gromacs/timing/gpu_timing.h"
  65 #include "gromacs/utility/fatalerror.h"
  66 #include "gromacs/utility/stringutil.h"
  67
  68 #include "gpu_common_utils.h"
  69 #include "nbnxm_gpu.h"
  70
  71 namespace gmx
  72 {
  73 class GpuBonded;
  74 }
  75
  76 namespace Nbnxm
  77 {
  78
  79 /*! \brief Check that atom locality values are valid for the GPU module.
  80  *
  81  *  In the GPU module atom locality "all" is not supported, the local and
  82  *  non-local ranges are treated separately.
  83  *
  84  *  \param[in] atomLocality atom locality specifier
  85  */
  86 static inline void
  87 validateGpuAtomLocality(const AtomLocality atomLocality)
  88 {
  89     std::string str = gmx::formatString("Invalid atom locality passed (%d); valid here is only "
  90                                         "local (%d) or nonlocal (%d)",
  91                                         static_cast<int>(atomLocality),
  92                                         static_cast<int>(AtomLocality::Local),
  93                                         static_cast<int>(AtomLocality::NonLocal));
  94
  95     GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, str.c_str());
  96 }
  97
  98 /*! \brief Convert atom locality to interaction locality.
  99  *
 100  *  In the current implementation the this is straightforward conversion:
 101  *  local to local, non-local to non-local.
 102  *
 103  *  \param[in] atomLocality Atom locality specifier
 104  *  \returns                Interaction locality corresponding to the atom locality passed.
 105  */
 106 static inline InteractionLocality
 107 gpuAtomToInteractionLocality(const AtomLocality atomLocality)
 108 {
 109     validateGpuAtomLocality(atomLocality);
 110
 111     /* determine interaction locality from atom locality */
 112     if (atomLocality == AtomLocality::Local)
 113     {
 114         return InteractionLocality::Local;
 115     }
 116     else if (atomLocality == AtomLocality::NonLocal)
 117     {
 118         return InteractionLocality::NonLocal;
 119     }
 120     else
 121     {
 122         gmx_incons("Wrong locality");
 123     }
 124 }
 125
 126
 127 void
 128 setupGpuShortRangeWork(gmx_nbnxn_gpu_t                  *nb,
 129                        const gmx::GpuBonded             *gpuBonded,
 130                        const Nbnxm::InteractionLocality  iLocality)
 131 {
 132     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 133
 134     // There is short-range work if the pair list for the provided
 135     // interaction locality contains entries or if there is any
 136     // bonded work (as this is not split into local/nonlocal).
 137     nb->haveWork[iLocality] =
 138         ((nb->plist[iLocality]->nsci != 0) ||
 139          (gpuBonded != nullptr && gpuBonded->haveInteractions()));
 140 }
 141
 142 /*! \brief Returns true if there is GPU short-range work for the given interaction locality.
 143  *
 144  * Note that as, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal,
 145  * and therefore if there are GPU offloaded bonded interactions, this function will return
 146  * true for all interaction localities.
 147  *
 148  * \param[inout]  nb        Pointer to the nonbonded GPU data structure
 149  * \param[in]     iLocality Interaction locality identifier
 150  */
 151 static bool
 152 haveGpuShortRangeWork(const gmx_nbnxn_gpu_t            &nb,
 153                       const Nbnxm::InteractionLocality  iLocality)
 154 {
 155     return nb.haveWork[iLocality];
 156 }
 157
 158 bool
 159 haveGpuShortRangeWork(const gmx_nbnxn_gpu_t     *nb,
 160                       const Nbnxm::AtomLocality  aLocality)
 161 {
 162     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 163
 164     return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
 165 }
 166
 167
 168
 169 /*! \brief Calculate atom range and return start index and length.
 170  *
 171  * \param[in] atomData Atom descriptor data structure
 172  * \param[in] atomLocality Atom locality specifier
 173  * \param[out] atomRangeBegin Starting index of the atom range in the atom data array.
 174  * \param[out] atomRangeLen Atom range length in the atom data array.
 175  */
 176 template <typename AtomDataT>
 177 static inline void
 178 getGpuAtomRange(const AtomDataT    *atomData,
 179                 const AtomLocality  atomLocality,
 180                 int                *atomRangeBegin,
 181                 int                *atomRangeLen)
 182 {
 183     assert(atomData);
 184     validateGpuAtomLocality(atomLocality);
 185
 186     /* calculate the atom data index range based on locality */
 187     if (atomLocality == AtomLocality::Local)
 188     {
 189         *atomRangeBegin  = 0;
 190         *atomRangeLen    = atomData->natoms_local;
 191     }
 192     else
 193     {
 194         *atomRangeBegin  = atomData->natoms_local;
 195         *atomRangeLen    = atomData->natoms - atomData->natoms_local;
 196     }
 197 }
 198
 199
 200 /*! \brief Count pruning kernel time if either kernel has been triggered
 201  *
 202  *  We do the accounting for either of the two pruning kernel flavors:
 203  *   - 1st pass prune: ran during the current step (prior to the force kernel);
 204  *   - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
 205  *
 206  * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
 207  * after calling this function.
 208  *
 209  * \param[in] timers   structs with GPU timer objects
 210  * \param[inout] timings  GPU task timing data
 211  * \param[in] iloc        interaction locality
 212  */
 213 template <typename GpuTimers>
 214 static void countPruneKernelTime(GpuTimers                 *timers,
 215                                  gmx_wallclock_gpu_nbnxn_t *timings,
 216                                  const InteractionLocality  iloc)
 217 {
 218     gpu_timers_t::Interaction &iTimers = timers->interaction[iloc];
 219
 220     // We might have not done any pruning (e.g. if we skipped with empty domains).
 221     if (!iTimers.didPrune &&
 222         !iTimers.didRollingPrune)
 223     {
 224         return;
 225     }
 226
 227     if (iTimers.didPrune)
 228     {
 229         timings->pruneTime.c++;
 230         timings->pruneTime.t += iTimers.prune_k.getLastRangeTime();
 231     }
 232
 233     if (iTimers.didRollingPrune)
 234     {
 235         timings->dynamicPruneTime.c++;
 236         timings->dynamicPruneTime.t += iTimers.rollingPrune_k.getLastRangeTime();
 237     }
 238 }
 239
 240 /*! \brief Reduce data staged internally in the nbnxn module.
 241  *
 242  * Shift forces and electrostatic/LJ energies copied from the GPU into
 243  * a module-internal staging area are immediately reduced (CPU-side buffers passed)
 244  * after having waited for the transfers' completion.
 245  *
 246  * Note that this function should always be called after the transfers into the
 247  * staging buffers has completed.
 248  *
 249  * \tparam     StagingData    Type of staging data
 250  * \param[in]  nbst           Nonbonded staging data
 251  * \param[in]  iLocality      Interaction locality specifier
 252  * \param[in]  reduceEnergies True if energy reduction should be done
 253  * \param[in]  reduceFshift   True if shift force reduction should be done
 254  * \param[out] e_lj           Variable to accumulate LJ energy into
 255  * \param[out] e_el           Variable to accumulate electrostatic energy into
 256  * \param[out] fshift         Pointer to the array of shift forces to accumulate into
 257  */
 258 template <typename StagingData>
 259 static inline void
 260 gpu_reduce_staged_outputs(const StagingData         &nbst,
 261                           const InteractionLocality  iLocality,
 262                           const bool                 reduceEnergies,
 263                           const bool                 reduceFshift,
 264                           real                      *e_lj,
 265                           real                      *e_el,
 266                           rvec                      *fshift)
 267 {
 268     /* add up energies and shift forces (only once at local F wait) */
 269     if (iLocality == InteractionLocality::Local)
 270     {
 271         if (reduceEnergies)
 272         {
 273             *e_lj += *nbst.e_lj;
 274             *e_el += *nbst.e_el;
 275         }
 276
 277         if (reduceFshift)
 278         {
 279             for (int i = 0; i < SHIFTS; i++)
 280             {
 281                 rvec_inc(fshift[i], nbst.fshift[i]);
 282             }
 283         }
 284     }
 285 }
 286
 287 /*! \brief Do the per-step timing accounting of the nonbonded tasks.
 288  *
 289  *  Does timing accumulation and call-count increments for the nonbonded kernels.
 290  *  Note that this function should be called after the current step's nonbonded
 291  *  nonbonded tasks have completed with the exception of the rolling pruning kernels
 292  *  that are accounted for during the following step.
 293  *
 294  * NOTE: if timing with multiple GPUs (streams) becomes possible, the
 295  *      counters could end up being inconsistent due to not being incremented
 296  *      on some of the node when this is skipped on empty local domains!
 297  *
 298  * \tparam     GpuTimers         GPU timers type
 299  * \tparam     GpuPairlist       Pair list type
 300  * \param[out] timings           Pointer to the NB GPU timings data
 301  * \param[in]  timers            Pointer to GPU timers data
 302  * \param[in]  plist             Pointer to the pair list data
 303  * \param[in]  atomLocality      Atom locality specifier
 304  * \param[in]  didEnergyKernels  True if energy kernels have been called in the current step
 305  * \param[in]  doTiming          True if timing is enabled.
 306  *
 307  */
 308 template <typename GpuTimers, typename GpuPairlist>
 309 static inline void
 310 gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
 311                        GpuTimers                 *timers,
 312                        const GpuPairlist         *plist,
 313                        AtomLocality               atomLocality,
 314                        bool                       didEnergyKernels,
 315                        bool                       doTiming)
 316 {
 317     /* timing data accumulation */
 318     if (!doTiming)
 319     {
 320         return;
 321     }
 322
 323     /* determine interaction locality from atom locality */
 324     const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
 325
 326     /* only increase counter once (at local F wait) */
 327     if (iLocality == InteractionLocality::Local)
 328     {
 329         timings->nb_c++;
 330         timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
 331     }
 332
 333     /* kernel timings */
 334     timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
 335         timers->interaction[iLocality].nb_k.getLastRangeTime();
 336
 337     /* X/q H2D and F D2H timings */
 338     timings->nb_h2d_t += timers->xf[atomLocality].nb_h2d.getLastRangeTime();
 339     timings->nb_d2h_t += timers->xf[atomLocality].nb_d2h.getLastRangeTime();
 340
 341     /* Count the pruning kernel times for both cases:1st pass (at search step)
 342        and rolling pruning (if called at the previous step).
 343        We do the accounting here as this is the only sync point where we
 344        know (without checking or additional sync-ing) that prune tasks in
 345        in the current stream have completed (having just blocking-waited
 346        for the force D2H). */
 347     countPruneKernelTime(timers, timings, iLocality);
 348
 349     /* only count atdat and pair-list H2D at pair-search step */
 350     if (timers->interaction[iLocality].didPairlistH2D)
 351     {
 352         /* atdat transfer timing (add only once, at local F wait) */
 353         if (atomLocality == AtomLocality::Local)
 354         {
 355             timings->pl_h2d_c++;
 356             timings->pl_h2d_t += timers->atdat.getLastRangeTime();
 357         }
 358
 359         timings->pl_h2d_t += timers->interaction[iLocality].pl_h2d.getLastRangeTime();
 360
 361         /* Clear the timing flag for the next step */
 362         timers->interaction[iLocality].didPairlistH2D = false;
 363     }
 364 }
 365
 366 //TODO: move into shared source file with gmx_compile_cpp_as_cuda
 367 //NOLINTNEXTLINE(misc-definitions-in-headers)
 368 bool gpu_try_finish_task(gmx_nbnxn_gpu_t    *nb,
 369                          const int           flags,
 370                          const AtomLocality  aloc,
 371                          real               *e_lj,
 372                          real               *e_el,
 373                          rvec               *fshift,
 374                          GpuTaskCompletion   completionKind)
 375 {
 376     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 377
 378     /* determine interaction locality from atom locality */
 379     const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
 380
 381     //  We skip when during the non-local phase there was actually no work to do.
 382     //  This is consistent with nbnxn_gpu_launch_kernel but it also considers possible
 383     //  bonded GPU work.
 384     if ((iLocality == InteractionLocality::Local) || haveGpuShortRangeWork(*nb, iLocality))
 385     {
 386         // Query the state of the GPU stream and return early if we're not done
 387         if (completionKind == GpuTaskCompletion::Check)
 388         {
 389             if (!haveStreamTasksCompleted(nb->stream[iLocality]))
 390             {
 391                 // Early return to skip the steps below that we have to do only
 392                 // after the NB task completed
 393                 return false;
 394             }
 395         }
 396         else
 397         {
 398             gpuStreamSynchronize(nb->stream[iLocality]);
 399         }
 400
 401         bool calcEner   = (flags & GMX_FORCE_ENERGY) != 0;
 402         bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0;
 403
 404         gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner,
 405                                nb->bDoTime != 0);
 406
 407         gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift);
 408     }
 409
 410     /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
 411     nb->timers->interaction[iLocality].didPrune = nb->timers->interaction[iLocality].didRollingPrune = false;
 412
 413     /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
 414     nb->plist[iLocality]->haveFreshList = false;
 415
 416     return true;
 417 }
 418
 419 /*! \brief
 420  * Wait for the asynchronously launched nonbonded tasks and data
 421  * transfers to finish.
 422  *
 423  * Also does timing accounting and reduction of the internal staging buffers.
 424  * As this is called at the end of the step, it also resets the pair list and
 425  * pruning flags.
 426  *
 427  * \param[in] nb The nonbonded data GPU structure
 428  * \param[in] flags Force flags
 429  * \param[in] aloc Atom locality identifier
 430  * \param[out] e_lj Pointer to the LJ energy output to accumulate into
 431  * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
 432  * \param[out] fshift Pointer to the shift force buffer to accumulate into
 433  */
 434 //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
 435 void gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
 436                           int              flags,
 437                           AtomLocality     aloc,
 438                           real            *e_lj,
 439                           real            *e_el,
 440                           rvec            *fshift)
 441 {
 442     gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, fshift,
 443                         GpuTaskCompletion::Wait);
 444 }
 445
 446 } // namespace Nbnxm
 447
 448 #endif