From 35d9ca98c92c2371aab02899e327797e4cbce457 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Fri, 10 Aug 2018 09:03:41 +0200 Subject: [PATCH] Fix missing interactions with GPU and DD Non-local LJ and Coulomb interactions would not be computed on a rank after the non-local GPU pair-list was empty at some point in time; either at the start of the run or during a run. The issue is that the pair-list was initialized conditionally on the size of the list in the device side data instead of the host side data. Although this could have led to silent errors for small step numbers, most systems will likely crash for production runs. Fixes #2502 Change-Id: Iae1c5b70624b652d625520cadb647f862f296d5b --- docs/release-notes/2018/2018.3.rst | 26 ++++++++++++++++++++++ .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 10 ++------- .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp | 13 +++++------ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/docs/release-notes/2018/2018.3.rst b/docs/release-notes/2018/2018.3.rst index d40f2a4058..c8126a3da0 100644 --- a/docs/release-notes/2018/2018.3.rst +++ b/docs/release-notes/2018/2018.3.rst @@ -9,6 +9,32 @@ earlier, which you can find described in the :ref:`release-notes`. Fixes where mdrun could behave incorrectly ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Multi-domain GPU runs can no longer miss pair interactions +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +With systems with empty space in the unit cell, GPU runs with domain +decomposition would not compute LJ and Coulomb interactions between +domains when there we no interactions between domains on a rank at some +point in time. + + - This bug only affects simulations running on GPUs with domain decomposition + and containing empty regions of space that can lead to domains being empty. + - Possible observations of this bug may have been seemingly random failures + of calculations that where not reproducible when restarting a simulation + from a checkpoint file, as the domain would then again be filled properly + if interactions are present at the beginning. + - It is unlikely that this bug will have unnoticed effects on all but + very short simulations, as the missing interactions will inevitable lead + to simulation instability and crashes. + - If a simulation that crashed due to this bug is restarted it can contain + a small region around the crash that will be unphysical due to some + interactions not being calculated just before the crash itself. + +**This is a critical fix and users of 2018.x series that run on GPUs should +update to this point release** + +:issue:`2502` + Fix Conjugate Gradient assertion failure at end of minimization """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index fe86c52557..e4d0916eca 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -52,7 +52,6 @@ #include "gromacs/mdlib/force_flags.h" #include "gromacs/mdlib/nb_verlet.h" #include "gromacs/mdlib/nbnxn_consts.h" -#include "gromacs/mdlib/nbnxn_gpu_common_utils.h" #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" #include "gromacs/mdtypes/interaction_const.h" #include "gromacs/mdtypes/md_enums.h" @@ -530,13 +529,8 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, const nbnxn_pairlist_t *h_plist, int iloc) { - if (canSkipWork(nb, iloc)) - { - return; - } - char sbuf[STRLEN]; - bool bDoTime = nb->bDoTime; + bool bDoTime = (nb->bDoTime && h_plist->nsci > 0); cudaStream_t stream = nb->stream[iloc]; cu_plist_t *d_plist = nb->plist[iloc]; diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp index 559fcfd461..de4cb58c01 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -58,7 +58,6 @@ #include "gromacs/mdlib/nb_verlet.h" #include "gromacs/mdlib/nbnxn_consts.h" #include "gromacs/mdlib/nbnxn_gpu.h" -#include "gromacs/mdlib/nbnxn_gpu_common_utils.h" #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" #include "gromacs/mdlib/nbnxn_gpu_jit_support.h" #include "gromacs/mdtypes/interaction_const.h" @@ -900,13 +899,11 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, const nbnxn_pairlist_t *h_plist, int iloc) { - if (canSkipWork(nb, iloc)) - { - return; - } - char sbuf[STRLEN]; - bool bDoTime = nb->bDoTime; + // Timing accumulation should happen only if there was work to do + // because getLastRangeTime() gets skipped with empty lists later + // which leads to the counter not being reset. + bool bDoTime = (nb->bDoTime && h_plist->nsci > 0); cl_command_queue stream = nb->stream[iloc]; cl_plist_t *d_plist = nb->plist[iloc]; -- 2.11.4.GIT