From 2c644d3b5ebb64f6c89a5db36160c9dc30329888 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Tue, 7 Jul 2020 09:50:45 +0000 Subject: [PATCH] Make cl_nbparam into a struct This is needed to unify with CUDA path --- docs/release-notes/2021/major/miscellaneous.rst | 8 + docs/user-guide/environment-variables.rst | 18 +- src/gromacs/nbnxm/CMakeLists.txt | 2 + src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 8 +- src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 170 +---------------- src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh | 2 +- .../nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu | 4 +- .../nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh | 14 +- src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h | 7 +- src/gromacs/nbnxm/nbnxm_gpu.h | 5 - src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp | 203 +++++++++++++++++++++ ...a_kernel_pruneonly.cu => nbnxm_gpu_data_mgmt.h} | 56 +++++- src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 63 +------ src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp | 102 +---------- src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h | 7 +- 15 files changed, 294 insertions(+), 375 deletions(-) create mode 100644 src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp copy src/gromacs/nbnxm/{cuda/nbnxm_cuda_kernel_pruneonly.cu => nbnxm_gpu_data_mgmt.h} (51%) diff --git a/docs/release-notes/2021/major/miscellaneous.rst b/docs/release-notes/2021/major/miscellaneous.rst index fa547d4186..712b93da87 100644 --- a/docs/release-notes/2021/major/miscellaneous.rst +++ b/docs/release-notes/2021/major/miscellaneous.rst @@ -25,3 +25,11 @@ change outside of the users direct control we have removed the support for automatically setting booleans. GMX_BUILD_HELP and GMX_HWLOC are now disabled by default, while GMX_LOAD_PLUGINS is enabled by default. +Unification of several CUDA and OpenCL environment variables +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +The environment variables that had exactly the same meaning in OpenCL and CUDA were unified: + +* GMX_CUDA_NB_ANA_EWALD and GMX_OCL_NB_ANA_EWALD into GMX_GPU_NB_ANA_EWALD +* GMX_CUDA_NB_TAB_EWALD and GMX_OCL_NB_TAB_EWALD into GMX_GPU_NB_TAB_EWALD +* GMX_CUDA_NB_EWALD_TWINCUT and GMX_OCL_NB_EWALD_TWINCUT into GMX_GPU_NB_EWALD_TWINCUT diff --git a/docs/user-guide/environment-variables.rst b/docs/user-guide/environment-variables.rst index 8dbb431121..3878c4f298 100644 --- a/docs/user-guide/environment-variables.rst +++ b/docs/user-guide/environment-variables.rst @@ -141,15 +141,15 @@ Performance and Run Control to localized bonded interaction distribution; optimal value dependent on system and hardware, default value is 4. -``GMX_CUDA_NB_EWALD_TWINCUT`` +``GMX_GPU_NB_EWALD_TWINCUT`` force the use of twin-range cutoff kernel even if :mdp:`rvdw` equals :mdp:`rcoulomb` after PP-PME load balancing. The switch to twin-range kernels is automated, so this variable should be used only for benchmarking. -``GMX_CUDA_NB_ANA_EWALD`` +``GMX_GPU_NB_ANA_EWALD`` force the use of analytical Ewald kernels. Should be used only for benchmarking. -``GMX_CUDA_NB_TAB_EWALD`` +``GMX_GPU_NB_TAB_EWALD`` force the use of tabulated Ewald kernels. Should be used only for benchmarking. ``GMX_DISABLE_CUDA_TIMING`` @@ -463,18 +463,6 @@ compilation of OpenCL kernels, but they are also used in device selection. Enables i-atom data (type or LJ parameter) prefetch allowing testing on platforms where this behavior is not default. -``GMX_OCL_NB_ANA_EWALD`` - Forces the use of analytical Ewald kernels. Equivalent of - CUDA environment variable ``GMX_CUDA_NB_ANA_EWALD`` - -``GMX_OCL_NB_TAB_EWALD`` - Forces the use of tabulated Ewald kernel. Equivalent - of CUDA environment variable ``GMX_OCL_NB_TAB_EWALD`` - -``GMX_OCL_NB_EWALD_TWINCUT`` - Forces the use of twin-range cutoff kernel. Equivalent of - CUDA environment variable ``GMX_CUDA_NB_EWALD_TWINCUT`` - ``GMX_OCL_FILE_PATH`` Use this parameter to force |Gromacs| to load the OpenCL kernels from a custom location. Use it only if you want to diff --git a/src/gromacs/nbnxm/CMakeLists.txt b/src/gromacs/nbnxm/CMakeLists.txt index ecfb452e4f..497ba6d631 100644 --- a/src/gromacs/nbnxm/CMakeLists.txt +++ b/src/gromacs/nbnxm/CMakeLists.txt @@ -63,11 +63,13 @@ file (GLOB NBNXM_SOURCES if(GMX_USE_CUDA) add_subdirectory(cuda) + gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp) endif() if(GMX_USE_OPENCL) add_subdirectory(opencl) set(NBNXM_OPENCL_KERNELS ${NBNXM_OPENCL_KERNELS} PARENT_SCOPE) + gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp) endif() set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${NBNXM_SOURCES} PARENT_SCOPE) diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index f7a12cf99d..71e598fdf1 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -121,7 +121,7 @@ namespace Nbnxm constexpr static int c_bufOpsThreadsPerBlock = 128; /*! Nonbonded kernel function pointer type */ -typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t, const NBParamGpu, const cu_plist_t, bool); +typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t, const NBParamGpu, const gpu_plist, bool); /*********************************/ @@ -403,7 +403,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ cu_atomdata_t* adat = nb->atdat; - cu_plist_t* plist = nb->plist[iloc]; + gpu_plist* plist = nb->plist[iloc]; cu_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -484,7 +484,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In { cu_atomdata_t* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; - cu_plist_t* plist = nb->plist[iloc]; + gpu_plist* plist = nb->plist[iloc]; cu_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -597,7 +597,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c { cu_atomdata_t* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; - cu_plist_t* plist = nb->plist[iloc]; + gpu_plist* plist = nb->plist[iloc]; cu_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index 69beab6b6f..f5d64d7d83 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -65,6 +65,7 @@ #include "gromacs/nbnxm/gridset.h" #include "gromacs/nbnxm/nbnxm.h" #include "gromacs/nbnxm/nbnxm_gpu.h" +#include "gromacs/nbnxm/nbnxm_gpu_data_mgmt.h" #include "gromacs/nbnxm/pairlistsets.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/timing/gpu_timing.h" @@ -92,30 +93,6 @@ static unsigned int gpu_min_ci_balanced_factor = 44; /* Fw. decl. */ static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb); -/* Fw. decl, */ -static void nbnxn_cuda_free_nbparam_table(NBParamGpu* nbparam); - -/*! \brief Initialized the Ewald Coulomb correction GPU table. - - Tabulates the Ewald Coulomb force and initializes the size/scale - and the table GPU array. If called with an already allocated table, - it just re-uploads the table. - */ -static void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables, - NBParamGpu* nbp, - const DeviceContext& deviceContext) -{ - if (nbp->coulomb_tab != nullptr) - { - nbnxn_cuda_free_nbparam_table(nbp); - } - - nbp->coulomb_tab_scale = tables.scale; - initParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), - tables.tableF.size(), deviceContext); -} - - /*! Initializes the atomdata structure first time, it only gets filled at pair-search. */ static void init_atomdata_first(cu_atomdata_t* ad, int ntypes, const DeviceContext& deviceContext) @@ -138,82 +115,6 @@ static void init_atomdata_first(cu_atomdata_t* ad, int ntypes, const DeviceConte ad->nalloc = -1; } -/*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on - earlier GPUs, single or twin cut-off. */ -static int pick_ewald_kernel_type(const interaction_const_t& ic) -{ - bool bTwinCut = (ic.rcoulomb != ic.rvdw); - bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald; - int kernel_type; - - /* Benchmarking/development environment variables to force the use of - analytical or tabulated Ewald kernel. */ - bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != nullptr); - bForceTabulatedEwald = (getenv("GMX_CUDA_NB_TAB_EWALD") != nullptr); - - if (bForceAnalyticalEwald && bForceTabulatedEwald) - { - gmx_incons( - "Both analytical and tabulated Ewald CUDA non-bonded kernels " - "requested through environment variables."); - } - - /* By default use analytical Ewald. */ - bUseAnalyticalEwald = true; - if (bForceAnalyticalEwald) - { - if (debug) - { - fprintf(debug, "Using analytical Ewald CUDA kernels\n"); - } - } - else if (bForceTabulatedEwald) - { - bUseAnalyticalEwald = false; - - if (debug) - { - fprintf(debug, "Using tabulated Ewald CUDA kernels\n"); - } - } - - /* Use twin cut-off kernels if requested by bTwinCut or the env. var. - forces it (use it for debugging/benchmarking only). */ - if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == nullptr)) - { - kernel_type = bUseAnalyticalEwald ? eelTypeEWALD_ANA : eelTypeEWALD_TAB; - } - else - { - kernel_type = bUseAnalyticalEwald ? eelTypeEWALD_ANA_TWIN : eelTypeEWALD_TAB_TWIN; - } - - return kernel_type; -} - -/*! Copies all parameters related to the cut-off from ic to nbp */ -static void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams) -{ - nbp->ewald_beta = ic->ewaldcoeff_q; - nbp->sh_ewald = ic->sh_ewald; - nbp->epsfac = ic->epsfac; - nbp->two_k_rf = 2.0 * ic->k_rf; - nbp->c_rf = ic->c_rf; - nbp->rvdw_sq = ic->rvdw * ic->rvdw; - nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb; - nbp->rlistOuter_sq = listParams.rlistOuter * listParams.rlistOuter; - nbp->rlistInner_sq = listParams.rlistInner * listParams.rlistInner; - nbp->useDynamicPruning = listParams.useDynamicPruning; - - nbp->sh_lj_ewald = ic->sh_lj_ewald; - nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj; - - nbp->rvdw_switch = ic->rvdw_switch; - nbp->dispersion_shift = ic->dispersion_shift; - nbp->repulsion_shift = ic->repulsion_shift; - nbp->vdw_switch = ic->vdw_switch; -} - /*! Initializes the nonbonded parameter data structure. */ static void init_nbparam(NBParamGpu* nbp, const interaction_const_t* ic, @@ -290,7 +191,7 @@ static void init_nbparam(NBParamGpu* nbp, } else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD)) { - nbp->eeltype = pick_ewald_kernel_type(*ic); + nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(*ic); } else { @@ -336,59 +237,12 @@ void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interacti set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params()); - nbp->eeltype = pick_ewald_kernel_type(*ic); + nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(*ic); GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables"); init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_); } -/*! Initializes the pair list data structure. */ -static void init_plist(cu_plist_t* pl) -{ - /* initialize to nullptr pointers to data that is not allocated here and will - need reallocation in nbnxn_gpu_init_pairlist */ - pl->sci = nullptr; - pl->cj4 = nullptr; - pl->imask = nullptr; - pl->excl = nullptr; - - /* size -1 indicates that the respective array hasn't been initialized yet */ - pl->na_c = -1; - pl->nsci = -1; - pl->sci_nalloc = -1; - pl->ncj4 = -1; - pl->cj4_nalloc = -1; - pl->nimask = -1; - pl->imask_nalloc = -1; - pl->nexcl = -1; - pl->excl_nalloc = -1; - pl->haveFreshList = false; -} - -/*! Initializes the timings data structure. */ -static void init_timings(gmx_wallclock_gpu_nbnxn_t* t) -{ - int i, j; - - t->nb_h2d_t = 0.0; - t->nb_d2h_t = 0.0; - t->nb_c = 0; - t->pl_h2d_t = 0.0; - t->pl_h2d_c = 0; - for (i = 0; i < 2; i++) - { - for (j = 0; j < 2; j++) - { - t->ktime[i][j].t = 0.0; - t->ktime[i][j].c = 0; - } - } - t->pruneTime.c = 0; - t->pruneTime.t = 0.0; - t->dynamicPruneTime.c = 0; - t->dynamicPruneTime.t = 0.0; -} - /*! Initializes simulation constant data. */ static void cuda_init_const(NbnxmGpu* nb, const interaction_const_t* ic, @@ -500,7 +354,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte char sbuf[STRLEN]; bool bDoTime = (nb->bDoTime && !h_plist->sci.empty()); const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; - cu_plist_t* d_plist = nb->plist[iloc]; + gpu_plist* d_plist = nb->plist[iloc]; if (d_plist->na_c < 0) { @@ -510,7 +364,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte { if (d_plist->na_c != h_plist->na_ci) { - sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)", + sprintf(sbuf, "In init_plist: the #atoms per cell has changed (from %d to %d)", d_plist->na_c, h_plist->na_ci); gmx_incons(sbuf); } @@ -679,14 +533,6 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) } } -static void nbnxn_cuda_free_nbparam_table(NBParamGpu* nbparam) -{ - if (nbparam->eeltype == eelTypeEWALD_TAB || nbparam->eeltype == eelTypeEWALD_TAB_TWIN) - { - destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj); - } -} - void gpu_free(NbnxmGpu* nb) { cudaError_t stat; @@ -701,7 +547,11 @@ void gpu_free(NbnxmGpu* nb) atdat = nb->atdat; nbparam = nb->nbparam; - nbnxn_cuda_free_nbparam_table(nbparam); + if ((!nbparam->coulomb_tab) + && (nbparam->eeltype == eelTypeEWALD_TAB || nbparam->eeltype == eelTypeEWALD_TAB_TWIN)) + { + destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj); + } stat = cudaEventDestroy(nb->nonlocal_done); CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done"); diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh index 7faea980b7..9cddbc1999 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh @@ -158,7 +158,7 @@ __launch_bounds__(THREADS_PER_BLOCK) __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) # endif /* CALC_ENERGIES */ #endif /* PRUNE_NBL */ - (const cu_atomdata_t atdat, const NBParamGpu nbparam, const cu_plist_t plist, bool bCalcFshift) + (const cu_atomdata_t atdat, const NBParamGpu nbparam, const Nbnxm::gpu_plist plist, bool bCalcFshift) #ifdef FUNCTION_DECLARATION_ONLY ; /* Only do function declaration, omit the function body. */ #else diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu index 81755cb903..fb8ebb2e76 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu @@ -39,7 +39,7 @@ #ifndef FUNCTION_DECLARATION_ONLY /* Instantiate external template functions */ template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const cu_plist_t, int, int); +nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int); template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const cu_plist_t, int, int); +nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int); #endif diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh index e5bf2b967c..563e1edc0c 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh @@ -103,20 +103,20 @@ */ template __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) __global__ - void nbnxn_kernel_prune_cuda(const cu_atomdata_t atdat, - const NBParamGpu nbparam, - const cu_plist_t plist, - int numParts, - int part) + void nbnxn_kernel_prune_cuda(const cu_atomdata_t atdat, + const NBParamGpu nbparam, + const Nbnxm::gpu_plist plist, + int numParts, + int part) #ifdef FUNCTION_DECLARATION_ONLY ; /* Only do function declaration, omit the function body. */ // Add extern declarations so each translation unit understands that // there will be a definition provided. extern template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const cu_plist_t, int, int); +nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int); extern template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const cu_plist_t, int, int); +nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int); #else { diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index acadca29c5..1044c10162 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -138,11 +138,6 @@ struct cu_atomdata }; /** \internal - * \brief Pair list data. - */ -using cu_plist_t = Nbnxm::gpu_plist; - -/** \internal * \brief Typedef of actual timer type. */ typedef struct Nbnxm::gpu_timers_t cu_timers_t; @@ -190,7 +185,7 @@ struct NbnxmGpu /*! \brief parameters required for the non-bonded calc. */ NBParamGpu* nbparam = nullptr; /*! \brief pair-list data structures (local and non-local) */ - gmx::EnumerationArray plist = { { nullptr } }; + gmx::EnumerationArray plist = { { nullptr } }; /*! \brief staging area where fshift/energies get downloaded */ nb_staging_t nbst; /*! \brief local and non-local GPU streams */ diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index 465bce44f7..eace699386 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -275,11 +275,6 @@ float gpu_wait_finish_task(NbnxmGpu gmx_unused* nb, gmx::ArrayRef gmx_unused shiftForces, gmx_wallcycle gmx_unused* wcycle) GPU_FUNC_TERM_WITH_RETURN(0.0); -/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */ -GPU_FUNC_QUALIFIER -int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t gmx_unused& ic) - GPU_FUNC_TERM_WITH_RETURN(-1); - /*! \brief Initialization for X buffer operations on GPU. * Called on the NS step and performs (re-)allocations and memory copies. !*/ CUDA_FUNC_QUALIFIER diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp new file mode 100644 index 0000000000..7f6e433054 --- /dev/null +++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp @@ -0,0 +1,203 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team. + * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief Define common implementation of nbnxm_gpu_data_mgmt.h + * + * \author Anca Hamuraru + * \author Dimitrios Karkoulis + * \author Teemu Virolainen + * \author Szilárd Páll + * \author Artem Zhmurov + * + * \ingroup module_nbnxm + */ +#include "gmxpre.h" + +#include "config.h" + +#if GMX_GPU == GMX_GPU_CUDA +# include "cuda/nbnxm_cuda_types.h" +#endif + +#if GMX_GPU == GMX_GPU_OPENCL +# include "opencl/nbnxm_ocl_types.h" +#endif + +#include "nbnxm_gpu_data_mgmt.h" + +#include "gromacs/timing/gpu_timing.h" + +#include "nbnxm_gpu.h" + +namespace Nbnxm +{ + +void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables, + NBParamGpu* nbp, + const DeviceContext& deviceContext) +{ + if (!nbp->coulomb_tab) + { + destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj); + } + + nbp->coulomb_tab_scale = tables.scale; + initParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), + tables.tableF.size(), deviceContext); +} + +int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic) +{ + bool bTwinCut = (ic.rcoulomb != ic.rvdw); + bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald; + int kernel_type; + + /* Benchmarking/development environment variables to force the use of + analytical or tabulated Ewald kernel. */ + bForceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr); + bForceTabulatedEwald = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr); + + if (bForceAnalyticalEwald && bForceTabulatedEwald) + { + gmx_incons( + "Both analytical and tabulated Ewald GPU non-bonded kernels " + "requested through environment variables."); + } + + /* By default, use analytical Ewald + * TODO: tabulated does not work in OpenCL, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp + * + */ + bUseAnalyticalEwald = true; + if (bForceAnalyticalEwald) + { + if (debug) + { + fprintf(debug, "Using analytical Ewald GPU kernels\n"); + } + } + else if (bForceTabulatedEwald) + { + bUseAnalyticalEwald = false; + + if (debug) + { + fprintf(debug, "Using tabulated Ewald GPU kernels\n"); + } + } + + /* Use twin cut-off kernels if requested by bTwinCut or the env. var. + forces it (use it for debugging/benchmarking only). */ + if (!bTwinCut && (getenv("GMX_GPU_NB_EWALD_TWINCUT") == nullptr)) + { + kernel_type = bUseAnalyticalEwald ? eelTypeEWALD_ANA : eelTypeEWALD_TAB; + } + else + { + kernel_type = bUseAnalyticalEwald ? eelTypeEWALD_ANA_TWIN : eelTypeEWALD_TAB_TWIN; + } + + return kernel_type; +} + +void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams) +{ + nbp->ewald_beta = ic->ewaldcoeff_q; + nbp->sh_ewald = ic->sh_ewald; + nbp->epsfac = ic->epsfac; + nbp->two_k_rf = 2.0 * ic->k_rf; + nbp->c_rf = ic->c_rf; + nbp->rvdw_sq = ic->rvdw * ic->rvdw; + nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb; + nbp->rlistOuter_sq = listParams.rlistOuter * listParams.rlistOuter; + nbp->rlistInner_sq = listParams.rlistInner * listParams.rlistInner; + nbp->useDynamicPruning = listParams.useDynamicPruning; + + nbp->sh_lj_ewald = ic->sh_lj_ewald; + nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj; + + nbp->rvdw_switch = ic->rvdw_switch; + nbp->dispersion_shift = ic->dispersion_shift; + nbp->repulsion_shift = ic->repulsion_shift; + nbp->vdw_switch = ic->vdw_switch; +} + +void init_plist(gpu_plist* pl) +{ + /* initialize to nullptr pointers to data that is not allocated here and will + need reallocation in nbnxn_gpu_init_pairlist */ + pl->sci = nullptr; + pl->cj4 = nullptr; + pl->imask = nullptr; + pl->excl = nullptr; + + /* size -1 indicates that the respective array hasn't been initialized yet */ + pl->na_c = -1; + pl->nsci = -1; + pl->sci_nalloc = -1; + pl->ncj4 = -1; + pl->cj4_nalloc = -1; + pl->nimask = -1; + pl->imask_nalloc = -1; + pl->nexcl = -1; + pl->excl_nalloc = -1; + pl->haveFreshList = false; +} + +void init_timings(gmx_wallclock_gpu_nbnxn_t* t) +{ + int i, j; + + t->nb_h2d_t = 0.0; + t->nb_d2h_t = 0.0; + t->nb_c = 0; + t->pl_h2d_t = 0.0; + t->pl_h2d_c = 0; + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + t->ktime[i][j].t = 0.0; + t->ktime[i][j].c = 0; + } + } + t->pruneTime.c = 0; + t->pruneTime.t = 0.0; + t->dynamicPruneTime.c = 0; + t->dynamicPruneTime.t = 0.0; +} + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.h similarity index 51% copy from src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu copy to src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.h index 81755cb903..8c17e7749d 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu +++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.h @@ -1,7 +1,8 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team. + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -32,14 +33,49 @@ * To help us fund GROMACS development, we humbly ask that you cite * the research papers on the package. Check out http://www.gromacs.org. */ -#include "gmxpre.h" +/*! \libinternal \file + * \brief Declare common functions for NBNXM GPU data management. + * + * \author Artem Zhmurov + * + * \ingroup module_nbnxm + */ + +#ifndef GMX_NBNXM_NBNXM_GPU_DATA_MGMT_H +#define GMX_NBNXM_NBNXM_GPU_DATA_MGMT_H + +struct interaction_const_t; +struct NBParamGpu; +struct PairlistParams; + +namespace Nbnxm +{ + +struct gpu_plist; + +/*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale and the table GPU array. + * + * If called with an already allocated table, it just re-uploads the + * table. + */ +void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables, + NBParamGpu* nbp, + const DeviceContext& deviceContext); + +/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */ +int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t gmx_unused& ic); + +/*! \brief Copies all parameters related to the cut-off from ic to nbp + */ +void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams); + +/*! \brief Initializes the pair list data structure. + */ +void init_plist(gpu_plist* pl); + +/*! \brief Initializes the timings data structure. */ +void init_timings(gmx_wallclock_gpu_nbnxn_t* t); -#include "nbnxm_cuda_kernel_pruneonly.cuh" +} // namespace Nbnxm -#ifndef FUNCTION_DECLARATION_ONLY -/* Instantiate external template functions */ -template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const cu_plist_t, int, int); -template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const cu_plist_t, int, int); -#endif +#endif // GMX_NBNXM_NBNXM_GPU_DATA_MGMT_H diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index cd929a4dbd..eaa7bfec4b 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -484,7 +484,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom int adat_begin, adat_len; cl_atomdata_t* adat = nb->atdat; - cl_plist_t* plist = nb->plist[iloc]; + gpu_plist* plist = nb->plist[iloc]; cl_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -586,7 +586,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb { cl_atomdata_t* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; - cl_plist_t* plist = nb->plist[iloc]; + gpu_plist* plist = nb->plist[iloc]; cl_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -723,7 +723,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c { cl_atomdata_t* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; - cl_plist_t* plist = nb->plist[iloc]; + gpu_plist* plist = nb->plist[iloc]; cl_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -951,61 +951,4 @@ void gpu_launch_cpyback(NbnxmGpu* nb, } } - -/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */ -int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic) -{ - bool bTwinCut = (ic.rcoulomb != ic.rvdw); - bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald; - int kernel_type; - - /* Benchmarking/development environment variables to force the use of - analytical or tabulated Ewald kernel. */ - bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != nullptr); - bForceTabulatedEwald = (getenv("GMX_OCL_NB_TAB_EWALD") != nullptr); - - if (bForceAnalyticalEwald && bForceTabulatedEwald) - { - gmx_incons( - "Both analytical and tabulated Ewald OpenCL non-bonded kernels " - "requested through environment variables."); - } - - /* OpenCL: By default, use analytical Ewald - * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp - * - */ - /* By default use analytical Ewald. */ - bUseAnalyticalEwald = true; - if (bForceAnalyticalEwald) - { - if (debug) - { - fprintf(debug, "Using analytical Ewald OpenCL kernels\n"); - } - } - else if (bForceTabulatedEwald) - { - bUseAnalyticalEwald = false; - - if (debug) - { - fprintf(debug, "Using tabulated Ewald OpenCL kernels\n"); - } - } - - /* Use twin cut-off kernels if requested by bTwinCut or the env. var. - forces it (use it for debugging/benchmarking only). */ - if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == nullptr)) - { - kernel_type = bUseAnalyticalEwald ? eelTypeEWALD_ANA : eelTypeEWALD_TAB; - } - else - { - kernel_type = bUseAnalyticalEwald ? eelTypeEWALD_ANA_TWIN : eelTypeEWALD_TAB_TWIN; - } - - return kernel_type; -} - } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index 766789b930..7d74ebac4c 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -65,6 +65,7 @@ #include "gromacs/nbnxm/gpu_jit_support.h" #include "gromacs/nbnxm/nbnxm.h" #include "gromacs/nbnxm/nbnxm_gpu.h" +#include "gromacs/nbnxm/nbnxm_gpu_data_mgmt.h" #include "gromacs/nbnxm/pairlistsets.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/timing/gpu_timing.h" @@ -98,29 +99,6 @@ namespace Nbnxm */ static unsigned int gpu_min_ci_balanced_factor = 50; -/*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale - * and the table GPU array. - * - * If called with an already allocated table, it just re-uploads the - * table. - */ -static void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables, - NBParamGpu* nbp, - const DeviceContext& deviceContext) -{ - if (nbp->coulomb_tab != nullptr) - { - freeDeviceBuffer(&(nbp->coulomb_tab)); - } - - DeviceBuffer coulomb_tab; - - initParamLookupTable(&coulomb_tab, nullptr, tables.tableF.data(), tables.tableF.size(), deviceContext); - - nbp->coulomb_tab = coulomb_tab; - nbp->coulomb_tab_scale = tables.scale; -} - /*! \brief Initializes the atomdata structure first time, it only gets filled at pair-search. @@ -146,30 +124,6 @@ static void init_atomdata_first(cl_atomdata_t* ad, int ntypes, const DeviceConte ad->nalloc = -1; } -/*! \brief Copies all parameters related to the cut-off from ic to nbp - */ -static void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams) -{ - nbp->ewald_beta = ic->ewaldcoeff_q; - nbp->sh_ewald = ic->sh_ewald; - nbp->epsfac = ic->epsfac; - nbp->two_k_rf = 2.0 * ic->k_rf; - nbp->c_rf = ic->c_rf; - nbp->rvdw_sq = ic->rvdw * ic->rvdw; - nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb; - nbp->rlistOuter_sq = listParams.rlistOuter * listParams.rlistOuter; - nbp->rlistInner_sq = listParams.rlistInner * listParams.rlistInner; - nbp->useDynamicPruning = listParams.useDynamicPruning; - - nbp->sh_lj_ewald = ic->sh_lj_ewald; - nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj; - - nbp->rvdw_switch = ic->rvdw_switch; - nbp->dispersion_shift = ic->dispersion_shift; - nbp->repulsion_shift = ic->repulsion_shift; - nbp->vdw_switch = ic->vdw_switch; -} - /*! \brief Returns the kinds of electrostatics and Vdw OpenCL * kernels that will be used. * @@ -313,56 +267,6 @@ void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interacti init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_); } -/*! \brief Initializes the pair list data structure. - */ -static void init_plist(cl_plist_t* pl) -{ - /* initialize to nullptr pointers to data that is not allocated here and will - need reallocation in nbnxn_gpu_init_pairlist */ - pl->sci = nullptr; - pl->cj4 = nullptr; - pl->imask = nullptr; - pl->excl = nullptr; - - /* size -1 indicates that the respective array hasn't been initialized yet */ - pl->na_c = -1; - pl->nsci = -1; - pl->sci_nalloc = -1; - pl->ncj4 = -1; - pl->cj4_nalloc = -1; - pl->nimask = -1; - pl->imask_nalloc = -1; - pl->nexcl = -1; - pl->excl_nalloc = -1; - pl->haveFreshList = false; -} - -/*! \brief Initializes the timings data structure. - */ -static void init_timings(gmx_wallclock_gpu_nbnxn_t* t) -{ - int i, j; - - t->nb_h2d_t = 0.0; - t->nb_d2h_t = 0.0; - t->nb_c = 0; - t->pl_h2d_t = 0.0; - t->pl_h2d_c = 0; - for (i = 0; i < 2; i++) - { - for (j = 0; j < 2; j++) - { - t->ktime[i][j].t = 0.0; - t->ktime[i][j].c = 0; - } - } - - t->pruneTime.c = 0; - t->pruneTime.t = 0.0; - t->dynamicPruneTime.c = 0; - t->dynamicPruneTime.t = 0.0; -} - /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */ static cl_kernel nbnxn_gpu_create_kernel(NbnxmGpu* nb, const char* kernel_name) { @@ -583,7 +487,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte // which leads to the counter not being reset. bool bDoTime = (nb->bDoTime && !h_plist->sci.empty()); const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; - cl_plist_t* d_plist = nb->plist[iloc]; + gpu_plist* d_plist = nb->plist[iloc]; if (d_plist->na_c < 0) { @@ -593,7 +497,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte { if (d_plist->na_c != h_plist->na_ci) { - sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)", + sprintf(sbuf, "In init_plist: the #atoms per cell has changed (from %d to %d)", d_plist->na_c, h_plist->na_ci); gmx_incons(sbuf); } diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index a2f6913a90..886298a20e 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -199,11 +199,6 @@ typedef struct cl_nbparam_params } cl_nbparam_params_t; -/*! \internal - * \brief Pair list data. - */ -using cl_plist_t = Nbnxm::gpu_plist; - /** \internal * \brief Typedef of actual timer type. */ @@ -254,7 +249,7 @@ struct NbnxmGpu //! parameters required for the non-bonded calc. NBParamGpu* nbparam = nullptr; //! pair-list data structures (local and non-local) - gmx::EnumerationArray plist = { nullptr }; + gmx::EnumerationArray plist = { nullptr }; //! staging area where fshift/energies get downloaded nb_staging_t nbst; -- 2.11.4.GIT