From 2747fc481b7110a23b20ed8daca98ae76b21b015 Mon Sep 17 00:00:00 2001 From: Aleksei Iupinov Date: Tue, 30 May 2017 16:00:35 +0200 Subject: [PATCH] Add calls to the PME GPU stages This adds the inactive calls to PME GPU stages both for PP+PME and PME-only ranks. Ref #2054 Change-Id: I5af2ab95cedff422c39592255f01205d42fc7eb7 --- src/gromacs/ewald/pme-3dfft.cu | 2 +- src/gromacs/ewald/pme-gpu-internal.cpp | 25 ++-- src/gromacs/ewald/pme-gpu-internal.h | 36 +++--- src/gromacs/ewald/pme-gpu-types.h | 25 ++-- src/gromacs/ewald/pme-gpu.cpp | 207 +++++++++++++++++++++++++++++++-- src/gromacs/ewald/pme-internal.h | 1 + src/gromacs/ewald/pme-only.cpp | 36 ++++-- src/gromacs/ewald/pme-timings.cu | 29 ++++- src/gromacs/ewald/pme.cpp | 5 +- src/gromacs/ewald/pme.cu | 22 ++-- src/gromacs/ewald/pme.cuh | 3 + src/gromacs/ewald/pme.h | 84 +++++++++++-- src/gromacs/fft/fft.h | 6 +- src/gromacs/mdlib/force.cpp | 5 +- src/gromacs/mdlib/forcerec.cpp | 7 ++ src/gromacs/mdlib/sim_util.cpp | 69 +++++++++++ src/gromacs/mdtypes/forcerec.h | 7 +- src/gromacs/timing/wallcycle.cpp | 1 + src/gromacs/timing/wallcycle.h | 1 + src/programs/mdrun/runner.cpp | 8 +- 20 files changed, 486 insertions(+), 93 deletions(-) diff --git a/src/gromacs/ewald/pme-3dfft.cu b/src/gromacs/ewald/pme-3dfft.cu index 38f2e75cb0..d1a8799668 100644 --- a/src/gromacs/ewald/pme-3dfft.cu +++ b/src/gromacs/ewald/pme-3dfft.cu @@ -137,7 +137,7 @@ void GpuParallel3dFft::perform3dFft(gmx_fft_direction dir) } } -inline void pme_gpu_3dfft(const pme_gpu_t *pmeGPU, gmx_fft_direction dir, int grid_index) +void pme_gpu_3dfft(const pme_gpu_t *pmeGPU, gmx_fft_direction dir, int grid_index) { int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R; pme_gpu_start_timing(pmeGPU, timerId); diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp index 978ce32b26..0443017ca1 100644 --- a/src/gromacs/ewald/pme-gpu-internal.cpp +++ b/src/gromacs/ewald/pme-gpu-internal.cpp @@ -51,6 +51,7 @@ #include #include +#include "gromacs/ewald/ewald-utils.h" #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/math/invertmatrix.h" #include "gromacs/math/units.h" @@ -98,8 +99,10 @@ void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box) #if GMX_DOUBLE GMX_THROW(gmx::NotImplementedError("PME is implemented for single-precision only on GPU")); #else - matrix recipBox; - gmx::invertBoxMatrix(box, recipBox); + matrix scaledBox, recipBox; + pmeGPU->common->boxScaler->scaleBox(box, scaledBox); + gmx::invertBoxMatrix(scaledBox, recipBox); + /* The GPU recipBox is transposed as compared to the CPU recipBox. * Spread uses matrix columns (while solve and gather use rows). * There is no particular reason for this; it might be further rethought/optimized for better access patterns. @@ -114,15 +117,6 @@ void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box) #endif } -void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix box, const rvec *h_coordinates) -{ - pme_gpu_copy_input_coordinates(pmeGPU, h_coordinates); - if (needToUpdateBox) - { - pme_gpu_update_input_box(pmeGPU, box); - } -} - /*! \brief \libinternal * The PME GPU reinitialization function that is called both at the end of any MD step and on any load balancing step. * @@ -136,9 +130,6 @@ static void pme_gpu_reinit_step(const pme_gpu_t *pmeGPU) void pme_gpu_finish_step(const pme_gpu_t *pmeGPU, const bool bCalcF, const bool bCalcEnerVir) { - /* Needed for copy back as well as timing events */ - pme_gpu_synchronize(pmeGPU); - if (bCalcF && pme_gpu_performs_gather(pmeGPU)) { pme_gpu_sync_output_forces(pmeGPU); @@ -230,7 +221,8 @@ static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme) pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nnx, pme->nnx + cellCount * pme->nkx); pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nny, pme->nny + cellCount * pme->nky); pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz); - pmeGPU->common->runMode = pme->runMode; + pmeGPU->common->runMode = pme->runMode; + pmeGPU->common->boxScaler = pme->boxScaler; } /*! \brief \libinternal @@ -418,6 +410,9 @@ void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLog pme->gpu->settings.performGPUFFT = (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme_gpu_uses_dd(pme->gpu); pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU); + /* Reinit active timers */ + pme_gpu_reinit_timings(pme->gpu); + pme_gpu_reinit_grids(pme->gpu); pme_gpu_reinit_step(pme->gpu); } diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h index aa892bd2bc..9038bbca98 100644 --- a/src/gromacs/ewald/pme-gpu-internal.h +++ b/src/gromacs/ewald/pme-gpu-internal.h @@ -46,6 +46,7 @@ #ifndef GMX_EWALD_PME_GPU_INTERNAL_H #define GMX_EWALD_PME_GPU_INTERNAL_H +#include "gromacs/fft/fft.h" // for the gmx_fft_direction enum #include "gromacs/gpu_utils/gpu_macros.h" // for the CUDA_FUNC_ macros #include "pme-gpu-types.h" // for the inline functions accessing pme_gpu_t members @@ -425,12 +426,19 @@ CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUME /* Several CUDA event-based timing functions that live in pme-timings.cu */ /*! \libinternal \brief - * Finalizes all the PME GPU stage timings for the current step. Should be called at the end of every step. + * Finalizes all the active PME GPU stage timings for the current step. Should be called at the end of every step. * * \param[in] pmeGPU The PME GPU structure. */ CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +/*! \libinternal \brief + * Updates the internal list of active PME GPU stages (if timings are enabled). + * + * \param[in] pmeGPU The PME GPU data structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_reinit_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + /*! \brief * Resets the PME GPU timings. To be called at the reset step. * @@ -466,6 +474,17 @@ CUDA_FUNC_QUALIFIER void pme_gpu_spread(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeG bool CUDA_FUNC_ARGUMENT(spreadCharges)) CUDA_FUNC_TERM /*! \libinternal \brief + * 3D FFT R2C/C2R routine. + * + * \param[in] pmeGpu The PME GPU structure. + * \param[in] direction Transform direction (real-to-complex or complex-to-real) + * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu), + enum gmx_fft_direction CUDA_FUNC_ARGUMENT(direction), + const int CUDA_FUNC_ARGUMENT(gridIndex)) CUDA_FUNC_TERM + +/*! \libinternal \brief * A GPU Fourier space solving function. * * \param[in] pmeGpu The PME GPU structure. @@ -486,7 +505,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_solve(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGp * \param[in] overwriteForces True: h_forces are output-only. * False: h_forces are copied to the device and are reduced with the results of the force gathering. * TODO: determine efficiency/balance of host/device-side reductions. - * \param[in] h_grid The host-side grid buffer (used only in testing moe) + * \param[in] h_grid The host-side grid buffer (used only in testing mode) */ CUDA_FUNC_QUALIFIER void pme_gpu_gather(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu), float *CUDA_FUNC_ARGUMENT(h_forces), @@ -588,7 +607,7 @@ gmx_inline bool pme_gpu_is_testing(const pme_gpu_t *pmeGPU) void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix virial); /*! \libinternal \brief - * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_start_step(). + * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_step(). * * \param[in] pmeGPU The PME GPU structure. * \param[in] box The unit cell box. @@ -596,17 +615,6 @@ void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix vir void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box); /*! \libinternal \brief - * Starts the PME GPU step (copies coordinates onto GPU, possibly sets the unit cell parameters). - * - * \param[in] pmeGPU The PME GPU structure. - * \param[in] needToUpdateBox Tells if the stored unit cell parameters should be updated from \p box. - * \param[in] box The unit cell box which does not necessarily change every step (only with pressure coupling enabled). - * Would only be used if \p needToUpdateBox is true. - * \param[in] h_coordinates Input coordinates (XYZ rvec array). - */ -void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix box, const rvec *h_coordinates); - -/*! \libinternal \brief * Finishes the PME GPU step, waiting for the output forces and/or energy/virial to be copied to the host. * * \param[in] pmeGPU The PME GPU structure. diff --git a/src/gromacs/ewald/pme-gpu-types.h b/src/gromacs/ewald/pme-gpu-types.h index 6a79e7c8fb..1127d27038 100644 --- a/src/gromacs/ewald/pme-gpu-types.h +++ b/src/gromacs/ewald/pme-gpu-types.h @@ -249,7 +249,7 @@ struct pme_gpu_settings_t * Only intended to be used by the test framework. */ bool copyAllOutputs; - /*! \brief Various computation flags for the curent step, corresponding to the GMX_PME_ flags in pme.h. */ + /*! \brief Various computation flags for the current step, corresponding to the GMX_PME_ flags in pme.h. */ int stepFlags; }; @@ -290,21 +290,28 @@ struct pme_shared_t /*! \brief Padded grid dimensions - pmegrid_nx, pmegrid_ny, pmegrid_nz * TODO: find out if these are really needed for the CPU FFT compatibility. */ - int pmegrid_n[DIM]; + int pmegrid_n[DIM]; /*! \brief PME interpolation order */ - int pme_order; + int pme_order; /*! \brief Ewald splitting coefficient for Coulomb */ - real ewaldcoeff_q; + real ewaldcoeff_q; /*! \brief Electrostatics parameter */ - real epsilon_r; + real epsilon_r; /*! \brief Gridline indices - nnx, nny, nnz */ - std::vector nn; + std::vector nn; /*! \brief Fractional shifts - fshx, fshy, fshz */ - std::vector fsh; + std::vector fsh; /*! \brief Precomputed B-spline values */ - std::vector bsp_mod[DIM]; + std::vector bsp_mod[DIM]; /*! \brief The PME codepath being taken */ - PmeRunMode runMode; + PmeRunMode runMode; + /*! \brief The box scaler based on inputrec - created in pme_init and managed by CPU structure */ + class EwaldBoxZScaler *boxScaler; + /*! \brief The previous step box to know if we even need to update the current step box params. + * \todo Manage this on higher level. + * \todo Alternatively, when this structure is used by CPU PME code, make use of this field there as well. + */ + matrix previousBox; }; /*! \internal \brief diff --git a/src/gromacs/ewald/pme-gpu.cpp b/src/gromacs/ewald/pme-gpu.cpp index e24565adbf..f533f1be36 100644 --- a/src/gromacs/ewald/pme-gpu.cpp +++ b/src/gromacs/ewald/pme-gpu.cpp @@ -47,6 +47,8 @@ #include #include "gromacs/ewald/pme.h" +#include "gromacs/fft/parallel_3dfft.h" +#include "gromacs/math/invertmatrix.h" #include "gromacs/mdtypes/inputrec.h" #include "gromacs/utility/exceptions.h" #include "gromacs/utility/fatalerror.h" @@ -58,9 +60,10 @@ #include "pme-internal.h" #include "pme-solve.h" -bool pme_gpu_task_enabled(const gmx_pme_t *pme) +PmeRunMode pme_run_mode(const gmx_pme_t *pme) { - return (pme != nullptr) && (pme->runMode != PmeRunMode::CPU); + GMX_ASSERT(pme != nullptr, "Expecting valid PME data pointer"); + return pme->runMode; } bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error) @@ -127,25 +130,207 @@ void pme_gpu_get_timings(const gmx_pme_t *pme, gmx_wallclock_gpu_pme_t *timings) } } -void pme_gpu_get_results(const gmx_pme_t *pme, - gmx_wallcycle_t wcycle, - matrix vir_q, - real *energy_q) +/*! \brief + * A convenience wrapper for launching either the GPU or CPU FFT. + * + * \param[in] pme The PME structure. + * \param[in] gridIndex The grid index - should currently always be 0. + * \param[in] dir The FFT direction enum. + * \param[in] wcycle The wallclock counter. + */ +void inline parallel_3dfft_execute_gpu_wrapper(gmx_pme_t *pme, + const int gridIndex, + enum gmx_fft_direction dir, + gmx_wallcycle_t wcycle) +{ + GMX_ASSERT(gridIndex == 0, "Only single grid supported"); + if (pme_gpu_performs_FFT(pme->gpu)) + { + wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); + pme_gpu_3dfft(pme->gpu, dir, gridIndex); + wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_stop(wcycle, ewcLAUNCH_GPU); + } + else + { + wallcycle_start(wcycle, ewcPME_FFT); +#pragma omp parallel for num_threads(pme->nthread) schedule(static) + for (int thread = 0; thread < pme->nthread; thread++) + { + gmx_parallel_3dfft_execute(pme->pfft_setup[gridIndex], dir, thread, wcycle); + } + wallcycle_stop(wcycle, ewcPME_FFT); + } +} + +/* The PME step code split into a few separate functions. */ + +void pme_gpu_prepare_step(gmx_pme_t *pme, + bool needToUpdateBox, + const matrix box, + gmx_wallcycle_t wcycle, + int flags) +{ + GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); + GMX_ASSERT(pme->nnodes > 0, ""); + GMX_ASSERT(pme->nnodes == 1 || pme->ndecompdim > 0, ""); + + pme_gpu_t *pmeGpu = pme->gpu; + pmeGpu->settings.stepFlags = flags; + // TODO these flags are only here to honor the CPU PME code, and probably should be removed + + bool shouldUpdateBox = false; + for (int i = 0; i < DIM; ++i) + { + for (int j = 0; j <= i; ++j) + { + shouldUpdateBox |= (pmeGpu->common->previousBox[i][j] != box[i][j]); + pmeGpu->common->previousBox[i][j] = box[i][j]; + } + } + + if (needToUpdateBox || shouldUpdateBox) // || is to make the first step always update + { + wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); + pme_gpu_update_input_box(pmeGpu, box); + wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_stop(wcycle, ewcLAUNCH_GPU); + + if (!pme_gpu_performs_solve(pmeGpu)) + { + // TODO this has already been computed in pme->gpu + //memcpy(pme->recipbox, pme->gpu->common-> + gmx::invertBoxMatrix(box, pme->recipbox); + // FIXME verify that the box is scaled correctly on GPU codepath + pme->boxVolume = box[XX][XX] * box[YY][YY] * box[ZZ][ZZ]; + } + } +} + + +void pme_gpu_launch_spread(gmx_pme_t *pme, + const rvec *x, + gmx_wallcycle_t wcycle) +{ + GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); + + pme_gpu_t *pmeGpu = pme->gpu; + + // The only spot of PME GPU where LAUNCH_GPU (sub)counter increases call-count + wallcycle_start(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_PME); + pme_gpu_copy_input_coordinates(pmeGpu, x); + wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_stop(wcycle, ewcLAUNCH_GPU); + + const unsigned int gridIndex = 0; + real *fftgrid = pme->fftgrid[gridIndex]; + if (pmeGpu->settings.stepFlags & GMX_PME_SPREAD) + { + /* Spread the coefficients on a grid */ + const bool computeSplines = true; + const bool spreadCharges = true; + wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); + pme_gpu_spread(pmeGpu, gridIndex, fftgrid, computeSplines, spreadCharges); + wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_stop(wcycle, ewcLAUNCH_GPU); + } +} + +void pme_gpu_launch_complex_transforms(gmx_pme_t *pme, + gmx_wallcycle_t wcycle) +{ + pme_gpu_t *pmeGpu = pme->gpu; + const bool computeEnergyAndVirial = pmeGpu->settings.stepFlags & GMX_PME_CALC_ENER_VIR; + const bool performBackFFT = pmeGpu->settings.stepFlags & (GMX_PME_CALC_F | GMX_PME_CALC_POT); + const unsigned int gridIndex = 0; + t_complex *cfftgrid = pme->cfftgrid[gridIndex]; + + if (pmeGpu->settings.stepFlags & GMX_PME_SPREAD) + { + if (!pme_gpu_performs_FFT(pmeGpu)) + { + wallcycle_start(wcycle, ewcWAIT_GPU_PME_SPREAD); + pme_gpu_sync_spread_grid(pme->gpu); + wallcycle_stop(wcycle, ewcWAIT_GPU_PME_SPREAD); + } + } + + try + { + if (pmeGpu->settings.stepFlags & GMX_PME_SOLVE) + { + /* do R2C 3D-FFT */ + parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_REAL_TO_COMPLEX, wcycle); + + /* solve in k-space for our local cells */ + if (pme_gpu_performs_solve(pmeGpu)) + { + const auto gridOrdering = pme_gpu_uses_dd(pmeGpu) ? GridOrdering::YZX : GridOrdering::XYZ; + wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); //FIXME nocount + pme_gpu_solve(pmeGpu, cfftgrid, gridOrdering, computeEnergyAndVirial); + wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_stop(wcycle, ewcLAUNCH_GPU); + } + else + { + wallcycle_start(wcycle, ewcPME_SOLVE); +#pragma omp parallel for num_threads(pme->nthread) schedule(static) + for (int thread = 0; thread < pme->nthread; thread++) + { + solve_pme_yzx(pme, cfftgrid, pme->boxVolume, + computeEnergyAndVirial, pme->nthread, thread); + } + wallcycle_stop(wcycle, ewcPME_SOLVE); + } + } + + if (performBackFFT) + { + parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle); + } + } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; +} + +void pme_gpu_launch_gather(const gmx_pme_t *pme, + gmx_wallcycle_t gmx_unused wcycle, + rvec *forces, + bool overwriteForces) +{ + GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); + + if (!pme_gpu_performs_gather(pme->gpu)) + { + return; + } + + wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); + const unsigned int gridIndex = 0; + real *fftgrid = pme->fftgrid[gridIndex]; + pme_gpu_gather(pme->gpu, reinterpret_cast(forces), overwriteForces, reinterpret_cast(fftgrid)); + wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_stop(wcycle, ewcLAUNCH_GPU); +} + +void pme_gpu_wait_for_gpu(const gmx_pme_t *pme, + gmx_wallcycle_t wcycle, + matrix vir_q, + real *energy_q) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); const bool haveComputedEnergyAndVirial = pme->gpu->settings.stepFlags & GMX_PME_CALC_ENER_VIR; const bool haveComputedForces = pme->gpu->settings.stepFlags & GMX_PME_CALC_F; - // The wallcycle hierarchy is messy - we pause ewcFORCE just to exclude the PME GPU sync time - wallcycle_stop(wcycle, ewcFORCE); - wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER); pme_gpu_finish_step(pme->gpu, haveComputedForces, haveComputedEnergyAndVirial); wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER); - wallcycle_start_nocount(wcycle, ewcFORCE); - if (haveComputedEnergyAndVirial) { if (pme->doCoulomb) diff --git a/src/gromacs/ewald/pme-internal.h b/src/gromacs/ewald/pme-internal.h index a9d621ab85..4a3c126852 100644 --- a/src/gromacs/ewald/pme-internal.h +++ b/src/gromacs/ewald/pme-internal.h @@ -319,6 +319,7 @@ struct gmx_pme_t { pme_atomcomm_t atc[2]; /* Indexed on decomposition index */ matrix recipbox; + real boxVolume; splinevec bsp_mod; /* Buffers to store data for local atoms for L-B combination rule * calculations in LJ-PME. lb_buf1 stores either the coefficients diff --git a/src/gromacs/ewald/pme-only.cpp b/src/gromacs/ewald/pme-only.cpp index f9629100d9..6e46e6c984 100644 --- a/src/gromacs/ewald/pme-only.cpp +++ b/src/gromacs/ewald/pme-only.cpp @@ -527,7 +527,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme, t_commrec *cr, t_nrnb *mynrnb, gmx_wallcycle_t wcycle, gmx_walltime_accounting_t walltime_accounting, - t_inputrec *ir) + t_inputrec *ir, PmeRunMode runMode) { int npmedata; struct gmx_pme_t **pmedata; @@ -613,19 +613,37 @@ int gmx_pmeonly(struct gmx_pme_t *pme, dvdlambda_lj = 0; clear_mat(vir_q); clear_mat(vir_lj); + energy_q = 0; + energy_lj = 0; // TODO Make a struct of array refs onto these per-atom fields // of pme_pp (maybe box, energy and virial, too; and likewise // from mdatoms for the other call to gmx_pme_do), so we have // fewer lines of code and less parameter passing. - gmx_pme_do(pme, 0, natoms, pme_pp->x, pme_pp->f, - pme_pp->chargeA, pme_pp->chargeB, - pme_pp->sqrt_c6A, pme_pp->sqrt_c6B, - pme_pp->sigmaA, pme_pp->sigmaB, box, - cr, maxshift_x, maxshift_y, mynrnb, wcycle, - vir_q, vir_lj, - &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj, - GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0)); + const int pmeFlags = GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0); + if (runMode != PmeRunMode::CPU) + { + const bool boxChanged = true; + //TODO this should be set properly by gmx_pme_recv_coeffs_coords, + // or maybe use inputrecDynamicBox(ir), at the very least - change this when this codepath is tested! + constexpr bool pmeGpuAccumulateInputForces = false; + pme_gpu_prepare_step(pme, boxChanged, box, wcycle, pmeFlags); + pme_gpu_launch_spread(pme, pme_pp->x, wcycle); + pme_gpu_launch_complex_transforms(pme, wcycle); + pme_gpu_launch_gather(pme, wcycle, pme_pp->f, !pmeGpuAccumulateInputForces); + pme_gpu_wait_for_gpu(pme, wcycle, vir_q, &energy_q); + } + else + { + gmx_pme_do(pme, 0, natoms, pme_pp->x, pme_pp->f, + pme_pp->chargeA, pme_pp->chargeB, + pme_pp->sqrt_c6A, pme_pp->sqrt_c6B, + pme_pp->sigmaA, pme_pp->sigmaB, box, + cr, maxshift_x, maxshift_y, mynrnb, wcycle, + vir_q, vir_lj, + &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj, + pmeFlags); + } cycles = wallcycle_stop(wcycle, ewcPMEMESH); diff --git a/src/gromacs/ewald/pme-timings.cu b/src/gromacs/ewald/pme-timings.cu index 9fbe39dde9..09d6d18337 100644 --- a/src/gromacs/ewald/pme-timings.cu +++ b/src/gromacs/ewald/pme-timings.cu @@ -93,9 +93,34 @@ void pme_gpu_update_timings(const pme_gpu_t *pmeGPU) { if (pme_gpu_timings_enabled(pmeGPU)) { - for (size_t i = 0; i < pmeGPU->archSpecific->timingEvents.size(); i++) + pme_gpu_synchronize(pmeGPU); + + for (const size_t &activeTimer : pmeGPU->archSpecific->activeTimers) + { + pmeGPU->archSpecific->timingEvents[activeTimer].getLastRangeTime(); + } + } +} + +void pme_gpu_reinit_timings(const pme_gpu_t *pmeGPU) +{ + if (pme_gpu_timings_enabled(pmeGPU)) + { + pmeGPU->archSpecific->activeTimers.clear(); + pmeGPU->archSpecific->activeTimers.insert(gtPME_SPLINEANDSPREAD); + // TODO: no separate gtPME_SPLINE and gtPME_SPREAD as they are not used currently + if (pme_gpu_performs_FFT(pmeGPU)) + { + pmeGPU->archSpecific->activeTimers.insert(gtPME_FFT_C2R); + pmeGPU->archSpecific->activeTimers.insert(gtPME_FFT_R2C); + } + if (pme_gpu_performs_solve(pmeGPU)) + { + pmeGPU->archSpecific->activeTimers.insert(gtPME_SOLVE); + } + if (pme_gpu_performs_gather(pmeGPU)) { - pmeGPU->archSpecific->timingEvents[i].getLastRangeTime(); + pmeGPU->archSpecific->activeTimers.insert(gtPME_GATHER); } } } diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp index cdd0a41f3d..8c02dcbff5 100644 --- a/src/gromacs/ewald/pme.cpp +++ b/src/gromacs/ewald/pme.cpp @@ -1751,7 +1751,10 @@ void gmx_pme_destroy(gmx_pme_t *pme) sfree(pme->bufv); sfree(pme->bufr); - pme_free_all_work(&pme->solve_work, pme->nthread); + if (pme->solve_work) + { + pme_free_all_work(&pme->solve_work, pme->nthread); + } sfree(pme->sum_qgrid_tmp); sfree(pme->sum_qgrid_dd_tmp); diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu index 8cf47ab878..14bef5ce5d 100644 --- a/src/gromacs/ewald/pme.cu +++ b/src/gromacs/ewald/pme.cu @@ -170,8 +170,7 @@ void pme_gpu_copy_output_forces(const pme_gpu_t *pmeGPU, float *h_forces) void pme_gpu_sync_output_forces(const pme_gpu_t *pmeGPU) { - cudaStream_t s = pmeGPU->archSpecific->pmeStream; - cudaError_t stat = cudaStreamWaitEvent(s, pmeGPU->archSpecific->syncForcesD2H, 0); + cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncForcesD2H); CU_RET_ERR(stat, "Error while waiting for the PME GPU forces"); } @@ -384,7 +383,7 @@ void pme_gpu_free_fract_shifts(const pme_gpu_t *pmeGPU) void pme_gpu_sync_output_energy_virial(const pme_gpu_t *pmeGPU) { - cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncEnerVirD2H, 0); + cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncEnerVirD2H); CU_RET_ERR(stat, "Error while waiting for PME solve output"); for (int j = 0; j < c_virialAndEnergyCount; j++) @@ -447,19 +446,19 @@ void pme_gpu_copy_input_gather_atom_data(const pme_gpu_t *pmeGpu) void pme_gpu_sync_spread_grid(const pme_gpu_t *pmeGPU) { - cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSpreadGridD2H, 0); + cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSpreadGridD2H); CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host"); } void pme_gpu_sync_spline_atom_data(const pme_gpu_t *pmeGPU) { - cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSplineAtomDataD2H, 0); + cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSplineAtomDataD2H); CU_RET_ERR(stat, "Error while waiting for the PME GPU atom data to be copied to the host"); } void pme_gpu_sync_solve_grid(const pme_gpu_t *pmeGPU) { - cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSolveGridD2H, 0); + cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSolveGridD2H); CU_RET_ERR(stat, "Error while waiting for the PME GPU solve grid to be copied to the host"); //should check for pme_gpu_performs_solve(pmeGPU) } @@ -503,15 +502,16 @@ void pme_gpu_destroy_specific(const pme_gpu_t *pmeGPU) void pme_gpu_init_sync_events(const pme_gpu_t *pmeGPU) { cudaError_t stat; - stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncEnerVirD2H, cudaEventDisableTiming); + const auto eventFlags = cudaEventDisableTiming; + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncEnerVirD2H, eventFlags); CU_RET_ERR(stat, "cudaEventCreate on syncEnerVirD2H failed"); - stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncForcesD2H, cudaEventDisableTiming); + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncForcesD2H, eventFlags); CU_RET_ERR(stat, "cudaEventCreate on syncForcesD2H failed"); - stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSpreadGridD2H, cudaEventDisableTiming); + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSpreadGridD2H, eventFlags); CU_RET_ERR(stat, "cudaEventCreate on syncSpreadGridD2H failed"); - stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSplineAtomDataD2H, cudaEventDisableTiming); + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSplineAtomDataD2H, eventFlags); CU_RET_ERR(stat, "cudaEventCreate on syncSplineAtomDataD2H failed"); - stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSolveGridD2H, cudaEventDisableTiming); + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSolveGridD2H, eventFlags); CU_RET_ERR(stat, "cudaEventCreate on syncSolveGridD2H failed"); } diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh index 949da626d8..71fb758a1c 100644 --- a/src/gromacs/ewald/pme.cuh +++ b/src/gromacs/ewald/pme.cuh @@ -48,6 +48,7 @@ #include #include +#include #include "gromacs/gpu_utils/cuda_arch_utils.cuh" // for warp_size @@ -172,6 +173,8 @@ struct pme_gpu_cuda_t std::array timingEvents; + std::set activeTimers; // indices into timingEvents + /* GPU arrays element counts (not the arrays sizes in bytes!). * They might be larger than the actual meaningful data sizes. * These are paired: the actual element count + the maximum element count that can fit in the current allocated memory. diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index b75926bbed..781dc72a0c 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -152,7 +152,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme, struct t_commrec *cr, t_nrnb *mynrnb, gmx_wallcycle_t wcycle, gmx_walltime_accounting_t walltime_accounting, - t_inputrec *ir); + t_inputrec *ir, PmeRunMode runMode); /*! \brief Calculate the PME grid energy V for n charges. * @@ -218,15 +218,26 @@ void gmx_pme_reinit_atoms(const gmx_pme_t *pme, const int nAtoms, const real *ch bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error); /*! \brief + * Returns the active PME codepath (CPU, GPU, mixed). + * \todo This is a rather static data that should be managed by the higher level task scheduler. + * + * \param[in] pme The PME data structure. + * \returns active PME codepath. + */ +PmeRunMode pme_run_mode(const gmx_pme_t *pme); + +/*! \brief * Tells if PME is enabled to run on GPU (not necessarily active at the moment). - * For now, this decision is stored in the PME structure itself. - * FIXME: this is an information that should be managed by the task scheduler. - * As soon as such functionality appears, this function should be removed from this module. + * \todo This is a rather static data that should be managed by the hardware assignment manager. + * For now, it is synonymous with the active PME codepath (in the absence of dynamic switching). * * \param[in] pme The PME data structure. * \returns true if PME can run on GPU, false otherwise. */ -bool pme_gpu_task_enabled(const gmx_pme_t *pme); +inline bool pme_gpu_task_enabled(const gmx_pme_t *pme) +{ + return (pme != nullptr) && (pme_run_mode(pme) != PmeRunMode::CPU); +} /*! \brief * Resets the PME GPU timings. To be called at the reset step. @@ -247,17 +258,68 @@ void pme_gpu_get_timings(const gmx_pme_t *pme, /* The main PME GPU functions */ /*! \brief - * Gets the output forces and virial/energy if corresponding flags are (were?) passed in. + * Prepares PME on GPU step (updating the box if needed) + * \param[in] pme The PME data structure. + * \param[in] needToUpdateBox Tells if the stored unit cell parameters should be updated from \p box. + * \param[in] box The unit cell box. + * \param[in] wcycle The wallclock counter. + * \param[in] flags The combination of flags to affect the PME computation on this step. + * The flags are the GMX_PME_ flags from pme.h. + */ +void pme_gpu_prepare_step(gmx_pme_t *pme, + bool needToUpdateBox, + const matrix box, + gmx_wallcycle_t wcycle, + int flags); + +/*! \brief + * Launches first stage of PME on GPU - H2D input transfers, spreading kernel, and D2H grid transfer if needed. + * + * \param[in] pme The PME data structure. + * \param[in] x The array of local atoms' coordinates. + * \param[in] wcycle The wallclock counter. + */ +void pme_gpu_launch_spread(gmx_pme_t *pme, + const rvec *x, + gmx_wallcycle_t wcycle); + +/*! \brief + * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode. + * + * \param[in] pme The PME data structure. + * \param[in] wcycle The wallclock counter. + */ +void pme_gpu_launch_complex_transforms(gmx_pme_t *pme, + gmx_wallcycle_t wcycle); + +/*! \brief + * Launches last stage of PME on GPU - force gathering. + * + * \param[in] pme The PME data structure. + * \param[in] wcycle The wallclock counter. + * \param[in,out] forces The array of local atoms' resulting forces. + * \param[in] overwriteForces The boolean which tells whether the gathering kernel stores + * the output reciprocal forces into the host array (true), + * or copies its contents to the GPU first, and accumulates + * (false). The reduction is non-atomic. + */ +void pme_gpu_launch_gather(const gmx_pme_t *pme, + gmx_wallcycle_t wcycle, + rvec *forces, + bool overwriteForces); + +/*! \brief + * Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy + * (if they were to be computed). * * \param[in] pme The PME data structure. * \param[in] wcycle The wallclock counter. * \param[out] vir_q The output virial matrix. * \param[out] energy_q The output energy. */ -void pme_gpu_get_results(const gmx_pme_t *pme, - gmx_wallcycle_t wcycle, - matrix vir_q, - real *energy_q); - +void pme_gpu_wait_for_gpu(const gmx_pme_t *pme, + gmx_wallcycle_t wcycle, + matrix vir_q, + real *energy_q); #endif diff --git a/src/gromacs/fft/fft.h b/src/gromacs/fft/fft.h index fe75131365..be54a756aa 100644 --- a/src/gromacs/fft/fft.h +++ b/src/gromacs/fft/fft.h @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 1991-2003 David van der Spoel, Erik Lindahl, University of Groningen. - * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015,2017, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -93,13 +93,13 @@ typedef struct gmx_fft * * GMX_FFT_FORWARD or GMX_FFT_BACKWARD, and for real-complex transforms you * can only use GMX_FFT_REAL_TO_COMPLEX or GMX_FFT_COMPLEX_TO_REAL. */ -typedef enum gmx_fft_direction +enum gmx_fft_direction { GMX_FFT_FORWARD, /**< Forward complex-to-complex transform */ GMX_FFT_BACKWARD, /**< Backward complex-to-complex transform */ GMX_FFT_REAL_TO_COMPLEX, /**< Real-to-complex valued FFT */ GMX_FFT_COMPLEX_TO_REAL /**< Complex-to-real valued FFT */ -} gmx_fft_direction; +}; /*! \brief Specifier for FFT flags. * diff --git a/src/gromacs/mdlib/force.cpp b/src/gromacs/mdlib/force.cpp index 474293ab5f..4af67d3c72 100644 --- a/src/gromacs/mdlib/force.cpp +++ b/src/gromacs/mdlib/force.cpp @@ -153,7 +153,7 @@ void do_force_lowlevel(t_forcerec *fr, t_inputrec *ir, t_blocka *excl, rvec mu_tot[], int flags, - float *cycles_pme) + float *cycles_pme) { int i, j; int donb_flags; @@ -458,7 +458,7 @@ void do_force_lowlevel(t_forcerec *fr, t_inputrec *ir, ewaldOutput.vir_q); } - if ((EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) && (cr->duty & DUTY_PME)) + if ((EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) && (cr->duty & DUTY_PME) && (pme_run_mode(fr->pmedata) == PmeRunMode::CPU)) { /* Do reciprocal PME for Coulomb and/or LJ. */ assert(fr->n_tpi >= 0); @@ -512,6 +512,7 @@ void do_force_lowlevel(t_forcerec *fr, t_inputrec *ir, { gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status); } + /* We should try to do as little computation after * this as possible, because parallel PME synchronizes * the nodes, so we want all load imbalance of the diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index 14f47a4037..b462cbde01 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -1514,6 +1514,11 @@ void forcerec_set_ranges(t_forcerec *fr, { fr->forceBufferForDirectVirialContributions->resize(natoms_f_novirsum); } + + if (fr->ic->cutoff_scheme == ecutsVERLET) + { + fr->forceBufferIntermediate->resize(ncg_home); + } } static real cutoff_inf(real cutoff) @@ -2789,6 +2794,8 @@ void init_forcerec(FILE *fp, fr->forceBufferForDirectVirialContributions = new std::vector; } + fr->forceBufferIntermediate = new std::vector; //TODO add proper conditionals + if (fr->cutoff_scheme == ecutsGROUP && ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr)) { diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index ce2a476b36..29829debb1 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -223,6 +223,18 @@ static void sum_forces(rvec f[], gmx::ArrayRef forceToAdd) } } +static void pme_gpu_reduce_outputs(ForceWithVirial *forceWithVirial, + ArrayRef pmeForces, + gmx_enerdata_t *enerd, + const tensor vir_Q, + real Vlr_q) +{ + GMX_ASSERT(forceWithVirial, "Invalid force pointer"); + forceWithVirial->addVirialContribution(vir_Q); + enerd->term[F_COUL_RECIP] += Vlr_q; + sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces); +} + static void calc_virial(int start, int homenr, rvec x[], rvec f[], tensor vir_part, t_graph *graph, matrix box, t_nrnb *nrnb, const t_forcerec *fr, int ePBC) @@ -823,6 +835,14 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, bUseGPU = fr->nbv->bUseGPU; bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes); + const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU; + // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code! + const bool useGpuPme = EEL_PME(fr->ic->eeltype) && (cr->duty & DUTY_PME) && + ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Hybrid)); + // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code! + const ArrayRef pmeGpuForces = *fr->forceBufferIntermediate; + constexpr bool pmeGpuAccumulateInputForces = false; + /* At a search step we need to start the first balancing region * somewhere early inside the step after communication during domain * decomposition (and not during the previous step as usual). @@ -909,6 +929,37 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, } #endif /* GMX_MPI */ + /* Launching PME on GPU */ + if (useGpuPme) + { + assert(fr->n_tpi >= 0); + if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED)) // TODO consolidate these checks + { + int pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE; + if (flags & GMX_FORCE_FORCES) + { + pme_flags |= GMX_PME_CALC_F; + } + if (flags & GMX_FORCE_VIRIAL) + { + pme_flags |= GMX_PME_CALC_ENER_VIR; + } + if (fr->n_tpi > 0) + { + /* We don't calculate f, but we do want the potential */ + pme_flags |= GMX_PME_CALC_POT; + } + + pme_gpu_prepare_step(fr->pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pme_flags); + pme_gpu_launch_spread(fr->pmedata, x, wcycle); + if (pmeRunMode == PmeRunMode::GPU) + { + pme_gpu_launch_complex_transforms(fr->pmedata, wcycle); + pme_gpu_launch_gather(fr->pmedata, wcycle, as_rvec_array(pmeGpuForces.data()), !pmeGpuAccumulateInputForces); + } + } + } + /* do gridding for pair search */ if (bNS) { @@ -1021,6 +1072,14 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, wallcycle_stop(wcycle, ewcLAUNCH_GPU); } + if (pmeRunMode == PmeRunMode::Hybrid) + { + // PME GPU - intermediate CPU work in mixed mode + // TODO - try moving this below till after do_force_lowlevel() / special forces? + pme_gpu_launch_complex_transforms(fr->pmedata, wcycle); + pme_gpu_launch_gather(fr->pmedata, wcycle, as_rvec_array(pmeGpuForces.data()), !pmeGpuAccumulateInputForces); + } + /* Communicate coordinates and sum dipole if necessary + do non-local pair search */ if (DOMAINDECOMP(cr)) @@ -1333,6 +1392,16 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, } } + if (useGpuPme) + { + matrix vir_Q; + real Vlr_q; + pme_gpu_wait_for_gpu(fr->pmedata, wcycle, vir_Q, &Vlr_q); + + pme_gpu_reduce_outputs(&forceWithVirial, pmeGpuForces, + enerd, vir_Q, Vlr_q); + } + if (bUseOrEmulGPU) { /* wait for local forces (or calculate in emulation mode) */ diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h index ce3b8b341f..cea244b21c 100644 --- a/src/gromacs/mdtypes/forcerec.h +++ b/src/gromacs/mdtypes/forcerec.h @@ -292,9 +292,14 @@ struct t_forcerec { gmx_bool haveDirectVirialContributions; #ifdef __cplusplus /* TODO: Replace the pointer by an object once we got rid of C */ - std::vector *forceBufferForDirectVirialContributions; + std::vector *forceBufferForDirectVirialContributions; + /* This buffer is currently only used for storing the PME GPU output until reduction. + * TODO: Pagelock/pin it + * TODO: Replace the pointer by an object once we got rid of C */ + std::vector *forceBufferIntermediate; #else void *forceBufferForDirectVirialContributions_dummy; + void *forceBufferIntermediate_dummy; #endif /* Data for PPPM/PME/Ewald */ diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp index 2ad431a09d..5968d40430 100644 --- a/src/gromacs/timing/wallcycle.cpp +++ b/src/gromacs/timing/wallcycle.cpp @@ -123,6 +123,7 @@ static const char *wcsn[ewcsNR] = "Nonbonded pruning", "Nonbonded F", "Launch NB GPU tasks", + "Launch PME GPU tasks", "Ewald F correction", "NB X buffer ops.", "NB F buffer ops.", diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h index 01cec41b8f..620503538d 100644 --- a/src/gromacs/timing/wallcycle.h +++ b/src/gromacs/timing/wallcycle.h @@ -70,6 +70,7 @@ enum { ewcsNONBONDED_PRUNING, ewcsNONBONDED, ewcsLAUNCH_GPU_NONBONDED, + ewcsLAUNCH_GPU_PME, ewcsEWALD_CORRECTION, ewcsNB_X_BUF_OPS, ewcsNB_F_BUF_OPS, diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp index 70a3eb7d53..79231d1a89 100644 --- a/src/programs/mdrun/runner.cpp +++ b/src/programs/mdrun/runner.cpp @@ -482,6 +482,9 @@ int Mdrunner::mdrunner() bool tryUsePhysicalGpu = (strncmp(nbpu_opt, "auto", 4) == 0) && hw_opt.gpuIdTaskAssignment.empty() && (emulateGpuNonbonded == EmulateGpuNonbonded::No); GMX_RELEASE_ASSERT(!(forceUsePhysicalGpu && tryUsePhysicalGpu), "Must either force use of " "GPUs for short-ranged interactions, or try to use them, not both."); + const PmeRunMode pmeRunMode = PmeRunMode::CPU; + //TODO this is a placeholder as PME on GPU is not permitted yet + //TODO should there exist a PmeRunMode::None value for consistency? // Here we assume that SIMMASTER(cr) does not change even after the // threads are started. @@ -1077,13 +1080,12 @@ int Mdrunner::mdrunner() try { gmx_device_info_t *pmeGpuInfo = nullptr; - auto runMode = PmeRunMode::CPU; status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec, mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed, mdrunOptions.reproducible, ewaldcoeff_q, ewaldcoeff_lj, nthreads_pme, - runMode, nullptr, pmeGpuInfo, mdlog); + pmeRunMode, nullptr, pmeGpuInfo, mdlog); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; if (status != 0) @@ -1174,7 +1176,7 @@ int Mdrunner::mdrunner() GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP"); /* do PME only */ walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME)); - gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, inputrec); + gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode); } wallcycle_stop(wcycle, ewcRUN); -- 2.11.4.GIT