From 2747fc481b7110a23b20ed8daca98ae76b21b015 Mon Sep 17 00:00:00 2001
From: Aleksei Iupinov <a.yupinov@gmail.com>
Date: Tue, 30 May 2017 16:00:35 +0200
Subject: [PATCH] Add calls to the PME GPU stages

This adds the inactive calls to PME GPU stages both for PP+PME
and PME-only ranks.

Ref #2054

Change-Id: I5af2ab95cedff422c39592255f01205d42fc7eb7
---
 src/gromacs/ewald/pme-3dfft.cu         |   2 +-
 src/gromacs/ewald/pme-gpu-internal.cpp |  25 ++--
 src/gromacs/ewald/pme-gpu-internal.h   |  36 +++---
 src/gromacs/ewald/pme-gpu-types.h      |  25 ++--
 src/gromacs/ewald/pme-gpu.cpp          | 207 +++++++++++++++++++++++++++++++--
 src/gromacs/ewald/pme-internal.h       |   1 +
 src/gromacs/ewald/pme-only.cpp         |  36 ++++--
 src/gromacs/ewald/pme-timings.cu       |  29 ++++-
 src/gromacs/ewald/pme.cpp              |   5 +-
 src/gromacs/ewald/pme.cu               |  22 ++--
 src/gromacs/ewald/pme.cuh              |   3 +
 src/gromacs/ewald/pme.h                |  84 +++++++++++--
 src/gromacs/fft/fft.h                  |   6 +-
 src/gromacs/mdlib/force.cpp            |   5 +-
 src/gromacs/mdlib/forcerec.cpp         |   7 ++
 src/gromacs/mdlib/sim_util.cpp         |  69 +++++++++++
 src/gromacs/mdtypes/forcerec.h         |   7 +-
 src/gromacs/timing/wallcycle.cpp       |   1 +
 src/gromacs/timing/wallcycle.h         |   1 +
 src/programs/mdrun/runner.cpp          |   8 +-
 20 files changed, 486 insertions(+), 93 deletions(-)
diff --git a/src/gromacs/ewald/pme-3dfft.cu b/src/gromacs/ewald/pme-3dfft.cu
index 38f2e75cb0..d1a8799668 100644
--- a/src/gromacs/ewald/pme-3dfft.cu
+++ b/src/gromacs/ewald/pme-3dfft.cu
@@ -137,7 +137,7 @@ void GpuParallel3dFft::perform3dFft(gmx_fft_direction dir)
     }
 }
 
-inline void pme_gpu_3dfft(const pme_gpu_t *pmeGPU, gmx_fft_direction dir, int grid_index)
+void pme_gpu_3dfft(const pme_gpu_t *pmeGPU, gmx_fft_direction dir, int grid_index)
 {
     int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;
     pme_gpu_start_timing(pmeGPU, timerId);
diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp
index 978ce32b26..0443017ca1 100644
--- a/src/gromacs/ewald/pme-gpu-internal.cpp
+++ b/src/gromacs/ewald/pme-gpu-internal.cpp
@@ -51,6 +51,7 @@
 #include <list>
 #include <string>
 
+#include "gromacs/ewald/ewald-utils.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/invertmatrix.h"
 #include "gromacs/math/units.h"
@@ -98,8 +99,10 @@ void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box)
 #if GMX_DOUBLE
     GMX_THROW(gmx::NotImplementedError("PME is implemented for single-precision only on GPU"));
 #else
-    matrix recipBox;
-    gmx::invertBoxMatrix(box, recipBox);
+    matrix scaledBox, recipBox;
+    pmeGPU->common->boxScaler->scaleBox(box, scaledBox);
+    gmx::invertBoxMatrix(scaledBox, recipBox);
+
     /* The GPU recipBox is transposed as compared to the CPU recipBox.
      * Spread uses matrix columns (while solve and gather use rows).
      * There is no particular reason for this; it might be further rethought/optimized for better access patterns.
@@ -114,15 +117,6 @@ void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box)
 #endif
 }
 
-void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix box, const rvec *h_coordinates)
-{
-    pme_gpu_copy_input_coordinates(pmeGPU, h_coordinates);
-    if (needToUpdateBox)
-    {
-        pme_gpu_update_input_box(pmeGPU, box);
-    }
-}
-
 /*! \brief \libinternal
  * The PME GPU reinitialization function that is called both at the end of any MD step and on any load balancing step.
  *
@@ -136,9 +130,6 @@ static void pme_gpu_reinit_step(const pme_gpu_t *pmeGPU)
 
 void pme_gpu_finish_step(const pme_gpu_t *pmeGPU, const bool bCalcF, const bool bCalcEnerVir)
 {
-    /* Needed for copy back as well as timing events */
-    pme_gpu_synchronize(pmeGPU);
-
     if (bCalcF && pme_gpu_performs_gather(pmeGPU))
     {
         pme_gpu_sync_output_forces(pmeGPU);
@@ -230,7 +221,8 @@ static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme)
     pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nnx, pme->nnx + cellCount * pme->nkx);
     pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nny, pme->nny + cellCount * pme->nky);
     pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz);
-    pmeGPU->common->runMode = pme->runMode;
+    pmeGPU->common->runMode   = pme->runMode;
+    pmeGPU->common->boxScaler = pme->boxScaler;
 }
 
 /*! \brief \libinternal
@@ -418,6 +410,9 @@ void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLog
     pme->gpu->settings.performGPUFFT   = (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme_gpu_uses_dd(pme->gpu);
     pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU);
 
+    /* Reinit active timers */
+    pme_gpu_reinit_timings(pme->gpu);
+
     pme_gpu_reinit_grids(pme->gpu);
     pme_gpu_reinit_step(pme->gpu);
 }
diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h
index aa892bd2bc..9038bbca98 100644
--- a/src/gromacs/ewald/pme-gpu-internal.h
+++ b/src/gromacs/ewald/pme-gpu-internal.h
@@ -46,6 +46,7 @@
 #ifndef GMX_EWALD_PME_GPU_INTERNAL_H
 #define GMX_EWALD_PME_GPU_INTERNAL_H
 
+#include "gromacs/fft/fft.h"                   // for the gmx_fft_direction enum
 #include "gromacs/gpu_utils/gpu_macros.h"      // for the CUDA_FUNC_ macros
 
 #include "pme-gpu-types.h"                     // for the inline functions accessing pme_gpu_t members
@@ -425,12 +426,19 @@ CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUME
 /* Several CUDA event-based timing functions that live in pme-timings.cu */
 
 /*! \libinternal \brief
- * Finalizes all the PME GPU stage timings for the current step. Should be called at the end of every step.
+ * Finalizes all the active PME GPU stage timings for the current step. Should be called at the end of every step.
  *
  * \param[in] pmeGPU         The PME GPU structure.
  */
 CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 
+/*! \libinternal \brief
+ * Updates the internal list of active PME GPU stages (if timings are enabled).
+ *
+ * \param[in] pmeGPU         The PME GPU data structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_reinit_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
 /*! \brief
  * Resets the PME GPU timings. To be called at the reset step.
  *
@@ -466,6 +474,17 @@ CUDA_FUNC_QUALIFIER void pme_gpu_spread(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeG
                                         bool             CUDA_FUNC_ARGUMENT(spreadCharges)) CUDA_FUNC_TERM
 
 /*! \libinternal \brief
+ * 3D FFT R2C/C2R routine.
+ *
+ * \param[in]  pmeGpu          The PME GPU structure.
+ * \param[in]  direction       Transform direction (real-to-complex or complex-to-real)
+ * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_3dfft(const pme_gpu_t       *CUDA_FUNC_ARGUMENT(pmeGpu),
+                                       enum gmx_fft_direction CUDA_FUNC_ARGUMENT(direction),
+                                       const int              CUDA_FUNC_ARGUMENT(gridIndex)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
  * A GPU Fourier space solving function.
  *
  * \param[in]     pmeGpu                  The PME GPU structure.
@@ -486,7 +505,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_solve(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGp
  * \param[in]     overwriteForces  True: h_forces are output-only.
  *                                 False: h_forces are copied to the device and are reduced with the results of the force gathering.
  *                                 TODO: determine efficiency/balance of host/device-side reductions.
- * \param[in]     h_grid           The host-side grid buffer (used only in testing moe)
+ * \param[in]     h_grid           The host-side grid buffer (used only in testing mode)
  */
 CUDA_FUNC_QUALIFIER void pme_gpu_gather(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu),
                                         float           *CUDA_FUNC_ARGUMENT(h_forces),
@@ -588,7 +607,7 @@ gmx_inline bool pme_gpu_is_testing(const pme_gpu_t *pmeGPU)
 void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix virial);
 
 /*! \libinternal \brief
- * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_start_step().
+ * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_step().
  *
  * \param[in] pmeGPU         The PME GPU structure.
  * \param[in] box            The unit cell box.
@@ -596,17 +615,6 @@ void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix vir
 void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box);
 
 /*! \libinternal \brief
- * Starts the PME GPU step (copies coordinates onto GPU, possibly sets the unit cell parameters).
- *
- * \param[in] pmeGPU           The PME GPU structure.
- * \param[in] needToUpdateBox  Tells if the stored unit cell parameters should be updated from \p box.
- * \param[in] box              The unit cell box which does not necessarily change every step (only with pressure coupling enabled).
- *                             Would only be used if \p needToUpdateBox is true.
- * \param[in] h_coordinates    Input coordinates (XYZ rvec array).
- */
-void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix box, const rvec *h_coordinates);
-
-/*! \libinternal \brief
  * Finishes the PME GPU step, waiting for the output forces and/or energy/virial to be copied to the host.
  *
  * \param[in] pmeGPU         The PME GPU structure.
diff --git a/src/gromacs/ewald/pme-gpu-types.h b/src/gromacs/ewald/pme-gpu-types.h
index 6a79e7c8fb..1127d27038 100644
--- a/src/gromacs/ewald/pme-gpu-types.h
+++ b/src/gromacs/ewald/pme-gpu-types.h
@@ -249,7 +249,7 @@ struct pme_gpu_settings_t
      * Only intended to be used by the test framework.
      */
     bool copyAllOutputs;
-    /*! \brief Various computation flags for the curent step, corresponding to the GMX_PME_ flags in pme.h. */
+    /*! \brief Various computation flags for the current step, corresponding to the GMX_PME_ flags in pme.h. */
     int  stepFlags;
 };
 
@@ -290,21 +290,28 @@ struct pme_shared_t
     /*! \brief Padded grid dimensions - pmegrid_nx, pmegrid_ny, pmegrid_nz
      * TODO: find out if these are really needed for the CPU FFT compatibility.
      */
-    int               pmegrid_n[DIM];
+    int                    pmegrid_n[DIM];
     /*! \brief PME interpolation order */
-    int               pme_order;
+    int                    pme_order;
     /*! \brief Ewald splitting coefficient for Coulomb */
-    real              ewaldcoeff_q;
+    real                   ewaldcoeff_q;
     /*! \brief Electrostatics parameter */
-    real              epsilon_r;
+    real                   epsilon_r;
     /*! \brief Gridline indices - nnx, nny, nnz */
-    std::vector<int>  nn;
+    std::vector<int>       nn;
     /*! \brief Fractional shifts - fshx, fshy, fshz */
-    std::vector<real> fsh;
+    std::vector<real>      fsh;
     /*! \brief Precomputed B-spline values */
-    std::vector<real> bsp_mod[DIM];
+    std::vector<real>      bsp_mod[DIM];
     /*! \brief The PME codepath being taken */
-    PmeRunMode        runMode;
+    PmeRunMode             runMode;
+    /*! \brief The box scaler based on inputrec - created in pme_init and managed by CPU structure */
+    class EwaldBoxZScaler *boxScaler;
+    /*! \brief The previous step box to know if we even need to update the current step box params.
+     * \todo Manage this on higher level.
+     * \todo Alternatively, when this structure is used by CPU PME code, make use of this field there as well.
+     */
+    matrix previousBox;
 };
 
 /*! \internal \brief
diff --git a/src/gromacs/ewald/pme-gpu.cpp b/src/gromacs/ewald/pme-gpu.cpp
index e24565adbf..f533f1be36 100644
--- a/src/gromacs/ewald/pme-gpu.cpp
+++ b/src/gromacs/ewald/pme-gpu.cpp
@@ -47,6 +47,8 @@
 #include <list>
 
 #include "gromacs/ewald/pme.h"
+#include "gromacs/fft/parallel_3dfft.h"
+#include "gromacs/math/invertmatrix.h"
 #include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
@@ -58,9 +60,10 @@
 #include "pme-internal.h"
 #include "pme-solve.h"
 
-bool pme_gpu_task_enabled(const gmx_pme_t *pme)
+PmeRunMode pme_run_mode(const gmx_pme_t *pme)
 {
-    return (pme != nullptr) && (pme->runMode != PmeRunMode::CPU);
+    GMX_ASSERT(pme != nullptr, "Expecting valid PME data pointer");
+    return pme->runMode;
 }
 
 bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error)
@@ -127,25 +130,207 @@ void pme_gpu_get_timings(const gmx_pme_t *pme, gmx_wallclock_gpu_pme_t *timings)
     }
 }
 
-void pme_gpu_get_results(const gmx_pme_t *pme,
-                         gmx_wallcycle_t  wcycle,
-                         matrix           vir_q,
-                         real            *energy_q)
+/*! \brief
+ * A convenience wrapper for launching either the GPU or CPU FFT.
+ *
+ * \param[in] pme            The PME structure.
+ * \param[in] gridIndex      The grid index - should currently always be 0.
+ * \param[in] dir            The FFT direction enum.
+ * \param[in] wcycle         The wallclock counter.
+ */
+void inline parallel_3dfft_execute_gpu_wrapper(gmx_pme_t              *pme,
+                                               const int               gridIndex,
+                                               enum gmx_fft_direction  dir,
+                                               gmx_wallcycle_t         wcycle)
+{
+    GMX_ASSERT(gridIndex == 0, "Only single grid supported");
+    if (pme_gpu_performs_FFT(pme->gpu))
+    {
+        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+        pme_gpu_3dfft(pme->gpu, dir, gridIndex);
+        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
+        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+    }
+    else
+    {
+        wallcycle_start(wcycle, ewcPME_FFT);
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+        for (int thread = 0; thread < pme->nthread; thread++)
+        {
+            gmx_parallel_3dfft_execute(pme->pfft_setup[gridIndex], dir, thread, wcycle);
+        }
+        wallcycle_stop(wcycle, ewcPME_FFT);
+    }
+}
+
+/* The PME step code split into a few separate functions. */
+
+void pme_gpu_prepare_step(gmx_pme_t            *pme,
+                          bool                  needToUpdateBox,
+                          const matrix          box,
+                          gmx_wallcycle_t       wcycle,
+                          int                   flags)
+{
+    GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
+    GMX_ASSERT(pme->nnodes > 0, "");
+    GMX_ASSERT(pme->nnodes == 1 || pme->ndecompdim > 0, "");
+
+    pme_gpu_t *pmeGpu = pme->gpu;
+    pmeGpu->settings.stepFlags = flags;
+    // TODO these flags are only here to honor the CPU PME code, and probably should be removed
+
+    bool shouldUpdateBox = false;
+    for (int i = 0; i < DIM; ++i)
+    {
+        for (int j = 0; j <= i; ++j)
+        {
+            shouldUpdateBox                  |= (pmeGpu->common->previousBox[i][j] != box[i][j]);
+            pmeGpu->common->previousBox[i][j] = box[i][j];
+        }
+    }
+
+    if (needToUpdateBox || shouldUpdateBox) // || is to make the first step always update
+    {
+        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+        pme_gpu_update_input_box(pmeGpu, box);
+        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
+        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+
+        if (!pme_gpu_performs_solve(pmeGpu))
+        {
+            // TODO this has already been computed in pme->gpu
+            //memcpy(pme->recipbox, pme->gpu->common->
+            gmx::invertBoxMatrix(box, pme->recipbox);
+            // FIXME verify that the box is scaled correctly on GPU codepath
+            pme->boxVolume = box[XX][XX] * box[YY][YY] * box[ZZ][ZZ];
+        }
+    }
+}
+
+
+void pme_gpu_launch_spread(gmx_pme_t            *pme,
+                           const rvec           *x,
+                           gmx_wallcycle_t       wcycle)
+{
+    GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
+
+    pme_gpu_t *pmeGpu = pme->gpu;
+
+    // The only spot of PME GPU where LAUNCH_GPU (sub)counter increases call-count
+    wallcycle_start(wcycle, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_PME);
+    pme_gpu_copy_input_coordinates(pmeGpu, x);
+    wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
+    wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+
+    const unsigned int gridIndex  = 0;
+    real              *fftgrid    = pme->fftgrid[gridIndex];
+    if (pmeGpu->settings.stepFlags & GMX_PME_SPREAD)
+    {
+        /* Spread the coefficients on a grid */
+        const bool computeSplines = true;
+        const bool spreadCharges  = true;
+        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+        pme_gpu_spread(pmeGpu, gridIndex, fftgrid, computeSplines, spreadCharges);
+        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
+        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+    }
+}
+
+void pme_gpu_launch_complex_transforms(gmx_pme_t      *pme,
+                                       gmx_wallcycle_t wcycle)
+{
+    pme_gpu_t         *pmeGpu                 = pme->gpu;
+    const bool         computeEnergyAndVirial = pmeGpu->settings.stepFlags & GMX_PME_CALC_ENER_VIR;
+    const bool         performBackFFT         = pmeGpu->settings.stepFlags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
+    const unsigned int gridIndex              = 0;
+    t_complex         *cfftgrid               = pme->cfftgrid[gridIndex];
+
+    if (pmeGpu->settings.stepFlags & GMX_PME_SPREAD)
+    {
+        if (!pme_gpu_performs_FFT(pmeGpu))
+        {
+            wallcycle_start(wcycle, ewcWAIT_GPU_PME_SPREAD);
+            pme_gpu_sync_spread_grid(pme->gpu);
+            wallcycle_stop(wcycle, ewcWAIT_GPU_PME_SPREAD);
+        }
+    }
+
+    try
+    {
+        if (pmeGpu->settings.stepFlags & GMX_PME_SOLVE)
+        {
+            /* do R2C 3D-FFT */
+            parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_REAL_TO_COMPLEX, wcycle);
+
+            /* solve in k-space for our local cells */
+            if (pme_gpu_performs_solve(pmeGpu))
+            {
+                const auto gridOrdering = pme_gpu_uses_dd(pmeGpu) ? GridOrdering::YZX : GridOrdering::XYZ;
+                wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+                wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); //FIXME nocount
+                pme_gpu_solve(pmeGpu, cfftgrid, gridOrdering, computeEnergyAndVirial);
+                wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
+                wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+            }
+            else
+            {
+                wallcycle_start(wcycle, ewcPME_SOLVE);
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+                for (int thread = 0; thread < pme->nthread; thread++)
+                {
+                    solve_pme_yzx(pme, cfftgrid, pme->boxVolume,
+                                  computeEnergyAndVirial, pme->nthread, thread);
+                }
+                wallcycle_stop(wcycle, ewcPME_SOLVE);
+            }
+        }
+
+        if (performBackFFT)
+        {
+            parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle);
+        }
+    } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+}
+
+void pme_gpu_launch_gather(const gmx_pme_t                 *pme,
+                           gmx_wallcycle_t gmx_unused       wcycle,
+                           rvec                            *forces,
+                           bool                             overwriteForces)
+{
+    GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
+
+    if (!pme_gpu_performs_gather(pme->gpu))
+    {
+        return;
+    }
+
+    wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+    wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+    const unsigned int gridIndex  = 0;
+    real              *fftgrid    = pme->fftgrid[gridIndex];
+    pme_gpu_gather(pme->gpu, reinterpret_cast<float *>(forces), overwriteForces, reinterpret_cast<float *>(fftgrid));
+    wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
+    wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+}
+
+void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
+                          gmx_wallcycle_t  wcycle,
+                          matrix           vir_q,
+                          real            *energy_q)
 {
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 
     const bool haveComputedEnergyAndVirial = pme->gpu->settings.stepFlags & GMX_PME_CALC_ENER_VIR;
     const bool haveComputedForces          = pme->gpu->settings.stepFlags & GMX_PME_CALC_F;
 
-    // The wallcycle hierarchy is messy - we pause ewcFORCE just to exclude the PME GPU sync time
-    wallcycle_stop(wcycle, ewcFORCE);
-
     wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
     pme_gpu_finish_step(pme->gpu, haveComputedForces, haveComputedEnergyAndVirial);
     wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
 
-    wallcycle_start_nocount(wcycle, ewcFORCE);
-
     if (haveComputedEnergyAndVirial)
     {
         if (pme->doCoulomb)
diff --git a/src/gromacs/ewald/pme-internal.h b/src/gromacs/ewald/pme-internal.h
index a9d621ab85..4a3c126852 100644
--- a/src/gromacs/ewald/pme-internal.h
+++ b/src/gromacs/ewald/pme-internal.h
@@ -319,6 +319,7 @@ struct gmx_pme_t {
 
     pme_atomcomm_t            atc[2]; /* Indexed on decomposition index */
     matrix                    recipbox;
+    real                      boxVolume;
     splinevec                 bsp_mod;
     /* Buffers to store data for local atoms for L-B combination rule
      * calculations in LJ-PME. lb_buf1 stores either the coefficients
diff --git a/src/gromacs/ewald/pme-only.cpp b/src/gromacs/ewald/pme-only.cpp
index f9629100d9..6e46e6c984 100644
--- a/src/gromacs/ewald/pme-only.cpp
+++ b/src/gromacs/ewald/pme-only.cpp
@@ -527,7 +527,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
                 t_commrec *cr,    t_nrnb *mynrnb,
                 gmx_wallcycle_t wcycle,
                 gmx_walltime_accounting_t walltime_accounting,
-                t_inputrec *ir)
+                t_inputrec *ir, PmeRunMode runMode)
 {
     int                npmedata;
     struct gmx_pme_t **pmedata;
@@ -613,19 +613,37 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
         dvdlambda_lj = 0;
         clear_mat(vir_q);
         clear_mat(vir_lj);
+        energy_q  = 0;
+        energy_lj = 0;
 
         // TODO Make a struct of array refs onto these per-atom fields
         // of pme_pp (maybe box, energy and virial, too; and likewise
         // from mdatoms for the other call to gmx_pme_do), so we have
         // fewer lines of code and less parameter passing.
-        gmx_pme_do(pme, 0, natoms, pme_pp->x, pme_pp->f,
-                   pme_pp->chargeA, pme_pp->chargeB,
-                   pme_pp->sqrt_c6A, pme_pp->sqrt_c6B,
-                   pme_pp->sigmaA, pme_pp->sigmaB, box,
-                   cr, maxshift_x, maxshift_y, mynrnb, wcycle,
-                   vir_q, vir_lj,
-                   &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
-                   GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
+        const int pmeFlags = GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0);
+        if (runMode != PmeRunMode::CPU)
+        {
+            const bool boxChanged = true;
+            //TODO this should be set properly by gmx_pme_recv_coeffs_coords,
+            // or maybe use inputrecDynamicBox(ir), at the very least - change this when this codepath is tested!
+            constexpr bool pmeGpuAccumulateInputForces = false;
+            pme_gpu_prepare_step(pme, boxChanged, box, wcycle, pmeFlags);
+            pme_gpu_launch_spread(pme, pme_pp->x, wcycle);
+            pme_gpu_launch_complex_transforms(pme, wcycle);
+            pme_gpu_launch_gather(pme, wcycle, pme_pp->f, !pmeGpuAccumulateInputForces);
+            pme_gpu_wait_for_gpu(pme, wcycle, vir_q, &energy_q);
+        }
+        else
+        {
+            gmx_pme_do(pme, 0, natoms, pme_pp->x, pme_pp->f,
+                       pme_pp->chargeA, pme_pp->chargeB,
+                       pme_pp->sqrt_c6A, pme_pp->sqrt_c6B,
+                       pme_pp->sigmaA, pme_pp->sigmaB, box,
+                       cr, maxshift_x, maxshift_y, mynrnb, wcycle,
+                       vir_q, vir_lj,
+                       &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
+                       pmeFlags);
+        }
 
         cycles = wallcycle_stop(wcycle, ewcPMEMESH);
 
diff --git a/src/gromacs/ewald/pme-timings.cu b/src/gromacs/ewald/pme-timings.cu
index 9fbe39dde9..09d6d18337 100644
--- a/src/gromacs/ewald/pme-timings.cu
+++ b/src/gromacs/ewald/pme-timings.cu
@@ -93,9 +93,34 @@ void pme_gpu_update_timings(const pme_gpu_t *pmeGPU)
 {
     if (pme_gpu_timings_enabled(pmeGPU))
     {
-        for (size_t i = 0; i < pmeGPU->archSpecific->timingEvents.size(); i++)
+        pme_gpu_synchronize(pmeGPU);
+
+        for (const size_t &activeTimer : pmeGPU->archSpecific->activeTimers)
+        {
+            pmeGPU->archSpecific->timingEvents[activeTimer].getLastRangeTime();
+        }
+    }
+}
+
+void pme_gpu_reinit_timings(const pme_gpu_t *pmeGPU)
+{
+    if (pme_gpu_timings_enabled(pmeGPU))
+    {
+        pmeGPU->archSpecific->activeTimers.clear();
+        pmeGPU->archSpecific->activeTimers.insert(gtPME_SPLINEANDSPREAD);
+        // TODO: no separate gtPME_SPLINE and gtPME_SPREAD as they are not used currently
+        if (pme_gpu_performs_FFT(pmeGPU))
+        {
+            pmeGPU->archSpecific->activeTimers.insert(gtPME_FFT_C2R);
+            pmeGPU->archSpecific->activeTimers.insert(gtPME_FFT_R2C);
+        }
+        if (pme_gpu_performs_solve(pmeGPU))
+        {
+            pmeGPU->archSpecific->activeTimers.insert(gtPME_SOLVE);
+        }
+        if (pme_gpu_performs_gather(pmeGPU))
         {
-            pmeGPU->archSpecific->timingEvents[i].getLastRangeTime();
+            pmeGPU->archSpecific->activeTimers.insert(gtPME_GATHER);
         }
     }
 }
diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp
index cdd0a41f3d..8c02dcbff5 100644
--- a/src/gromacs/ewald/pme.cpp
+++ b/src/gromacs/ewald/pme.cpp
@@ -1751,7 +1751,10 @@ void gmx_pme_destroy(gmx_pme_t *pme)
     sfree(pme->bufv);
     sfree(pme->bufr);
 
-    pme_free_all_work(&pme->solve_work, pme->nthread);
+    if (pme->solve_work)
+    {
+        pme_free_all_work(&pme->solve_work, pme->nthread);
+    }
 
     sfree(pme->sum_qgrid_tmp);
     sfree(pme->sum_qgrid_dd_tmp);
diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu
index 8cf47ab878..14bef5ce5d 100644
--- a/src/gromacs/ewald/pme.cu
+++ b/src/gromacs/ewald/pme.cu
@@ -170,8 +170,7 @@ void pme_gpu_copy_output_forces(const pme_gpu_t *pmeGPU, float *h_forces)
 
 void pme_gpu_sync_output_forces(const pme_gpu_t *pmeGPU)
 {
-    cudaStream_t s    = pmeGPU->archSpecific->pmeStream;
-    cudaError_t  stat = cudaStreamWaitEvent(s, pmeGPU->archSpecific->syncForcesD2H, 0);
+    cudaError_t  stat = cudaEventSynchronize(pmeGPU->archSpecific->syncForcesD2H);
     CU_RET_ERR(stat, "Error while waiting for the PME GPU forces");
 }
 
@@ -384,7 +383,7 @@ void pme_gpu_free_fract_shifts(const pme_gpu_t *pmeGPU)
 
 void pme_gpu_sync_output_energy_virial(const pme_gpu_t *pmeGPU)
 {
-    cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncEnerVirD2H, 0);
+    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncEnerVirD2H);
     CU_RET_ERR(stat, "Error while waiting for PME solve output");
 
     for (int j = 0; j < c_virialAndEnergyCount; j++)
@@ -447,19 +446,19 @@ void pme_gpu_copy_input_gather_atom_data(const pme_gpu_t *pmeGpu)
 
 void pme_gpu_sync_spread_grid(const pme_gpu_t *pmeGPU)
 {
-    cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSpreadGridD2H, 0);
+    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSpreadGridD2H);
     CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host");
 }
 
 void pme_gpu_sync_spline_atom_data(const pme_gpu_t *pmeGPU)
 {
-    cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSplineAtomDataD2H, 0);
+    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSplineAtomDataD2H);
     CU_RET_ERR(stat, "Error while waiting for the PME GPU atom data to be copied to the host");
 }
 
 void pme_gpu_sync_solve_grid(const pme_gpu_t *pmeGPU)
 {
-    cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSolveGridD2H, 0);
+    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSolveGridD2H);
     CU_RET_ERR(stat, "Error while waiting for the PME GPU solve grid to be copied to the host");
     //should check for pme_gpu_performs_solve(pmeGPU)
 }
@@ -503,15 +502,16 @@ void pme_gpu_destroy_specific(const pme_gpu_t *pmeGPU)
 void pme_gpu_init_sync_events(const pme_gpu_t *pmeGPU)
 {
     cudaError_t stat;
-    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncEnerVirD2H, cudaEventDisableTiming);
+    const auto  eventFlags = cudaEventDisableTiming;
+    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncEnerVirD2H, eventFlags);
     CU_RET_ERR(stat, "cudaEventCreate on syncEnerVirD2H failed");
-    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncForcesD2H, cudaEventDisableTiming);
+    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncForcesD2H, eventFlags);
     CU_RET_ERR(stat, "cudaEventCreate on syncForcesD2H failed");
-    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSpreadGridD2H, cudaEventDisableTiming);
+    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSpreadGridD2H, eventFlags);
     CU_RET_ERR(stat, "cudaEventCreate on syncSpreadGridD2H failed");
-    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSplineAtomDataD2H, cudaEventDisableTiming);
+    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSplineAtomDataD2H, eventFlags);
     CU_RET_ERR(stat, "cudaEventCreate on syncSplineAtomDataD2H failed");
-    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSolveGridD2H, cudaEventDisableTiming);
+    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSolveGridD2H, eventFlags);
     CU_RET_ERR(stat, "cudaEventCreate on syncSolveGridD2H failed");
 }
 
diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh
index 949da626d8..71fb758a1c 100644
--- a/src/gromacs/ewald/pme.cuh
+++ b/src/gromacs/ewald/pme.cuh
@@ -48,6 +48,7 @@
 #include <cassert>
 
 #include <array>
+#include <set>
 
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh" // for warp_size
 
@@ -172,6 +173,8 @@ struct pme_gpu_cuda_t
 
     std::array<GpuRegionTimer, gtPME_EVENT_COUNT>    timingEvents;
 
+    std::set<size_t>                                 activeTimers; // indices into timingEvents
+
     /* GPU arrays element counts (not the arrays sizes in bytes!).
      * They might be larger than the actual meaningful data sizes.
      * These are paired: the actual element count + the maximum element count that can fit in the current allocated memory.
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index b75926bbed..781dc72a0c 100644
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -152,7 +152,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
                 struct t_commrec *cr,     t_nrnb *mynrnb,
                 gmx_wallcycle_t wcycle,
                 gmx_walltime_accounting_t walltime_accounting,
-                t_inputrec *ir);
+                t_inputrec *ir, PmeRunMode runMode);
 
 /*! \brief Calculate the PME grid energy V for n charges.
  *
@@ -218,15 +218,26 @@ void gmx_pme_reinit_atoms(const gmx_pme_t *pme, const int nAtoms, const real *ch
 bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error);
 
 /*! \brief
+ * Returns the active PME codepath (CPU, GPU, mixed).
+ * \todo This is a rather static data that should be managed by the higher level task scheduler.
+ *
+ * \param[in]  pme            The PME data structure.
+ * \returns active PME codepath.
+ */
+PmeRunMode pme_run_mode(const gmx_pme_t *pme);
+
+/*! \brief
  * Tells if PME is enabled to run on GPU (not necessarily active at the moment).
- * For now, this decision is stored in the PME structure itself.
- * FIXME: this is an information that should be managed by the task scheduler.
- * As soon as such functionality appears, this function should be removed from this module.
+ * \todo This is a rather static data that should be managed by the hardware assignment manager.
+ * For now, it is synonymous with the active PME codepath (in the absence of dynamic switching).
  *
  * \param[in]  pme            The PME data structure.
  * \returns true if PME can run on GPU, false otherwise.
  */
-bool pme_gpu_task_enabled(const gmx_pme_t *pme);
+inline bool pme_gpu_task_enabled(const gmx_pme_t *pme)
+{
+    return (pme != nullptr) && (pme_run_mode(pme) != PmeRunMode::CPU);
+}
 
 /*! \brief
  * Resets the PME GPU timings. To be called at the reset step.
@@ -247,17 +258,68 @@ void pme_gpu_get_timings(const gmx_pme_t         *pme,
 /* The main PME GPU functions */
 
 /*! \brief
- * Gets the output forces and virial/energy if corresponding flags are (were?) passed in.
+ * Prepares PME on GPU step (updating the box if needed)
+ * \param[in] pme               The PME data structure.
+ * \param[in] needToUpdateBox   Tells if the stored unit cell parameters should be updated from \p box.
+ * \param[in] box               The unit cell box.
+ * \param[in] wcycle            The wallclock counter.
+ * \param[in] flags             The combination of flags to affect the PME computation on this step.
+ *                              The flags are the GMX_PME_ flags from pme.h.
+ */
+void pme_gpu_prepare_step(gmx_pme_t      *pme,
+                          bool            needToUpdateBox,
+                          const matrix    box,
+                          gmx_wallcycle_t wcycle,
+                          int             flags);
+
+/*! \brief
+ * Launches first stage of PME on GPU - H2D input transfers, spreading kernel, and D2H grid transfer if needed.
+ *
+ * \param[in] pme               The PME data structure.
+ * \param[in] x                 The array of local atoms' coordinates.
+ * \param[in] wcycle            The wallclock counter.
+ */
+void pme_gpu_launch_spread(gmx_pme_t      *pme,
+                           const rvec     *x,
+                           gmx_wallcycle_t wcycle);
+
+/*! \brief
+ * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
+ *
+ * \param[in] pme               The PME data structure.
+ * \param[in] wcycle            The wallclock counter.
+ */
+void pme_gpu_launch_complex_transforms(gmx_pme_t       *pme,
+                                       gmx_wallcycle_t  wcycle);
+
+/*! \brief
+ * Launches last stage of PME on GPU - force gathering.
+ *
+ * \param[in]  pme               The PME data structure.
+ * \param[in]  wcycle            The wallclock counter.
+ * \param[in,out] forces         The array of local atoms' resulting forces.
+ * \param[in]  overwriteForces   The boolean which tells whether the gathering kernel stores
+ *                               the output reciprocal forces into the host array (true),
+ *                               or copies its contents to the GPU first, and accumulates
+ *                               (false). The reduction is non-atomic.
+ */
+void pme_gpu_launch_gather(const gmx_pme_t      *pme,
+                           gmx_wallcycle_t       wcycle,
+                           rvec                 *forces,
+                           bool                  overwriteForces);
+
+/*! \brief
+ * Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy
+ * (if they were to be computed).
  *
  * \param[in]  pme            The PME data structure.
  * \param[in]  wcycle         The wallclock counter.
  * \param[out] vir_q          The output virial matrix.
  * \param[out] energy_q       The output energy.
  */
-void pme_gpu_get_results(const gmx_pme_t *pme,
-                         gmx_wallcycle_t  wcycle,
-                         matrix           vir_q,
-                         real            *energy_q);
-
+void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
+                          gmx_wallcycle_t  wcycle,
+                          matrix           vir_q,
+                          real            *energy_q);
 
 #endif
diff --git a/src/gromacs/fft/fft.h b/src/gromacs/fft/fft.h
index fe75131365..be54a756aa 100644
--- a/src/gromacs/fft/fft.h
+++ b/src/gromacs/fft/fft.h
@@ -2,7 +2,7 @@
  * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 1991-2003 David van der Spoel, Erik Lindahl, University of Groningen.
- * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -93,13 +93,13 @@ typedef struct gmx_fft *
  *  GMX_FFT_FORWARD or GMX_FFT_BACKWARD, and for real-complex transforms you
  *  can only use GMX_FFT_REAL_TO_COMPLEX or GMX_FFT_COMPLEX_TO_REAL.
  */
-typedef enum gmx_fft_direction
+enum gmx_fft_direction
 {
     GMX_FFT_FORWARD,         /**< Forward complex-to-complex transform  */
     GMX_FFT_BACKWARD,        /**< Backward complex-to-complex transform */
     GMX_FFT_REAL_TO_COMPLEX, /**< Real-to-complex valued FFT            */
     GMX_FFT_COMPLEX_TO_REAL  /**< Complex-to-real valued FFT            */
-} gmx_fft_direction;
+};
 
 /*! \brief Specifier for FFT flags.
  *
diff --git a/src/gromacs/mdlib/force.cpp b/src/gromacs/mdlib/force.cpp
index 474293ab5f..4af67d3c72 100644
--- a/src/gromacs/mdlib/force.cpp
+++ b/src/gromacs/mdlib/force.cpp
@@ -153,7 +153,7 @@ void do_force_lowlevel(t_forcerec *fr,      t_inputrec *ir,
                        t_blocka   *excl,
                        rvec       mu_tot[],
                        int        flags,
-                       float      *cycles_pme)
+                       float     *cycles_pme)
 {
     int         i, j;
     int         donb_flags;
@@ -458,7 +458,7 @@ void do_force_lowlevel(t_forcerec *fr,      t_inputrec *ir,
                                             ewaldOutput.vir_q);
             }
 
-            if ((EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) && (cr->duty & DUTY_PME))
+            if ((EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype)) && (cr->duty & DUTY_PME) && (pme_run_mode(fr->pmedata) == PmeRunMode::CPU))
             {
                 /* Do reciprocal PME for Coulomb and/or LJ. */
                 assert(fr->n_tpi >= 0);
@@ -512,6 +512,7 @@ void do_force_lowlevel(t_forcerec *fr,      t_inputrec *ir,
                     {
                         gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status);
                     }
+
                     /* We should try to do as little computation after
                      * this as possible, because parallel PME synchronizes
                      * the nodes, so we want all load imbalance of the
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp
index 14f47a4037..b462cbde01 100644
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -1514,6 +1514,11 @@ void forcerec_set_ranges(t_forcerec *fr,
     {
         fr->forceBufferForDirectVirialContributions->resize(natoms_f_novirsum);
     }
+
+    if (fr->ic->cutoff_scheme == ecutsVERLET)
+    {
+        fr->forceBufferIntermediate->resize(ncg_home);
+    }
 }
 
 static real cutoff_inf(real cutoff)
@@ -2789,6 +2794,8 @@ void init_forcerec(FILE                *fp,
         fr->forceBufferForDirectVirialContributions = new std::vector<gmx::RVec>;
     }
 
+    fr->forceBufferIntermediate = new std::vector<gmx::RVec>; //TODO add proper conditionals
+
     if (fr->cutoff_scheme == ecutsGROUP &&
         ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr))
     {
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index ce2a476b36..29829debb1 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -223,6 +223,18 @@ static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
     }
 }
 
+static void pme_gpu_reduce_outputs(ForceWithVirial           *forceWithVirial,
+                                   ArrayRef<const gmx::RVec>  pmeForces,
+                                   gmx_enerdata_t            *enerd,
+                                   const tensor               vir_Q,
+                                   real                       Vlr_q)
+{
+    GMX_ASSERT(forceWithVirial, "Invalid force pointer");
+    forceWithVirial->addVirialContribution(vir_Q);
+    enerd->term[F_COUL_RECIP] += Vlr_q;
+    sum_forces(as_rvec_array(forceWithVirial->force_.data()), pmeForces);
+}
+
 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
                         tensor vir_part, t_graph *graph, matrix box,
                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
@@ -823,6 +835,14 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     bUseGPU       = fr->nbv->bUseGPU;
     bUseOrEmulGPU = bUseGPU || (fr->nbv->emulateGpu == EmulateGpuNonbonded::Yes);
 
+    const auto pmeRunMode = fr->pmedata ? pme_run_mode(fr->pmedata) : PmeRunMode::CPU;
+    // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
+    const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && (cr->duty & DUTY_PME) &&
+        ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Hybrid));
+    // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
+    const ArrayRef<RVec> pmeGpuForces                = *fr->forceBufferIntermediate;
+    constexpr bool       pmeGpuAccumulateInputForces = false;
+
     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
      * decomposition (and not during the previous step as usual).
@@ -909,6 +929,37 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     }
 #endif /* GMX_MPI */
 
+    /* Launching PME on GPU */
+    if (useGpuPme)
+    {
+        assert(fr->n_tpi >= 0);
+        if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED)) // TODO consolidate these checks
+        {
+            int pme_flags = GMX_PME_SPREAD | GMX_PME_SOLVE;
+            if (flags & GMX_FORCE_FORCES)
+            {
+                pme_flags |= GMX_PME_CALC_F;
+            }
+            if (flags & GMX_FORCE_VIRIAL)
+            {
+                pme_flags |= GMX_PME_CALC_ENER_VIR;
+            }
+            if (fr->n_tpi > 0)
+            {
+                /* We don't calculate f, but we do want the potential */
+                pme_flags |= GMX_PME_CALC_POT;
+            }
+
+            pme_gpu_prepare_step(fr->pmedata, flags & GMX_FORCE_DYNAMICBOX, box, wcycle, pme_flags);
+            pme_gpu_launch_spread(fr->pmedata, x, wcycle);
+            if (pmeRunMode == PmeRunMode::GPU)
+            {
+                pme_gpu_launch_complex_transforms(fr->pmedata, wcycle);
+                pme_gpu_launch_gather(fr->pmedata, wcycle, as_rvec_array(pmeGpuForces.data()), !pmeGpuAccumulateInputForces);
+            }
+        }
+    }
+
     /* do gridding for pair search */
     if (bNS)
     {
@@ -1021,6 +1072,14 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
 
+    if (pmeRunMode == PmeRunMode::Hybrid)
+    {
+        // PME GPU - intermediate CPU work in mixed mode
+        // TODO - try moving this below till after do_force_lowlevel() / special forces?
+        pme_gpu_launch_complex_transforms(fr->pmedata, wcycle);
+        pme_gpu_launch_gather(fr->pmedata, wcycle, as_rvec_array(pmeGpuForces.data()), !pmeGpuAccumulateInputForces);
+    }
+
     /* Communicate coordinates and sum dipole if necessary +
        do non-local pair search */
     if (DOMAINDECOMP(cr))
@@ -1333,6 +1392,16 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         }
     }
 
+    if (useGpuPme)
+    {
+        matrix vir_Q;
+        real   Vlr_q;
+        pme_gpu_wait_for_gpu(fr->pmedata, wcycle, vir_Q, &Vlr_q);
+
+        pme_gpu_reduce_outputs(&forceWithVirial, pmeGpuForces,
+                               enerd, vir_Q, Vlr_q);
+    }
+
     if (bUseOrEmulGPU)
     {
         /* wait for local forces (or calculate in emulation mode) */
diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h
index ce3b8b341f..cea244b21c 100644
--- a/src/gromacs/mdtypes/forcerec.h
+++ b/src/gromacs/mdtypes/forcerec.h
@@ -292,9 +292,14 @@ struct t_forcerec {
     gmx_bool          haveDirectVirialContributions;
 #ifdef __cplusplus
     /* TODO: Replace the pointer by an object once we got rid of C */
-    std::vector<gmx::RVec>  *forceBufferForDirectVirialContributions;
+    std::vector<gmx::RVec> *forceBufferForDirectVirialContributions;
+    /* This buffer is currently only used for storing the PME GPU output until reduction.
+     * TODO: Pagelock/pin it
+     * TODO: Replace the pointer by an object once we got rid of C */
+    std::vector<gmx::RVec>  *forceBufferIntermediate;
 #else
     void                    *forceBufferForDirectVirialContributions_dummy;
+    void                    *forceBufferIntermediate_dummy;
 #endif
 
     /* Data for PPPM/PME/Ewald */
diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp
index 2ad431a09d..5968d40430 100644
--- a/src/gromacs/timing/wallcycle.cpp
+++ b/src/gromacs/timing/wallcycle.cpp
@@ -123,6 +123,7 @@ static const char *wcsn[ewcsNR] =
     "Nonbonded pruning",
     "Nonbonded F",
     "Launch NB GPU tasks",
+    "Launch PME GPU tasks",
     "Ewald F correction",
     "NB X buffer ops.",
     "NB F buffer ops.",
diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h
index 01cec41b8f..620503538d 100644
--- a/src/gromacs/timing/wallcycle.h
+++ b/src/gromacs/timing/wallcycle.h
@@ -70,6 +70,7 @@ enum {
     ewcsNONBONDED_PRUNING,
     ewcsNONBONDED,
     ewcsLAUNCH_GPU_NONBONDED,
+    ewcsLAUNCH_GPU_PME,
     ewcsEWALD_CORRECTION,
     ewcsNB_X_BUF_OPS,
     ewcsNB_F_BUF_OPS,
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp
index 70a3eb7d53..79231d1a89 100644
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -482,6 +482,9 @@ int Mdrunner::mdrunner()
     bool tryUsePhysicalGpu   = (strncmp(nbpu_opt, "auto", 4) == 0) && hw_opt.gpuIdTaskAssignment.empty() && (emulateGpuNonbonded == EmulateGpuNonbonded::No);
     GMX_RELEASE_ASSERT(!(forceUsePhysicalGpu && tryUsePhysicalGpu), "Must either force use of "
                        "GPUs for short-ranged interactions, or try to use them, not both.");
+    const PmeRunMode pmeRunMode = PmeRunMode::CPU;
+    //TODO this is a placeholder as PME on GPU is not permitted yet
+    //TODO should there exist a PmeRunMode::None value for consistency?
 
     // Here we assume that SIMMASTER(cr) does not change even after the
     // threads are started.
@@ -1077,13 +1080,12 @@ int Mdrunner::mdrunner()
             try
             {
                 gmx_device_info_t *pmeGpuInfo = nullptr;
-                auto               runMode    = PmeRunMode::CPU;
                 status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
                                       mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed,
                                       mdrunOptions.reproducible,
                                       ewaldcoeff_q, ewaldcoeff_lj,
                                       nthreads_pme,
-                                      runMode, nullptr, pmeGpuInfo, mdlog);
+                                      pmeRunMode, nullptr, pmeGpuInfo, mdlog);
             }
             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
             if (status != 0)
@@ -1174,7 +1176,7 @@ int Mdrunner::mdrunner()
         GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
         /* do PME only */
         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
-        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, inputrec);
+        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode);
     }
 
     wallcycle_stop(wcycle, ewcRUN);
-- 
2.11.4.GIT