From bd48f518c6bae0f561493553d2d14e3f52b7602f Mon Sep 17 00:00:00 2001 From: Aleksei Iupinov Date: Fri, 27 Oct 2017 13:01:19 +0200 Subject: [PATCH] Bring PME GPU/CUDA internal structure names to CamelCase This only does mechanical renaming (e.g. pme_gpu_settings_t to PmeGpuSettings). Any meaningful renames will be done separately. Change-Id: I7ea2af94fd0212ff6edcf433ff21842c5bbb67b0 --- src/gromacs/ewald/pme-3dfft.cu | 6 +- src/gromacs/ewald/pme-3dfft.cuh | 4 +- src/gromacs/ewald/pme-gather.cu | 4 +- src/gromacs/ewald/pme-gpu-internal.cpp | 30 ++++---- src/gromacs/ewald/pme-gpu-internal.h | 128 ++++++++++++++++----------------- src/gromacs/ewald/pme-gpu-types.h | 58 +++++++-------- src/gromacs/ewald/pme-gpu.cpp | 6 +- src/gromacs/ewald/pme-internal.h | 4 +- src/gromacs/ewald/pme-solve.cu | 4 +- src/gromacs/ewald/pme-spread.cu | 10 +-- src/gromacs/ewald/pme-timings.cu | 14 ++-- src/gromacs/ewald/pme-timings.cuh | 6 +- src/gromacs/ewald/pme.cpp | 2 +- src/gromacs/ewald/pme.cu | 86 +++++++++++----------- src/gromacs/ewald/pme.cuh | 6 +- src/gromacs/ewald/pme.h | 4 +- 16 files changed, 186 insertions(+), 186 deletions(-) diff --git a/src/gromacs/ewald/pme-3dfft.cu b/src/gromacs/ewald/pme-3dfft.cu index d1a8799668..ad722763bb 100644 --- a/src/gromacs/ewald/pme-3dfft.cu +++ b/src/gromacs/ewald/pme-3dfft.cu @@ -57,9 +57,9 @@ static void handleCufftError(cufftResult_t status, const char *msg) } } -GpuParallel3dFft::GpuParallel3dFft(const pme_gpu_t *pmeGPU) +GpuParallel3dFft::GpuParallel3dFft(const PmeGpu *pmeGPU) { - const pme_gpu_cuda_kernel_params_t *kernelParamsPtr = pmeGPU->kernelParams.get(); + const PmeGpuCudaKernelParams *kernelParamsPtr = pmeGPU->kernelParams.get(); ivec realGridSize, realGridSizePadded, complexGridSizePadded; for (int i = 0; i < DIM; i++) { @@ -137,7 +137,7 @@ void GpuParallel3dFft::perform3dFft(gmx_fft_direction dir) } } -void pme_gpu_3dfft(const pme_gpu_t *pmeGPU, gmx_fft_direction dir, int grid_index) +void pme_gpu_3dfft(const PmeGpu *pmeGPU, gmx_fft_direction dir, int grid_index) { int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R; pme_gpu_start_timing(pmeGPU, timerId); diff --git a/src/gromacs/ewald/pme-3dfft.cuh b/src/gromacs/ewald/pme-3dfft.cuh index 775c3241d3..ef6cba466b 100644 --- a/src/gromacs/ewald/pme-3dfft.cuh +++ b/src/gromacs/ewald/pme-3dfft.cuh @@ -46,7 +46,7 @@ #include "gromacs/fft/fft.h" // for the enum gmx_fft_direction -struct pme_gpu_t; +struct PmeGpu; /*! \brief \internal A 3D FFT class for performing R2C/C2R transforms * \todo Make this class actually parallel over multiple GPUs @@ -64,7 +64,7 @@ class GpuParallel3dFft * * \param[in] pmeGPU The PME GPU structure. */ - GpuParallel3dFft(const pme_gpu_t *pmeGPU); + GpuParallel3dFft(const PmeGpu *pmeGPU); /*! \brief Destroys CUDA FFT plans. */ ~GpuParallel3dFft(); /*! 
\brief Performs the FFT transform in given direction */ diff --git a/src/gromacs/ewald/pme-gather.cu b/src/gromacs/ewald/pme-gather.cu index 8a3179aad3..40db9452c7 100644 --- a/src/gromacs/ewald/pme-gather.cu +++ b/src/gromacs/ewald/pme-gather.cu @@ -235,7 +235,7 @@ template < const bool wrapY > __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) -__global__ void pme_gather_kernel(const pme_gpu_cuda_kernel_params_t kernelParams) +__global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams) { /* Global memory pointers */ const float * __restrict__ gm_coefficients = kernelParams.atoms.d_coefficients; @@ -410,7 +410,7 @@ __global__ void pme_gather_kernel(const pme_gpu_cuda_kernel_params_t kernelPa } } -void pme_gpu_gather(const pme_gpu_t *pmeGpu, +void pme_gpu_gather(const PmeGpu *pmeGpu, float *h_forces, PmeForceOutputHandling forceTreatment, const float *h_grid diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp index 0443017ca1..ab7cec033c 100644 --- a/src/gromacs/ewald/pme-gpu-internal.cpp +++ b/src/gromacs/ewald/pme-gpu-internal.cpp @@ -70,14 +70,14 @@ * \param[in] pmeGPU The PME GPU structure. * \returns The pointer to the kernel parameters. */ -static pme_gpu_kernel_params_base_t *pme_gpu_get_kernel_params_base_ptr(const pme_gpu_t *pmeGPU) +static PmeGpuKernelParamsBase *pme_gpu_get_kernel_params_base_ptr(const PmeGpu *pmeGPU) { // reinterpret_cast is needed because the derived CUDA structure is not known in this file - auto *kernelParamsPtr = reinterpret_cast<pme_gpu_kernel_params_base_t *>(pmeGPU->kernelParams.get()); + auto *kernelParamsPtr = reinterpret_cast<PmeGpuKernelParamsBase *>(pmeGPU->kernelParams.get()); return kernelParamsPtr; } -void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix virial) +void pme_gpu_get_energy_virial(const PmeGpu *pmeGPU, real *energy, matrix virial) { GMX_ASSERT(energy, "Invalid energy output pointer in PME GPU"); unsigned int j = 0; @@ -90,7 +90,7 @@ void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix vir *energy = 0.5f * pmeGPU->staging.h_virialAndEnergy[j++]; } -void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box) +void pme_gpu_update_input_box(PmeGpu *pmeGPU, const matrix box) { auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU); kernelParamsPtr->step.boxVolume = box[XX][XX] * box[YY][YY] * box[ZZ][ZZ]; @@ -122,13 +122,13 @@ void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box) * * \param[in] pmeGPU The PME GPU structure. */ -static void pme_gpu_reinit_step(const pme_gpu_t *pmeGPU) +static void pme_gpu_reinit_step(const PmeGpu *pmeGPU) { pme_gpu_clear_grids(pmeGPU); pme_gpu_clear_energy_virial(pmeGPU); } -void pme_gpu_finish_step(const pme_gpu_t *pmeGPU, const bool bCalcF, const bool bCalcEnerVir) +void pme_gpu_finish_step(const PmeGpu *pmeGPU, const bool bCalcF, const bool bCalcEnerVir) { if (bCalcF && pme_gpu_performs_gather(pmeGPU)) { @@ -147,7 +147,7 @@ void pme_gpu_finish_step(const pme_gpu_t *pmeGPU, const bool bCalcF, const bool * * \param[in] pmeGPU The PME GPU structure. 
*/ -static void pme_gpu_reinit_grids(pme_gpu_t *pmeGPU) +static void pme_gpu_reinit_grids(PmeGpu *pmeGPU) { auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU); kernelParamsPtr->grid.ewaldFactor = (M_PI * M_PI) / (pmeGPU->common->ewaldcoeff_q * pmeGPU->common->ewaldcoeff_q); @@ -197,7 +197,7 @@ static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme) { /* TODO: Consider refactoring the CPU PME code to use the same structure, * so that this function becomes 2 lines */ - pme_gpu_t *pmeGPU = pme->gpu; + PmeGpu *pmeGPU = pme->gpu; pmeGPU->common->ngrids = pme->ngrids; pmeGPU->common->epsilon_r = pme->epsilon_r; pmeGPU->common->ewaldcoeff_q = pme->ewaldcoeff_q; @@ -290,9 +290,9 @@ static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx:: GMX_THROW(gmx::NotImplementedError(errorString)); } - pme->gpu = new pme_gpu_t(); - pme_gpu_t *pmeGPU = pme->gpu; - pmeGPU->common = std::shared_ptr<pme_shared_t>(new pme_shared_t()); + pme->gpu = new PmeGpu(); + PmeGpu *pmeGPU = pme->gpu; + pmeGPU->common = std::shared_ptr<PmeShared>(new PmeShared()); /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */ /* A convenience variable. */ @@ -318,7 +318,7 @@ static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx:: kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGPU->common->epsilon_r; } -void pme_gpu_transform_spline_atom_data(const pme_gpu_t *pmeGPU, const pme_atomcomm_t *atc, +void pme_gpu_transform_spline_atom_data(const PmeGpu *pmeGPU, const pme_atomcomm_t *atc, PmeSplineDataType type, int dimIndex, PmeLayoutTransform transform) { // The GPU atom spline data is laid out in a different way currently than the CPU one. @@ -376,7 +376,7 @@ void pme_gpu_transform_spline_atom_data(const pme_gpu_t *pmeGPU, const pme_atomc } } -void pme_gpu_get_real_grid_sizes(const pme_gpu_t *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize) +void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize) { GMX_ASSERT(gridSize != nullptr, ""); GMX_ASSERT(paddedGridSize != nullptr, ""); @@ -417,7 +417,7 @@ void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLog pme_gpu_reinit_step(pme->gpu); } -void pme_gpu_destroy(pme_gpu_t *pmeGPU) +void pme_gpu_destroy(PmeGpu *pmeGPU) { /* Free lots of data */ pme_gpu_free_energy_virial(pmeGPU); @@ -439,7 +439,7 @@ void pme_gpu_destroy(pme_gpu_t *pmeGPU) delete pmeGPU; } -void pme_gpu_reinit_atoms(pme_gpu_t *pmeGPU, const int nAtoms, const real *charges) +void pme_gpu_reinit_atoms(PmeGpu *pmeGPU, const int nAtoms, const real *charges) { auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU); kernelParamsPtr->atoms.nAtoms = nAtoms; diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h index ee53312184..91ab374d97 100644 --- a/src/gromacs/ewald/pme-gpu-internal.h +++ b/src/gromacs/ewald/pme-gpu-internal.h @@ -49,7 +49,7 @@ #include "gromacs/fft/fft.h" // for the gmx_fft_direction enum #include "gromacs/gpu_utils/gpu_macros.h" // for the CUDA_FUNC_ macros -#include "pme-gpu-types.h" // for the inline functions accessing pme_gpu_t members +#include "pme-gpu-types.h" // for the inline functions accessing PmeGpu members struct gmx_hw_info_t; struct gmx_gpu_opt_t; @@ -115,7 +115,7 @@ const int c_virialAndEnergyCount = 7; * \param[in] pmeGPU The PME GPU structure. * \returns Number of atoms in a single GPU atom data chunk. 
*/ -CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_data_alignment(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1) +CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_data_alignment(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1) /*! \libinternal \brief * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout. @@ -123,28 +123,28 @@ CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_data_alignment(const pme_gpu_t *CUDA_FU * \param[in] pmeGPU The PME GPU structure. * \returns Number of atoms in a single GPU atom spline data chunk. */ -CUDA_FUNC_QUALIFIER int pme_gpu_get_atoms_per_warp(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1) +CUDA_FUNC_QUALIFIER int pme_gpu_get_atoms_per_warp(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1) /*! \libinternal \brief * Synchronizes the current step, waiting for the GPU kernels/transfers to finish. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Allocates the fixed size energy and virial buffer both on GPU and CPU. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_alloc_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_alloc_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Frees the energy and virial memory both on GPU and CPU. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_free_energy_virial(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_free_energy_virial(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Clears the energy and virial memory on GPU with 0. @@ -152,35 +152,35 @@ CUDA_FUNC_QUALIFIER void pme_gpu_free_energy_virial(pme_gpu_t *CUDA_FUNC_ARGUMEN * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_clear_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_clear_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Reallocates and copies the pre-computed B-spline values to the GPU. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers). * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Reallocates the GPU buffer for the PME forces. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Frees the GPU buffer for the PME forces. 
* * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces). @@ -189,7 +189,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT * \param[in] pmeGPU The PME GPU structure. * \param[in] h_forces The input forces rvec buffer. */ -CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), +CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU), const float *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM /*! \libinternal \brief @@ -198,7 +198,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const pme_gpu_t *CUDA_FUNC_AR * \param[in] pmeGPU The PME GPU structure. * \param[out] h_forces The output forces rvec buffer. */ -CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), +CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU), float *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM /*! \libinternal \brief @@ -206,7 +206,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const pme_gpu_t *CUDA_FUNC_A * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed). @@ -215,7 +215,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_forces(const pme_gpu_t *CUDA_FUNC_A * * Needs to be called on every DD step/in the beginning. */ -CUDA_FUNC_QUALIFIER void pme_gpu_realloc_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Copies the input coordinates from the CPU buffer onto the GPU. @@ -225,7 +225,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_realloc_coordinates(const pme_gpu_t *CUDA_FUNC_ * * Needs to be called every MD step. The coordinates are then used in the spline calculation. */ -CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), +CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU), const rvec *CUDA_FUNC_ARGUMENT(h_coordinates)) CUDA_FUNC_TERM /*! \libinternal \brief @@ -233,7 +233,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const pme_gpu_t *CUDA_FU * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_free_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_free_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer. @@ -245,7 +245,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_free_coordinates(const pme_gpu_t *CUDA_FUNC_ARG * Does not need to be done every MD step, only whenever the local charges change. * (So, in the beginning of the run, or on DD step). 
*/ -CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU), const float *CUDA_FUNC_ARGUMENT(h_coefficients)) CUDA_FUNC_TERM /*! \libinternal \brief @@ -253,49 +253,49 @@ CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_input_coefficients(const pme_g * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_free_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_free_coefficients(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Reallocates the buffers on the GPU and the host for the atoms spline data. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_realloc_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_spline_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Frees the buffers on the GPU for the atoms spline data. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_free_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_free_spline_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Reallocates the buffers on the GPU and the host for the particle gridline indices. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grid_indices(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Frees the buffer on the GPU for the particle gridline indices. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_free_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_free_grid_indices(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grids(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grids(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_free_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_free_grids(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Clears the real space grid on the GPU. @@ -303,28 +303,28 @@ CUDA_FUNC_QUALIFIER void pme_gpu_free_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT( * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_clear_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_clear_grids(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU. * * \param[in] pmeGPU The PME GPU structure. 
*/ -CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Frees the pre-computed fractional coordinates' shifts on the GPU. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Waits for the output virial/energy copying to the intermediate CPU buffer to finish. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Copies the input real-space grid from the host to the GPU. @@ -332,7 +332,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_energy_virial(const pme_gpu_t *CUDA * \param[in] pmeGPU The PME GPU structure. * \param[in] h_grid The host-side grid buffer. */ -CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), +CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU), float *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM /*! \libinternal \brief @@ -341,7 +341,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const pme_gpu_t *CUDA_FU * \param[in] pmeGPU The PME GPU structure. * \param[out] h_grid The host-side grid buffer. */ -CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), +CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU), float *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM /*! \libinternal \brief @@ -349,35 +349,35 @@ CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const pme_gpu_t *CUDA_F * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Copies the gather input spline data and gridline indices from the host to the GPU. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Waits for the grid copying to the host-side buffer after spreading to finish. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_sync_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_sync_spread_grid(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Waits for the atom data copying to the intermediate host-side buffer after spline computation to finish. * * \param[in] pmeGPU The PME GPU structure. 
*/ -CUDA_FUNC_QUALIFIER void pme_gpu_sync_spline_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_sync_spline_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Waits for the grid copying to the host-side buffer after solving to finish. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_sync_solve_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_sync_solve_grid(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Does the one-time GPU-framework specific PME initialization. @@ -385,7 +385,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_sync_solve_grid(const pme_gpu_t *CUDA_FUNC_ARGU * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_init_internal(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_init_internal(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Destroys the PME GPU-framework specific data. @@ -393,35 +393,35 @@ CUDA_FUNC_QUALIFIER void pme_gpu_init_internal(pme_gpu_t *CUDA_FUNC_ARGUMENT(pme * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_destroy_specific(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_destroy_specific(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Initializes the PME GPU synchronization events. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_init_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_init_sync_events(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Destroys the PME GPU synchronization events. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_destroy_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_destroy_sync_events(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Initializes the CUDA FFT structures. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_reinit_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_reinit_3dfft(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Destroys the CUDA FFT structures. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /* Several CUDA event-based timing functions that live in pme-timings.cu */ @@ -430,21 +430,21 @@ CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUME * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Updates the internal list of active PME GPU stages (if timings are enabled). * * \param[in] pmeGPU The PME GPU data structure. 
*/ -CUDA_FUNC_QUALIFIER void pme_gpu_reinit_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_reinit_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \brief * Resets the PME GPU timings. To be called at the reset step. * * \param[in] pmeGPU The PME GPU structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM +CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM /*! \libinternal \brief * Copies the PME GPU timings to the gmx_wallclock_gpu_t structure (for log output). To be called at the run end. @@ -452,7 +452,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const pme_gpu_t *CUDA_FUNC_ARGUME * \param[in] pmeGPU The PME GPU structure. * \param[in] timings The gmx_wallclock_gpu_pme_t structure. */ -CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), +CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU), gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM /* The PME stages themselves */ @@ -467,7 +467,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const pme_gpu_t *CUDA_FUNC_ * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed. * \param[in] spreadCharges Should the charges/coefficients be spread on the grid. */ -CUDA_FUNC_QUALIFIER void pme_gpu_spread(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu), +CUDA_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu), int CUDA_FUNC_ARGUMENT(gridIndex), real *CUDA_FUNC_ARGUMENT(h_grid), bool CUDA_FUNC_ARGUMENT(computeSplines), @@ -480,7 +480,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_spread(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeG * \param[in] direction Transform direction (real-to-complex or complex-to-real) * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0. */ -CUDA_FUNC_QUALIFIER void pme_gpu_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu), +CUDA_FUNC_QUALIFIER void pme_gpu_3dfft(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu), enum gmx_fft_direction CUDA_FUNC_ARGUMENT(direction), const int CUDA_FUNC_ARGUMENT(gridIndex)) CUDA_FUNC_TERM @@ -492,7 +492,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT * \param[in] gridOrdering Specifies the dimenion ordering of the complex grid. TODO: store this information? * \param[in] computeEnergyAndVirial Tells if the energy and virial computation should also be performed. */ -CUDA_FUNC_QUALIFIER void pme_gpu_solve(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu), +CUDA_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu), t_complex *CUDA_FUNC_ARGUMENT(h_grid), GridOrdering CUDA_FUNC_ARGUMENT(gridOrdering), bool CUDA_FUNC_ARGUMENT(computeEnergyAndVirial)) CUDA_FUNC_TERM @@ -506,7 +506,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_solve(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGp * TODO: determine efficiency/balance of host/device-side reductions. 
* \param[in] h_grid The host-side grid buffer (used only in testing mode) */ -CUDA_FUNC_QUALIFIER void pme_gpu_gather(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu), +CUDA_FUNC_QUALIFIER void pme_gpu_gather(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu), float *CUDA_FUNC_ARGUMENT(h_forces), PmeForceOutputHandling CUDA_FUNC_ARGUMENT(forceTreatment), const float *CUDA_FUNC_ARGUMENT(h_grid) @@ -521,7 +521,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_gather(const pme_gpu_t *CUDA_FUNC_ARGUMEN * \param[in] pmeGPU The PME GPU structure. * \returns True if PME runs on multiple GPUs, false otherwise. */ -gmx_inline bool pme_gpu_uses_dd(const pme_gpu_t *pmeGPU) +gmx_inline bool pme_gpu_uses_dd(const PmeGpu *pmeGPU) { return !pmeGPU->settings.useDecomposition; } @@ -532,7 +532,7 @@ gmx_inline bool pme_gpu_uses_dd(const pme_gpu_t *pmeGPU) * \param[in] pmeGPU The PME GPU structure. * \returns True if the gathering is performed on GPU, false otherwise. */ -gmx_inline bool pme_gpu_performs_gather(const pme_gpu_t *pmeGPU) +gmx_inline bool pme_gpu_performs_gather(const PmeGpu *pmeGPU) { return pmeGPU->settings.performGPUGather; } @@ -543,7 +543,7 @@ gmx_inline bool pme_gpu_performs_gather(const pme_gpu_t *pmeGPU) * \param[in] pmeGPU The PME GPU structure. * \returns True if FFT is performed on GPU, false otherwise. */ -gmx_inline bool pme_gpu_performs_FFT(const pme_gpu_t *pmeGPU) +gmx_inline bool pme_gpu_performs_FFT(const PmeGpu *pmeGPU) { return pmeGPU->settings.performGPUFFT; } @@ -554,7 +554,7 @@ gmx_inline bool pme_gpu_performs_FFT(const pme_gpu_t *pmeGPU) * \param[in] pmeGPU The PME GPU structure. * \returns True if (un-)wrapping is performed on GPU, false otherwise. */ -gmx_inline bool pme_gpu_performs_wrapping(const pme_gpu_t *pmeGPU) +gmx_inline bool pme_gpu_performs_wrapping(const PmeGpu *pmeGPU) { return pmeGPU->settings.useDecomposition; } @@ -565,7 +565,7 @@ gmx_inline bool pme_gpu_performs_wrapping(const pme_gpu_t *pmeGPU) * \param[in] pmeGPU The PME GPU structure. * \returns True if solving is performed on GPU, false otherwise. */ -gmx_inline bool pme_gpu_performs_solve(const pme_gpu_t *pmeGPU) +gmx_inline bool pme_gpu_performs_solve(const PmeGpu *pmeGPU) { return pmeGPU->settings.performGPUSolve; } @@ -577,7 +577,7 @@ gmx_inline bool pme_gpu_performs_solve(const pme_gpu_t *pmeGPU) * \param[in] pmeGPU The PME GPU structure. * \param[in] testing Should the testing mode be enabled, or disabled. */ -gmx_inline void pme_gpu_set_testing(pme_gpu_t *pmeGPU, bool testing) +gmx_inline void pme_gpu_set_testing(PmeGpu *pmeGPU, bool testing) { pmeGPU->settings.copyAllOutputs = testing; } @@ -588,7 +588,7 @@ gmx_inline void pme_gpu_set_testing(pme_gpu_t *pmeGPU, bool testing) * \param[in] pmeGPU The PME GPU structure. * \returns true if testing mode is enabled, false otherwise. */ -gmx_inline bool pme_gpu_is_testing(const pme_gpu_t *pmeGPU) +gmx_inline bool pme_gpu_is_testing(const PmeGpu *pmeGPU) { return pmeGPU->settings.copyAllOutputs; } @@ -603,7 +603,7 @@ gmx_inline bool pme_gpu_is_testing(const pme_gpu_t *pmeGPU) * \param[out] energy The output energy. * \param[out] virial The output virial matrix. */ -void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix virial); +void pme_gpu_get_energy_virial(const PmeGpu *pmeGPU, real *energy, matrix virial); /*! \libinternal \brief * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_step(). 
@@ -611,7 +611,7 @@ void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix vir * \param[in] pmeGPU The PME GPU structure. * \param[in] box The unit cell box. */ -void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box); +void pme_gpu_update_input_box(PmeGpu *pmeGPU, const matrix box); /*! \libinternal \brief * Finishes the PME GPU step, waiting for the output forces and/or energy/virial to be copied to the host. @@ -620,7 +620,7 @@ void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box); * \param[in] bCalcForces The left-over flag from the CPU code which tells the function to copy the forces to the CPU side. Should be passed to the launch call instead. FIXME * \param[in] bCalcEnerVir The left-over flag from the CPU code which tells the function to copy the energy/virial to the CPU side. Should be passed to the launch call instead. */ -void pme_gpu_finish_step(const pme_gpu_t *pmeGPU, const bool bCalcForces, +void pme_gpu_finish_step(const PmeGpu *pmeGPU, const bool bCalcForces, const bool bCalcEnerVir); //! A binary enum for spline data layout transformation @@ -640,7 +640,7 @@ enum class PmeLayoutTransform * \param[in] dimIndex Dimension index. * \param[in] transform Layout transform type */ -void pme_gpu_transform_spline_atom_data(const pme_gpu_t *pmeGPU, const pme_atomcomm_t *atc, +void pme_gpu_transform_spline_atom_data(const PmeGpu *pmeGPU, const pme_atomcomm_t *atc, PmeSplineDataType type, int dimIndex, PmeLayoutTransform transform); /*! \libinternal \brief @@ -650,7 +650,7 @@ void pme_gpu_transform_spline_atom_data(const pme_gpu_t *pmeGPU, const pme_atomc * \param[out] gridSize Pointer to the grid dimensions to fill in. * \param[out] paddedGridSize Pointer to the padded grid dimensions to fill in. */ -void pme_gpu_get_real_grid_sizes(const pme_gpu_t *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize); +void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize); /*! \libinternal \brief * (Re-)initializes the PME GPU data at the beginning of the run or on DLB. @@ -668,7 +668,7 @@ void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLog * * \param[in] pmeGPU The PME GPU structure. */ -void pme_gpu_destroy(pme_gpu_t *pmeGPU); +void pme_gpu_destroy(PmeGpu *pmeGPU); /*! \libinternal \brief * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU. @@ -680,7 +680,7 @@ void pme_gpu_destroy(pme_gpu_t *pmeGPU); * This is a function that should only be called in the beginning of the run and on domain decomposition. * Should be called before the pme_gpu_set_io_ranges. */ -void pme_gpu_reinit_atoms(pme_gpu_t *pmeGPU, +void pme_gpu_reinit_atoms(PmeGpu *pmeGPU, const int nAtoms, const real *charges); diff --git a/src/gromacs/ewald/pme-gpu-types.h b/src/gromacs/ewald/pme-gpu-types.h index 04e5481466..ced974c087 100644 --- a/src/gromacs/ewald/pme-gpu-types.h +++ b/src/gromacs/ewald/pme-gpu-types.h @@ -41,8 +41,8 @@ * most of the initial PME CUDA implementation is merged * into the master branch (likely, after release 2017). * This should include: - * -- bringing the structure/function names up to guidelines - * ---- pme_gpu_settings_t -> PmeGpuTasks + * -- bringing the function names up to guidelines + * -- PmeGpuSettings -> PmeGpuTasks * -- refining GPU notation application (#2053) * -- renaming coefficients to charges (?) 
* @@ -84,20 +84,20 @@ enum class PmeForceOutputHandling #if GMX_GPU == GMX_GPU_CUDA -struct pme_gpu_cuda_t; +struct PmeGpuCuda; /*! \brief A typedef for including the GPU host data by pointer */ -typedef pme_gpu_cuda_t pme_gpu_specific_t; +typedef PmeGpuCuda PmeGpuSpecific; -struct pme_gpu_cuda_kernel_params_t; +struct PmeGpuCudaKernelParams; /*! \brief A typedef for including the GPU kernel arguments data by pointer */ -typedef pme_gpu_cuda_kernel_params_t pme_gpu_kernel_params_t; +typedef PmeGpuCudaKernelParams PmeGpuKernelParams; #else /*! \brief A dummy typedef for the GPU host data placeholder on non-GPU builds */ -typedef int pme_gpu_specific_t; +typedef int PmeGpuSpecific; /*! \brief A dummy typedef for the GPU kernel arguments data placeholder on non-GPU builds */ -typedef int pme_gpu_kernel_params_t; +typedef int PmeGpuKernelParams; #endif @@ -105,14 +105,14 @@ typedef int pme_gpu_kernel_params_t; * sorted into several device-side structures depending on the update rate. * This is GPU agnostic (float3 replaced by float[3], etc.). * The GPU-framework specifics (e.g. cudaTextureObject_t handles) are described - * in the larger structure pme_gpu_cuda_kernel_params_t in the pme.cuh. + * in the larger structure PmeGpuCudaKernelParams in the pme.cuh. */ /*! \internal \brief * A GPU data structure for storing the constant PME data. * This only has to be initialized once. */ -struct pme_gpu_const_params_t +struct PmeGpuConstParams { /*! \brief Electrostatics coefficient = ONE_4PI_EPS0 / pme->epsilon_r */ float elFactor; @@ -125,7 +125,7 @@ struct pme_gpu_const_params_t * A GPU data structure for storing the PME data related to the grid sizes and cut-off. * This only has to be updated at every DD step. */ -struct pme_gpu_grid_params_t +struct PmeGpuGridParams { /* Grid sizes */ /*! \brief Real-space grid data dimensions. */ @@ -168,7 +168,7 @@ struct pme_gpu_grid_params_t * A GPU data structure for storing the PME data of the atoms, local to this process' domain partition. * This only has to be updated every DD step. */ -struct pme_gpu_atom_params_t +struct PmeGpuAtomParams { /*! \brief Number of local atoms */ int nAtoms; @@ -203,7 +203,7 @@ struct pme_gpu_atom_params_t /*! \internal \brief * A GPU data structure for storing the PME data which might change every MD step. */ -struct pme_gpu_step_params_t +struct PmeGpuStepParams { /* The box parameters. The box only changes size each step with pressure coupling enabled. */ /*! \brief @@ -221,20 +221,20 @@ struct pme_gpu_step_params_t /*! \internal \brief * A single structure encompassing almost all the PME data used in GPU kernels on device. * This is inherited by the GPU framework-specific structure - * (pme_gpu_cuda_kernel_params_t in pme.cuh). + * (PmeGpuCudaKernelParams in pme.cuh). * This way, most code preparing the kernel parameters can be GPU-agnostic by casting - * the kernel parameter data pointer to pme_gpu_kernel_params_base_t. + * the kernel parameter data pointer to PmeGpuKernelParamsBase. */ -struct pme_gpu_kernel_params_base_t +struct PmeGpuKernelParamsBase { /*! \brief Constant data that is set once. */ - pme_gpu_const_params_t constants; + PmeGpuConstParams constants; /*! \brief Data dependent on the grid size/cutoff. */ - pme_gpu_grid_params_t grid; + PmeGpuGridParams grid; /*! \brief Data dependent on the DD and local atoms. */ - pme_gpu_atom_params_t atoms; + PmeGpuAtomParams atoms; /*! \brief Data that possibly changes on every MD step. 
*/ - pme_gpu_step_params_t step; + PmeGpuStepParams step; }; /* Here are the host-side structures */ @@ -242,7 +242,7 @@ struct pme_gpu_kernel_params_base_t /*! \internal \brief * The PME GPU settings structure, included in the main PME GPU structure by value. */ -struct pme_gpu_settings_t +struct PmeGpuSettings { /* Permanent settings set on initialization */ /*! \brief A boolean which tells if the solving is performed on GPU. Currently always true */ @@ -265,7 +265,7 @@ struct pme_gpu_settings_t * The PME GPU intermediate buffers structure, included in the main PME GPU structure by value. * Buffers are managed by the PME GPU module. */ -struct pme_gpu_staging_t +struct PmeGpuStaging { /*! \brief Virial and energy intermediate host-side buffer. Size is PME_GPU_VIRIAL_AND_ENERGY_COUNT. */ float *h_virialAndEnergy; @@ -289,7 +289,7 @@ struct pme_gpu_staging_t * TODO: use the shared data with the PME CPU. * Included in the main PME GPU structure by value. */ -struct pme_shared_t +struct PmeShared { /*! \brief Grid count - currently always 1 on GPU */ int ngrids; @@ -325,18 +325,18 @@ struct pme_shared_t /*! \internal \brief * The main PME GPU host structure, included in the PME CPU structure by pointer. */ -struct pme_gpu_t +struct PmeGpu { /*! \brief The information copied once per reinit from the CPU structure. */ - std::shared_ptr<pme_shared_t> common; // TODO: make the CPU structure use the same type + std::shared_ptr<PmeShared> common; // TODO: make the CPU structure use the same type /*! \brief The settings. */ - pme_gpu_settings_t settings; + PmeGpuSettings settings; /*! \brief The host-side buffers. * The device-side buffers are buried in kernelParams, but that will have to change. */ - pme_gpu_staging_t staging; + PmeGpuStaging staging; /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT. * Used for kernel scheduling. @@ -362,10 +362,10 @@ struct pme_gpu_t * \todo Test whether this should be copied to the constant GPU memory once per MD step * (or even less often with no box updates) instead of being an argument. */ - std::shared_ptr<pme_gpu_kernel_params_t> kernelParams; + std::shared_ptr<PmeGpuKernelParams> kernelParams; /*! \brief The pointer to GPU-framework specific host-side data, such as CUDA streams and events. 
*/ - std::shared_ptr<pme_gpu_specific_t> archSpecific; /* FIXME: make it an unique_ptr */ + std::shared_ptr<PmeGpuSpecific> archSpecific; /* FIXME: make it an unique_ptr */ }; #endif diff --git a/src/gromacs/ewald/pme-gpu.cpp b/src/gromacs/ewald/pme-gpu.cpp index fd97065b06..0ba9603b11 100644 --- a/src/gromacs/ewald/pme-gpu.cpp +++ b/src/gromacs/ewald/pme-gpu.cpp @@ -176,7 +176,7 @@ void pme_gpu_prepare_step(gmx_pme_t *pme, GMX_ASSERT(pme->nnodes > 0, ""); GMX_ASSERT(pme->nnodes == 1 || pme->ndecompdim > 0, ""); - pme_gpu_t *pmeGpu = pme->gpu; + PmeGpu *pmeGpu = pme->gpu; pmeGpu->settings.stepFlags = flags; // TODO these flags are only here to honor the CPU PME code, and probably should be removed @@ -216,7 +216,7 @@ void pme_gpu_launch_spread(gmx_pme_t *pme, { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); - pme_gpu_t *pmeGpu = pme->gpu; + PmeGpu *pmeGpu = pme->gpu; // The only spot of PME GPU where LAUNCH_GPU (sub)counter increases call-count wallcycle_start(wcycle, ewcLAUNCH_GPU); @@ -243,7 +243,7 @@ void pme_gpu_launch_spread(gmx_pme_t *pme, void pme_gpu_launch_complex_transforms(gmx_pme_t *pme, gmx_wallcycle_t wcycle) { - pme_gpu_t *pmeGpu = pme->gpu; + PmeGpu *pmeGpu = pme->gpu; const bool computeEnergyAndVirial = pmeGpu->settings.stepFlags & GMX_PME_CALC_ENER_VIR; const bool performBackFFT = pmeGpu->settings.stepFlags & (GMX_PME_CALC_F | GMX_PME_CALC_POT); const unsigned int gridIndex = 0; diff --git a/src/gromacs/ewald/pme-internal.h b/src/gromacs/ewald/pme-internal.h index 4530acde70..b7eac93368 100644 --- a/src/gromacs/ewald/pme-internal.h +++ b/src/gromacs/ewald/pme-internal.h @@ -66,7 +66,7 @@ typedef struct gmx_parallel_3dfft *gmx_parallel_3dfft_t; struct t_commrec; struct t_inputrec; -struct pme_gpu_t; +struct PmeGpu; //@{ //! Grid indices for A state for charge and Lennard-Jones C6 @@ -263,7 +263,7 @@ struct gmx_pme_t { * and ideally not be duplicated here. */ - pme_gpu_t *gpu; /* A pointer to the GPU data. + PmeGpu *gpu; /* A pointer to the GPU data. * TODO: this should be unique or a shared pointer. * Currently in practice there is a single gmx_pme_t instance while a code * is partially set up for many of them. 
The PME tuning calls gmx_pme_reinit() diff --git a/src/gromacs/ewald/pme-solve.cu b/src/gromacs/ewald/pme-solve.cu index aa75eb8401..6f68453a5a 100644 --- a/src/gromacs/ewald/pme-solve.cu +++ b/src/gromacs/ewald/pme-solve.cu @@ -81,7 +81,7 @@ template< bool computeEnergyAndVirial > __launch_bounds__(c_solveMaxThreadsPerBlock) -__global__ void pme_solve_kernel(const struct pme_gpu_cuda_kernel_params_t kernelParams) +__global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParams) { /* This kernel supports 2 different grid dimension orderings: YZX and XYZ */ int majorDim, middleDim, minorDim; @@ -417,7 +417,7 @@ __global__ void pme_solve_kernel(const struct pme_gpu_cuda_kernel_params_t kerne } } -void pme_gpu_solve(const pme_gpu_t *pmeGpu, t_complex *h_grid, +void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid, GridOrdering gridOrdering, bool computeEnergyAndVirial) { const bool copyInputAndOutputGrid = pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_FFT(pmeGpu); diff --git a/src/gromacs/ewald/pme-spread.cu b/src/gromacs/ewald/pme-spread.cu index eb52077dcb..3e969d572b 100644 --- a/src/gromacs/ewald/pme-spread.cu +++ b/src/gromacs/ewald/pme-spread.cu @@ -107,7 +107,7 @@ template __device__ __forceinline__ -void pme_gpu_stage_atom_data(const pme_gpu_cuda_kernel_params_t kernelParams, +void pme_gpu_stage_atom_data(const PmeGpuCudaKernelParams kernelParams, T * __restrict__ sm_destination, const T * __restrict__ gm_source) { @@ -141,7 +141,7 @@ void pme_gpu_stage_atom_data(const pme_gpu_cuda_kernel_params_t kernelParams, */ template -__device__ __forceinline__ void calculate_splines(const pme_gpu_cuda_kernel_params_t kernelParams, +__device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams kernelParams, const int atomIndexOffset, const float3 * __restrict__ sm_coordinates, const float * __restrict__ sm_coefficients, @@ -354,7 +354,7 @@ __device__ __forceinline__ void calculate_splines(const pme_gpu_cuda_kernel_para */ template < const int order, const bool wrapX, const bool wrapY> -__device__ __forceinline__ void spread_charges(const pme_gpu_cuda_kernel_params_t kernelParams, +__device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kernelParams, int atomIndexOffset, const float * __restrict__ sm_coefficients, const int * __restrict__ sm_gridlineIndices, @@ -444,7 +444,7 @@ template < const bool wrapY > __launch_bounds__(c_spreadMaxThreadsPerBlock) -__global__ void pme_spline_and_spread_kernel(const pme_gpu_cuda_kernel_params_t kernelParams) +__global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams) { const int atomsPerBlock = c_spreadMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM; // Gridline indices, ivec @@ -491,7 +491,7 @@ __global__ void pme_spline_and_spread_kernel(const pme_gpu_cuda_kernel_params_t } } -void pme_gpu_spread(const pme_gpu_t *pmeGpu, +void pme_gpu_spread(const PmeGpu *pmeGpu, int gmx_unused gridIndex, real *h_grid, bool computeSplines, diff --git a/src/gromacs/ewald/pme-timings.cu b/src/gromacs/ewald/pme-timings.cu index 09d6d18337..ee47c27ec7 100644 --- a/src/gromacs/ewald/pme-timings.cu +++ b/src/gromacs/ewald/pme-timings.cu @@ -53,12 +53,12 @@ * \param[in] pme The PME data structure. * \returns True if timings are enabled, false otherwise. 
*/ -gmx_inline bool pme_gpu_timings_enabled(const pme_gpu_t *pmeGPU) +gmx_inline bool pme_gpu_timings_enabled(const PmeGpu *pmeGPU) { return pmeGPU->archSpecific->useTiming; } -void pme_gpu_start_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId) +void pme_gpu_start_timing(const PmeGpu *pmeGPU, size_t PMEStageId) { if (pme_gpu_timings_enabled(pmeGPU)) { @@ -67,7 +67,7 @@ void pme_gpu_start_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId) } } -void pme_gpu_stop_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId) +void pme_gpu_stop_timing(const PmeGpu *pmeGPU, size_t PMEStageId) { if (pme_gpu_timings_enabled(pmeGPU)) { @@ -76,7 +76,7 @@ void pme_gpu_stop_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId) } } -void pme_gpu_get_timings(const pme_gpu_t *pmeGPU, gmx_wallclock_gpu_pme_t *timings) +void pme_gpu_get_timings(const PmeGpu *pmeGPU, gmx_wallclock_gpu_pme_t *timings) { if (pme_gpu_timings_enabled(pmeGPU)) { @@ -89,7 +89,7 @@ void pme_gpu_get_timings(const pme_gpu_t *pmeGPU, gmx_wallclock_gpu_pme_t *timin } } -void pme_gpu_update_timings(const pme_gpu_t *pmeGPU) +void pme_gpu_update_timings(const PmeGpu *pmeGPU) { if (pme_gpu_timings_enabled(pmeGPU)) { @@ -102,7 +102,7 @@ void pme_gpu_update_timings(const pme_gpu_t *pmeGPU) } } -void pme_gpu_reinit_timings(const pme_gpu_t *pmeGPU) +void pme_gpu_reinit_timings(const PmeGpu *pmeGPU) { if (pme_gpu_timings_enabled(pmeGPU)) { @@ -125,7 +125,7 @@ void pme_gpu_reinit_timings(const pme_gpu_t *pmeGPU) } } -void pme_gpu_reset_timings(const pme_gpu_t *pmeGPU) +void pme_gpu_reset_timings(const PmeGpu *pmeGPU) { if (pme_gpu_timings_enabled(pmeGPU)) { diff --git a/src/gromacs/ewald/pme-timings.cuh b/src/gromacs/ewald/pme-timings.cuh index 8c94e7f817..72a70b87e7 100644 --- a/src/gromacs/ewald/pme-timings.cuh +++ b/src/gromacs/ewald/pme-timings.cuh @@ -45,7 +45,7 @@ #include "gromacs/gpu_utils/gpuregiontimer.cuh" #include "gromacs/timing/gpu_timing.h" // TODO: move include to the source files -struct pme_gpu_t; +struct PmeGpu; /*! \libinternal \brief * Starts timing the certain PME GPU stage during a single step (if timings are enabled). @@ -53,7 +53,7 @@ struct pme_gpu_t; * \param[in] pmeGPU The PME GPU data structure. * \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h */ -void pme_gpu_start_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId); +void pme_gpu_start_timing(const PmeGpu *pmeGPU, size_t PMEStageId); /*! \libinternal \brief * Stops timing the certain PME GPU stage during a single step (if timings are enabled). @@ -61,6 +61,6 @@ void pme_gpu_start_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId); * \param[in] pmeGPU The PME GPU data structure. 
* \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h */ -void pme_gpu_stop_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId); +void pme_gpu_stop_timing(const PmeGpu *pmeGPU, size_t PMEStageId); #endif diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp index ef2a5cf6fc..812754f24b 100644 --- a/src/gromacs/ewald/pme.cpp +++ b/src/gromacs/ewald/pme.cpp @@ -517,7 +517,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec *cr, real ewaldcoeff_lj, int nthread, PmeRunMode runMode, - pme_gpu_t *pmeGPU, + PmeGpu *pmeGPU, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog) { diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu index 14bef5ce5d..1a59b01e4d 100644 --- a/src/gromacs/ewald/pme.cu +++ b/src/gromacs/ewald/pme.cu @@ -56,27 +56,27 @@ #include "pme-3dfft.cuh" #include "pme-grid.h" -int pme_gpu_get_atom_data_alignment(const pme_gpu_t *pmeGPU) +int pme_gpu_get_atom_data_alignment(const PmeGpu *pmeGPU) { const int order = pmeGPU->common->pme_order; GMX_ASSERT(order > 0, "Invalid PME order"); return PME_ATOM_DATA_ALIGNMENT; } -int pme_gpu_get_atoms_per_warp(const pme_gpu_t *pmeGPU) +int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGPU) { const int order = pmeGPU->common->pme_order; GMX_ASSERT(order > 0, "Invalid PME order"); return PME_SPREADGATHER_ATOMS_PER_WARP; } -void pme_gpu_synchronize(const pme_gpu_t *pmeGPU) +void pme_gpu_synchronize(const PmeGpu *pmeGPU) { cudaError_t stat = cudaStreamSynchronize(pmeGPU->archSpecific->pmeStream); CU_RET_ERR(stat, "Failed to synchronize the PME GPU stream!"); } -void pme_gpu_alloc_energy_virial(const pme_gpu_t *pmeGPU) +void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGPU) { const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float); cudaError_t stat = cudaMalloc((void **)&pmeGPU->kernelParams->constants.d_virialAndEnergy, energyAndVirialSize); @@ -84,7 +84,7 @@ void pme_gpu_alloc_energy_virial(const pme_gpu_t *pmeGPU) pmalloc((void **)&pmeGPU->staging.h_virialAndEnergy, energyAndVirialSize); } -void pme_gpu_free_energy_virial(pme_gpu_t *pmeGPU) +void pme_gpu_free_energy_virial(PmeGpu *pmeGPU) { cudaError_t stat = cudaFree(pmeGPU->kernelParams->constants.d_virialAndEnergy); CU_RET_ERR(stat, "cudaFree failed on PME energy and virial"); @@ -93,14 +93,14 @@ void pme_gpu_free_energy_virial(pme_gpu_t *pmeGPU) pmeGPU->staging.h_virialAndEnergy = nullptr; } -void pme_gpu_clear_energy_virial(const pme_gpu_t *pmeGPU) +void pme_gpu_clear_energy_virial(const PmeGpu *pmeGPU) { cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->constants.d_virialAndEnergy, 0, c_virialAndEnergyCount * sizeof(float), pmeGPU->archSpecific->pmeStream); CU_RET_ERR(stat, "PME energy/virial cudaMemsetAsync error"); } -void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *pmeGPU) +void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGPU) { const int splineValuesOffset[DIM] = { 0, @@ -130,14 +130,14 @@ void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *pmeGPU) newSplineValuesSize * sizeof(float), pmeGPU->archSpecific->pmeStream); } -void pme_gpu_free_bspline_values(const pme_gpu_t *pmeGPU) +void pme_gpu_free_bspline_values(const PmeGpu *pmeGPU) { pfree(pmeGPU->staging.h_splineModuli); cu_free_buffered(pmeGPU->kernelParams->grid.d_splineModuli, &pmeGPU->archSpecific->splineValuesSize, &pmeGPU->archSpecific->splineValuesSizeAlloc); } -void pme_gpu_realloc_forces(const pme_gpu_t *pmeGPU) +void pme_gpu_realloc_forces(const PmeGpu *pmeGPU) { const size_t newForcesSize = 
pmeGPU->nAtomsAlloc * DIM; GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU"); @@ -145,12 +145,12 @@ void pme_gpu_realloc_forces(const pme_gpu_t *pmeGPU) &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc, newForcesSize, pmeGPU->archSpecific->pmeStream, true); } -void pme_gpu_free_forces(const pme_gpu_t *pmeGPU) +void pme_gpu_free_forces(const PmeGpu *pmeGPU) { cu_free_buffered(pmeGPU->kernelParams->atoms.d_forces, &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc); } -void pme_gpu_copy_input_forces(const pme_gpu_t *pmeGPU, const float *h_forces) +void pme_gpu_copy_input_forces(const PmeGpu *pmeGPU, const float *h_forces) { GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU"); const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float); @@ -158,7 +158,7 @@ void pme_gpu_copy_input_forces(const pme_gpu_t *pmeGPU, const float *h_forces) cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->archSpecific->pmeStream); } -void pme_gpu_copy_output_forces(const pme_gpu_t *pmeGPU, float *h_forces) +void pme_gpu_copy_output_forces(const PmeGpu *pmeGPU, float *h_forces) { GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU"); const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float); @@ -168,13 +168,13 @@ void pme_gpu_copy_output_forces(const pme_gpu_t *pmeGPU, float *h_forces) CU_RET_ERR(stat, "PME gather forces synchronization failure"); } -void pme_gpu_sync_output_forces(const pme_gpu_t *pmeGPU) +void pme_gpu_sync_output_forces(const PmeGpu *pmeGPU) { cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncForcesD2H); CU_RET_ERR(stat, "Error while waiting for the PME GPU forces"); } -void pme_gpu_realloc_coordinates(const pme_gpu_t *pmeGPU) +void pme_gpu_realloc_coordinates(const PmeGpu *pmeGPU) { const size_t newCoordinatesSize = pmeGPU->nAtomsAlloc * DIM; GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU"); @@ -192,7 +192,7 @@ void pme_gpu_realloc_coordinates(const pme_gpu_t *pmeGPU) } } -void pme_gpu_copy_input_coordinates(const pme_gpu_t *pmeGPU, const rvec *h_coordinates) +void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGPU, const rvec *h_coordinates) { GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU"); #if GMX_DOUBLE @@ -204,12 +204,12 @@ void pme_gpu_copy_input_coordinates(const pme_gpu_t *pmeGPU, const rvec *h_coord #endif } -void pme_gpu_free_coordinates(const pme_gpu_t *pmeGPU) +void pme_gpu_free_coordinates(const PmeGpu *pmeGPU) { cu_free_buffered(pmeGPU->kernelParams->atoms.d_coordinates, &pmeGPU->archSpecific->coordinatesSize, &pmeGPU->archSpecific->coordinatesSizeAlloc); } -void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *pmeGPU, const float *h_coefficients) +void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGPU, const float *h_coefficients) { GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU"); const size_t newCoefficientsSize = pmeGPU->nAtomsAlloc; @@ -231,12 +231,12 @@ void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *pmeGPU, const } } -void pme_gpu_free_coefficients(const pme_gpu_t *pmeGPU) +void pme_gpu_free_coefficients(const PmeGpu *pmeGPU) { cu_free_buffered(pmeGPU->kernelParams->atoms.d_coefficients, &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc); } -void pme_gpu_realloc_spline_data(const pme_gpu_t *pmeGPU) +void pme_gpu_realloc_spline_data(const 
PmeGpu *pmeGPU) { const int order = pmeGPU->common->pme_order; const int alignment = pme_gpu_get_atoms_per_warp(pmeGPU); @@ -261,7 +261,7 @@ void pme_gpu_realloc_spline_data(const pme_gpu_t *pmeGPU) } } -void pme_gpu_free_spline_data(const pme_gpu_t *pmeGPU) +void pme_gpu_free_spline_data(const PmeGpu *pmeGPU) { /* Two arrays of the same size */ cu_free_buffered(pmeGPU->kernelParams->atoms.d_theta); @@ -270,7 +270,7 @@ void pme_gpu_free_spline_data(const pme_gpu_t *pmeGPU) pfree(pmeGPU->staging.h_dtheta); } -void pme_gpu_realloc_grid_indices(const pme_gpu_t *pmeGPU) +void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGPU) { const size_t newIndicesSize = DIM * pmeGPU->nAtomsAlloc; GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU"); @@ -280,13 +280,13 @@ void pme_gpu_realloc_grid_indices(const pme_gpu_t *pmeGPU) pmalloc((void **)&pmeGPU->staging.h_gridlineIndices, newIndicesSize * sizeof(int)); } -void pme_gpu_free_grid_indices(const pme_gpu_t *pmeGPU) +void pme_gpu_free_grid_indices(const PmeGpu *pmeGPU) { cu_free_buffered(pmeGPU->kernelParams->atoms.d_gridlineIndices, &pmeGPU->archSpecific->gridlineIndicesSize, &pmeGPU->archSpecific->gridlineIndicesSizeAlloc); pfree(pmeGPU->staging.h_gridlineIndices); } -void pme_gpu_realloc_grids(pme_gpu_t *pmeGPU) +void pme_gpu_realloc_grids(PmeGpu *pmeGPU) { auto *kernelParamsPtr = pmeGPU->kernelParams.get(); const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX] * @@ -319,7 +319,7 @@ void pme_gpu_realloc_grids(pme_gpu_t *pmeGPU) } } -void pme_gpu_free_grids(const pme_gpu_t *pmeGPU) +void pme_gpu_free_grids(const PmeGpu *pmeGPU) { if (pmeGPU->archSpecific->performOutOfPlaceFFT) { @@ -329,7 +329,7 @@ void pme_gpu_free_grids(const pme_gpu_t *pmeGPU) &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc); } -void pme_gpu_clear_grids(const pme_gpu_t *pmeGPU) +void pme_gpu_clear_grids(const PmeGpu *pmeGPU) { cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->grid.d_realGrid, 0, pmeGPU->archSpecific->realGridSize * sizeof(float), pmeGPU->archSpecific->pmeStream); @@ -337,7 +337,7 @@ void pme_gpu_clear_grids(const pme_gpu_t *pmeGPU) CU_RET_ERR(stat, "cudaMemsetAsync on the PME grid error"); } -void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *pmeGPU) +void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu *pmeGPU) { pme_gpu_free_fract_shifts(pmeGPU); @@ -368,7 +368,7 @@ void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *pmeGPU) pmeGPU->deviceInfo); } -void pme_gpu_free_fract_shifts(const pme_gpu_t *pmeGPU) +void pme_gpu_free_fract_shifts(const PmeGpu *pmeGPU) { auto *kernelParamsPtr = pmeGPU->kernelParams.get(); destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable, @@ -381,7 +381,7 @@ void pme_gpu_free_fract_shifts(const pme_gpu_t *pmeGPU) pmeGPU->deviceInfo); } -void pme_gpu_sync_output_energy_virial(const pme_gpu_t *pmeGPU) +void pme_gpu_sync_output_energy_virial(const PmeGpu *pmeGPU) { cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncEnerVirD2H); CU_RET_ERR(stat, "Error while waiting for PME solve output"); @@ -392,13 +392,13 @@ void pme_gpu_sync_output_energy_virial(const pme_gpu_t *pmeGPU) } } -void pme_gpu_copy_input_gather_grid(const pme_gpu_t *pmeGpu, float *h_grid) +void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu, float *h_grid) { const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float); cu_copy_H2D_async(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->archSpecific->pmeStream); } -void 
pme_gpu_copy_output_spread_grid(const pme_gpu_t *pmeGpu, float *h_grid) +void pme_gpu_copy_output_spread_grid(const PmeGpu *pmeGpu, float *h_grid) { const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float); cu_copy_D2H_async(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->archSpecific->pmeStream); @@ -406,7 +406,7 @@ void pme_gpu_copy_output_spread_grid(const pme_gpu_t *pmeGpu, float *h_grid) CU_RET_ERR(stat, "PME spread grid sync event record failure"); } -void pme_gpu_copy_output_spread_atom_data(const pme_gpu_t *pmeGpu) +void pme_gpu_copy_output_spread_atom_data(const PmeGpu *pmeGpu) { const int alignment = pme_gpu_get_atoms_per_warp(pmeGpu); const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment; @@ -420,7 +420,7 @@ void pme_gpu_copy_output_spread_atom_data(const pme_gpu_t *pmeGpu) CU_RET_ERR(stat, "PME spread atom data sync event record failure"); } -void pme_gpu_copy_input_gather_atom_data(const pme_gpu_t *pmeGpu) +void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu) { const int alignment = pme_gpu_get_atoms_per_warp(pmeGpu); const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment; @@ -444,30 +444,30 @@ void pme_gpu_copy_input_gather_atom_data(const pme_gpu_t *pmeGpu) kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream); } -void pme_gpu_sync_spread_grid(const pme_gpu_t *pmeGPU) +void pme_gpu_sync_spread_grid(const PmeGpu *pmeGPU) { cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSpreadGridD2H); CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host"); } -void pme_gpu_sync_spline_atom_data(const pme_gpu_t *pmeGPU) +void pme_gpu_sync_spline_atom_data(const PmeGpu *pmeGPU) { cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSplineAtomDataD2H); CU_RET_ERR(stat, "Error while waiting for the PME GPU atom data to be copied to the host"); } -void pme_gpu_sync_solve_grid(const pme_gpu_t *pmeGPU) +void pme_gpu_sync_solve_grid(const PmeGpu *pmeGPU) { cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSolveGridD2H); CU_RET_ERR(stat, "Error while waiting for the PME GPU solve grid to be copied to the host"); //should check for pme_gpu_performs_solve(pmeGPU) } -void pme_gpu_init_internal(pme_gpu_t *pmeGPU) +void pme_gpu_init_internal(PmeGpu *pmeGPU) { /* Allocate the target-specific structures */ - pmeGPU->archSpecific.reset(new pme_gpu_specific_t()); - pmeGPU->kernelParams.reset(new pme_gpu_kernel_params_t()); + pmeGPU->archSpecific.reset(new PmeGpuSpecific()); + pmeGPU->kernelParams.reset(new PmeGpuKernelParams()); pmeGPU->archSpecific->performOutOfPlaceFFT = true; /* This should give better performance, according to the cuFFT documentation. 
@@ -492,14 +492,14 @@ void pme_gpu_init_internal(pme_gpu_t *pmeGPU) CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed"); } -void pme_gpu_destroy_specific(const pme_gpu_t *pmeGPU) +void pme_gpu_destroy_specific(const PmeGpu *pmeGPU) { /* Destroy the CUDA stream */ cudaError_t stat = cudaStreamDestroy(pmeGPU->archSpecific->pmeStream); CU_RET_ERR(stat, "PME cudaStreamDestroy error"); } -void pme_gpu_init_sync_events(const pme_gpu_t *pmeGPU) +void pme_gpu_init_sync_events(const PmeGpu *pmeGPU) { cudaError_t stat; const auto eventFlags = cudaEventDisableTiming; @@ -515,7 +515,7 @@ void pme_gpu_init_sync_events(const pme_gpu_t *pmeGPU) CU_RET_ERR(stat, "cudaEventCreate on syncSolveGridD2H failed"); } -void pme_gpu_destroy_sync_events(const pme_gpu_t *pmeGPU) +void pme_gpu_destroy_sync_events(const PmeGpu *pmeGPU) { cudaError_t stat; stat = cudaEventDestroy(pmeGPU->archSpecific->syncEnerVirD2H); @@ -530,7 +530,7 @@ void pme_gpu_destroy_sync_events(const pme_gpu_t *pmeGPU) CU_RET_ERR(stat, "cudaEventDestroy failed on syncSolveGridD2H"); } -void pme_gpu_reinit_3dfft(const pme_gpu_t *pmeGPU) +void pme_gpu_reinit_3dfft(const PmeGpu *pmeGPU) { if (pme_gpu_performs_FFT(pmeGPU)) { @@ -542,7 +542,7 @@ void pme_gpu_reinit_3dfft(const pme_gpu_t *pmeGPU) } } -void pme_gpu_destroy_3dfft(const pme_gpu_t *pmeGPU) +void pme_gpu_destroy_3dfft(const PmeGpu *pmeGPU) { pmeGPU->archSpecific->fftSetup.resize(0); } diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh index 71fb758a1c..001caf7686 100644 --- a/src/gromacs/ewald/pme.cuh +++ b/src/gromacs/ewald/pme.cuh @@ -140,7 +140,7 @@ int __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient /*! \brief \internal * The main PME CUDA-specific host data structure, included in the PME GPU structure by the archSpecific pointer. */ -struct pme_gpu_cuda_t +struct PmeGpuCuda { /*! \brief The CUDA stream where everything related to the PME happens. */ cudaStream_t pmeStream; @@ -220,10 +220,10 @@ struct pme_gpu_cuda_t /*! \brief \internal * A single structure encompassing all the PME data used in CUDA kernels. - * This inherits from pme_gpu_kernel_params_base_t and adds a couple cudaTextureObject_t handles, + * This inherits from PmeGpuKernelParamsBase and adds a couple cudaTextureObject_t handles, * which we would like to avoid in plain C++. */ -struct pme_gpu_cuda_kernel_params_t : pme_gpu_kernel_params_base_t +struct PmeGpuCudaKernelParams : PmeGpuKernelParamsBase { /* These are CUDA texture objects, related to the grid size. */ /*! \brief CUDA texture object for accessing grid.d_fractShiftsTable */ diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index af9fd7c77b..65dcd22bc5 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -58,7 +58,7 @@ struct interaction_const_t; struct t_commrec; struct t_inputrec; struct t_nrnb; -struct pme_gpu_t; +struct PmeGpu; struct gmx_wallclock_gpu_pme_t; struct gmx_device_info_t; struct gmx_pme_t; @@ -106,7 +106,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec *cr, real ewaldcoeff_q, real ewaldcoeff_lj, int nthread, PmeRunMode runMode, - pme_gpu_t *pmeGPU, + PmeGpu *pmeGPU, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog); -- 2.11.4.GIT