/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file contains internal function implementations
 * for performing the PME calculations on GPU.
 *
 * Note that this file is compiled as regular C++ source in OpenCL builds, but
 * it is treated as CUDA source in CUDA-enabled GPU builds.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 * \ingroup module_ewald
 */
#include "pme_gpu_internal.h"

#include "gromacs/ewald/ewald_utils.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/math/units.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"

#if GMX_GPU == GMX_GPU_CUDA
#    include "gromacs/gpu_utils/pmalloc_cuda.h"
#elif GMX_GPU == GMX_GPU_OPENCL
#    include "gromacs/gpu_utils/gmxopencl.h"
#endif

#include "gromacs/ewald/pme.h"

#include "pme_gpu_3dfft.h"
#include "pme_gpu_constants.h"
#include "pme_gpu_program_impl.h"
#include "pme_gpu_timings.h"
#include "pme_gpu_types.h"
#include "pme_gpu_types_host.h"
#include "pme_gpu_types_host_impl.h"
#include "pme_gpu_utils.h"
#include "pme_internal.h"
#include "pme_solve.h"
/*! \brief
 * Atom limit above which it is advantageous to turn on the
 * recalculation of the splines in the gather and to use fewer threads per atom in the spline and spread kernels.
 */
constexpr int c_pmeGpuPerformanceAtomLimit = 23000;
/*! \internal \brief
 * Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure.
 *
 * \param[in] pmeGpu  The PME GPU structure.
 * \returns The pointer to the kernel parameters.
 */
static PmeGpuKernelParamsBase* pme_gpu_get_kernel_params_base_ptr(const PmeGpu* pmeGpu)
{
    // reinterpret_cast is needed because the derived CUDA structure is not known in this file
    auto* kernelParamsPtr = reinterpret_cast<PmeGpuKernelParamsBase*>(pmeGpu->kernelParams.get());
    return kernelParamsPtr;
}
int pme_gpu_get_atom_data_alignment(const PmeGpu* /*unused*/)
{
    // TODO: this can be simplified, as c_pmeAtomDataAlignment is now constant
    return c_pmeAtomDataAlignment;
}
int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
{
    if (pmeGpu->settings.useOrderThreadsPerAtom)
    {
        return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
    }
    else
    {
        return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
    }
}
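
/* Illustrative note (assumed example values, not part of the original code path): with a warp size
 * of 32, c_pmeSpreadGatherThreadsPerAtom == 16 (order*order) and
 * c_pmeSpreadGatherThreadsPerAtom4ThPerAtom == 4 (order), the function above yields
 * 32 / 16 = 2 atoms per warp in the default mode, and 32 / 4 = 8 atoms per warp when
 * useOrderThreadsPerAtom is set. The authoritative values live in pme_gpu_constants.h and the
 * program handle's warpSize; these numbers are only for illustration.
 */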
void pme_gpu_synchronize(const PmeGpu* pmeGpu)
{
    gpuStreamSynchronize(pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu)
{
    const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
    allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount,
                         pmeGpu->archSpecific->context);
    pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_virialAndEnergy), energyAndVirialSize);
}

void pme_gpu_free_energy_virial(PmeGpu* pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy);
    pfree(pmeGpu->staging.h_virialAndEnergy);
    pmeGpu->staging.h_virialAndEnergy = nullptr;
}

void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu)
{
    clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy, 0,
                           c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu)
{
    const int splineValuesOffset[DIM] = { 0, pmeGpu->kernelParams->grid.realGridSize[XX],
                                          pmeGpu->kernelParams->grid.realGridSize[XX]
                                                  + pmeGpu->kernelParams->grid.realGridSize[YY] };
    memcpy(&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));

    const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX]
                                    + pmeGpu->kernelParams->grid.realGridSize[YY]
                                    + pmeGpu->kernelParams->grid.realGridSize[ZZ];
    const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
                           &pmeGpu->archSpecific->splineValuesSize,
                           &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->context);
    if (shouldRealloc)
    {
        /* Reallocate the host buffer */
        pfree(pmeGpu->staging.h_splineModuli);
        pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_splineModuli),
                newSplineValuesSize * sizeof(float));
    }
    for (int i = 0; i < DIM; i++)
    {
        memcpy(pmeGpu->staging.h_splineModuli + splineValuesOffset[i],
               pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float));
    }
    /* TODO: pin original buffer instead! */
    copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli,
                       0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream,
                       pmeGpu->settings.transferKind, nullptr);
}

void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu)
{
    pfree(pmeGpu->staging.h_splineModuli);
    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli);
}
void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
{
    const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
    GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
                           &pmeGpu->archSpecific->forcesSize,
                           &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->context);
    pmeGpu->staging.h_forces.reserveWithPadding(pmeGpu->nAtomsAlloc);
    pmeGpu->staging.h_forces.resizeWithPadding(pmeGpu->kernelParams->atoms.nAtoms);
}

void pme_gpu_free_forces(const PmeGpu* pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces);
}

void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
{
    GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
    float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
    copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, h_forcesFloat, 0,
                       DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream,
                       pmeGpu->settings.transferKind, nullptr);
}

void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
{
    GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
    float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
    copyFromDeviceBuffer(h_forcesFloat, &pmeGpu->kernelParams->atoms.d_forces, 0,
                         DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream,
                         pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients)
{
    GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
    const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
                           &pmeGpu->archSpecific->coefficientsSize,
                           &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->context);
    copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients,
                       const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);

    const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
    const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
    if (paddingCount > 0)
    {
        clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
                               paddingCount, pmeGpu->archSpecific->pmeStream);
    }
}

void pme_gpu_free_coefficients(const PmeGpu* pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients);
}
void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu)
{
    const int    order             = pmeGpu->common->pme_order;
    const int    alignment         = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded      = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const int    newSplineDataSize = DIM * order * nAtomsPadded;
    GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
    /* Two arrays of the same size */
    const bool shouldRealloc        = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
    int        currentSizeTemp      = pmeGpu->archSpecific->splineDataSize;
    int        currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
                           &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->context);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
                           &pmeGpu->archSpecific->splineDataSize,
                           &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->context);
    // the host side reallocation
    if (shouldRealloc)
    {
        pfree(pmeGpu->staging.h_theta);
        pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_theta), newSplineDataSize * sizeof(float));
        pfree(pmeGpu->staging.h_dtheta);
        pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_dtheta), newSplineDataSize * sizeof(float));
    }
}
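
/* Illustrative note (assumed example values, not part of the original code): with order = 4,
 * an atoms-per-warp alignment of 2, and nAtomsAlloc = 1000, the padded atom count is
 * ((1000 + 2 - 1) / 2) * 2 = 1000, so each of d_theta/d_dtheta above holds
 * DIM * order * nAtomsPadded = 3 * 4 * 1000 = 12000 floats.
 */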
void pme_gpu_free_spline_data(const PmeGpu* pmeGpu)
{
    /* Two arrays of the same size */
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta);
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta);
    pfree(pmeGpu->staging.h_theta);
    pfree(pmeGpu->staging.h_dtheta);
}

void pme_gpu_realloc_grid_indices(PmeGpu* pmeGpu)
{
    const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
                           &pmeGpu->archSpecific->gridlineIndicesSize,
                           &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->context);
    pfree(pmeGpu->staging.h_gridlineIndices);
    pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_gridlineIndices), newIndicesSize * sizeof(int));
}

void pme_gpu_free_grid_indices(const PmeGpu* pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices);
    pfree(pmeGpu->staging.h_gridlineIndices);
}
void pme_gpu_realloc_grids(PmeGpu* pmeGpu)
{
    auto*     kernelParamsPtr = pmeGpu->kernelParams.get();
    const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX]
                                * kernelParamsPtr->grid.realGridSizePadded[YY]
                                * kernelParamsPtr->grid.realGridSizePadded[ZZ];
    const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX]
                                   * kernelParamsPtr->grid.complexGridSizePadded[YY]
                                   * kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
    // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float
    if (pmeGpu->archSpecific->performOutOfPlaceFFT)
    {
        /* 2 separate grids */
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
                               &pmeGpu->archSpecific->complexGridSize,
                               &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->context);
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
                               &pmeGpu->archSpecific->realGridSize,
                               &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
    }
    else
    {
        /* A single buffer so that any grid will fit */
        const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
        reallocateDeviceBuffer(
                &kernelParamsPtr->grid.d_realGrid, newGridsSize, &pmeGpu->archSpecific->realGridSize,
                &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
        kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
        pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
        // the size might get used later for copying the grid
    }
}

void pme_gpu_free_grids(const PmeGpu* pmeGpu)
{
    if (pmeGpu->archSpecific->performOutOfPlaceFFT)
    {
        freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid);
    }
    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid);
}

void pme_gpu_clear_grids(const PmeGpu* pmeGpu)
{
    clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid, 0,
                           pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu)
{
    pme_gpu_free_fract_shifts(pmeGpu);

    auto* kernelParamsPtr = pmeGpu->kernelParams.get();

    const int nx                  = kernelParamsPtr->grid.realGridSize[XX];
    const int ny                  = kernelParamsPtr->grid.realGridSize[YY];
    const int nz                  = kernelParamsPtr->grid.realGridSize[ZZ];
    const int cellCount           = c_pmeNeighborUnitcellCount;
    const int gridDataOffset[DIM] = { 0, cellCount * nx, cellCount * (nx + ny) };

    memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset));

    const int newFractShiftsSize = cellCount * (nx + ny + nz);

#if GMX_GPU == GMX_GPU_CUDA
    initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable, kernelParamsPtr->fractShiftsTableTexture,
                         pmeGpu->common->fsh.data(), newFractShiftsSize);

    initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                         kernelParamsPtr->gridlineIndicesTableTexture, pmeGpu->common->nn.data(),
                         newFractShiftsSize);
#elif GMX_GPU == GMX_GPU_OPENCL
    // No dedicated texture routines....
    allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize,
                         pmeGpu->archSpecific->context);
    allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize,
                         pmeGpu->archSpecific->context);
    copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(), 0,
                       newFractShiftsSize, pmeGpu->archSpecific->pmeStream,
                       GpuApiCallBehavior::Async, nullptr);
    copyToDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, pmeGpu->common->nn.data(), 0,
                       newFractShiftsSize, pmeGpu->archSpecific->pmeStream,
                       GpuApiCallBehavior::Async, nullptr);
#endif
}

void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu)
{
    auto* kernelParamsPtr = pmeGpu->kernelParams.get();
#if GMX_GPU == GMX_GPU_CUDA
    destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                            kernelParamsPtr->fractShiftsTableTexture);
    destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                            kernelParamsPtr->gridlineIndicesTableTexture);
#elif GMX_GPU == GMX_GPU_OPENCL
    freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
    freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
#endif
}

bool pme_gpu_stream_query(const PmeGpu* pmeGpu)
{
    return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid)
{
    copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid, h_grid, 0, pmeGpu->archSpecific->realGridSize,
                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
}

void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid)
{
    copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid, 0,
                         pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream,
                         pmeGpu->settings.transferKind, nullptr);
    pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesCount    = DIM * nAtomsPadded * pmeGpu->common->pme_order;
    auto*        kernelParamsPtr = pmeGpu->kernelParams.get();
    copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount,
                         pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
    copyFromDeviceBuffer(pmeGpu->staging.h_theta, &kernelParamsPtr->atoms.d_theta, 0, splinesCount,
                         pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
    copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices, &kernelParamsPtr->atoms.d_gridlineIndices,
                         0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream,
                         pmeGpu->settings.transferKind, nullptr);
}

void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesCount    = DIM * nAtomsPadded * pmeGpu->common->pme_order;
    auto*        kernelParamsPtr = pmeGpu->kernelParams.get();

    if (c_usePadding)
    {
        // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
        clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0,
                               pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream);
        clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
                               pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
                               pmeGpu->archSpecific->pmeStream);
        clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
                               pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
                               pmeGpu->archSpecific->pmeStream);
    }
    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount,
                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount,
                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
                       0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream,
                       pmeGpu->settings.transferKind, nullptr);
}

void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu)
{
    pmeGpu->archSpecific->syncSpreadGridD2H.waitForEvent();
}
void pme_gpu_init_internal(PmeGpu* pmeGpu)
{
#if GMX_GPU == GMX_GPU_CUDA
    // Prepare to use the device that this PME task was assigned earlier.
    // Other entities, such as CUDA timing events, are known to implicitly use the device context.
    CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");
#endif

    /* Allocate the target-specific structures */
    pmeGpu->archSpecific.reset(new PmeGpuSpecific());
    pmeGpu->kernelParams.reset(new PmeGpuKernelParams());

    pmeGpu->archSpecific->performOutOfPlaceFFT = true;
    /* This should give better performance, according to the cuFFT documentation.
     * The performance seems to be the same though.
     * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
     */

    // TODO: this is just a convenient reuse because programHandle_ currently is in charge of creating context
    pmeGpu->archSpecific->context = pmeGpu->programHandle_->impl_->context;

    // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?) and reuse in NB
    if (GMX_GPU == GMX_GPU_CUDA)
    {
        /* WARNING: CUDA timings are incorrect with multiple streams.
         * This is the main reason why they are disabled by default.
         */
        // TODO: Consider turning on by default when we can detect nr of streams.
        pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
    }
    else if (GMX_GPU == GMX_GPU_OPENCL)
    {
        pmeGpu->archSpecific->useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
    }

#if GMX_GPU == GMX_GPU_CUDA
    pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
#elif GMX_GPU == GMX_GPU_OPENCL
    pmeGpu->maxGridWidthX = INT32_MAX / 2;
    // TODO: is there no really global work size limit in OpenCL?
#endif

    /* Creating a PME GPU stream:
     * - default high priority with CUDA
     * - no priorities implemented yet with OpenCL; see #2532
     */
#if GMX_GPU == GMX_GPU_CUDA
    cudaError_t stat;
    int         highest_priority, lowest_priority;
    stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
    CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
    stat = cudaStreamCreateWithPriority(&pmeGpu->archSpecific->pmeStream,
                                        cudaStreamDefault, // cudaStreamNonBlocking,
                                        highest_priority);
    CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
#elif GMX_GPU == GMX_GPU_OPENCL
    cl_command_queue_properties queueProperties =
            pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
    cl_device_id device_id = pmeGpu->deviceInfo->ocl_gpu_id.ocl_device_id;
    cl_int       clError;
    pmeGpu->archSpecific->pmeStream =
            clCreateCommandQueue(pmeGpu->archSpecific->context, device_id, queueProperties, &clError);
    if (clError != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("Failed to create PME command queue"));
    }
#endif
}
void pme_gpu_destroy_specific(const PmeGpu* pmeGpu)
{
#if GMX_GPU == GMX_GPU_CUDA
    /* Destroy the CUDA stream */
    cudaError_t stat = cudaStreamDestroy(pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME cudaStreamDestroy error");
#elif GMX_GPU == GMX_GPU_OPENCL
    cl_int clError = clReleaseCommandQueue(pmeGpu->archSpecific->pmeStream);
    if (clError != CL_SUCCESS)
    {
        gmx_warning("Failed to destroy PME command queue");
    }
#endif
}

void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
{
    if (pme_gpu_settings(pmeGpu).performGPUFFT)
    {
        pmeGpu->archSpecific->fftSetup.resize(0);
        for (int i = 0; i < pmeGpu->common->ngrids; i++)
        {
            pmeGpu->archSpecific->fftSetup.push_back(std::make_unique<GpuParallel3dFft>(pmeGpu));
        }
    }
}

void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu)
{
    pmeGpu->archSpecific->fftSetup.resize(0);
}
int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
{
    if (order != c_pmeGpuOrder)
    {
        throw order;
    }
    constexpr int fixedOrder = c_pmeGpuOrder;
    GMX_UNUSED_VALUE(fixedOrder);

    const int atomWarpIndex = atomIndex % atomsPerWarp;
    const int warpIndex     = atomIndex / atomsPerWarp;
    int       indexBase, result;
    switch (atomsPerWarp)
    {
        case 1:
            indexBase = getSplineParamIndexBase<fixedOrder, 1>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 1>(indexBase, dimIndex, splineIndex);
            break;

        case 2:
            indexBase = getSplineParamIndexBase<fixedOrder, 2>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 2>(indexBase, dimIndex, splineIndex);
            break;

        case 4:
            indexBase = getSplineParamIndexBase<fixedOrder, 4>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 4>(indexBase, dimIndex, splineIndex);
            break;

        case 8:
            indexBase = getSplineParamIndexBase<fixedOrder, 8>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 8>(indexBase, dimIndex, splineIndex);
            break;

        default:
            GMX_THROW(gmx::NotImplementedError(
                    gmx::formatString("Test function call not unrolled for atomsPerWarp = %d in "
                                      "getSplineParamFullIndex",
                                      atomsPerWarp)));
    }
    return result;
}
void pme_gpu_getEnergyAndVirial(const gmx_pme_t& pme, PmeOutput* output)
{
    const PmeGpu* pmeGpu = pme.gpu;
    for (int j = 0; j < c_virialAndEnergyCount; j++)
    {
        GMX_ASSERT(std::isfinite(pmeGpu->staging.h_virialAndEnergy[j]),
                   "PME GPU produces incorrect energy/virial.");
    }

    unsigned int j                 = 0;
    output->coulombVirial_[XX][XX] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
    output->coulombVirial_[YY][YY] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
    output->coulombVirial_[ZZ][ZZ] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
    output->coulombVirial_[XX][YY] = output->coulombVirial_[YY][XX] =
            0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
    output->coulombVirial_[XX][ZZ] = output->coulombVirial_[ZZ][XX] =
            0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
    output->coulombVirial_[YY][ZZ] = output->coulombVirial_[ZZ][YY] =
            0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
    output->coulombEnergy_ = 0.5F * pmeGpu->staging.h_virialAndEnergy[j++];
}
/*! \brief Sets the force-related members in \p output
 *
 * \param[in]   pmeGpu      PME GPU data structure
 * \param[out]  output      Pointer to PME output data structure
 */
static void pme_gpu_getForceOutput(PmeGpu* pmeGpu, PmeOutput* output)
{
    output->haveForceOutput_ = !pmeGpu->settings.useGpuForceReduction;
    if (output->haveForceOutput_)
    {
        output->forces_ = pmeGpu->staging.h_forces;
    }
}

PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const int flags)
{
    PmeGpu*    pmeGpu                      = pme.gpu;
    const bool haveComputedEnergyAndVirial = (flags & GMX_PME_CALC_ENER_VIR) != 0;

    PmeOutput output;

    pme_gpu_getForceOutput(pmeGpu, &output);

    // The caller knows from the flags that the energy and the virial are not usable
    // on the else branch
    if (haveComputedEnergyAndVirial)
    {
        if (pme_gpu_settings(pmeGpu).performGPUSolve)
        {
            pme_gpu_getEnergyAndVirial(pme, &output);
        }
        else
        {
            get_pme_ener_vir_q(pme.solve_work, pme.nthread, &output);
        }
    }
    return output;
}
void pme_gpu_update_input_box(PmeGpu gmx_unused* pmeGpu, const matrix gmx_unused box)
{
#if GMX_DOUBLE
    GMX_THROW(gmx::NotImplementedError("PME is implemented for single-precision only on GPU"));
#else
    matrix scaledBox;
    pmeGpu->common->boxScaler->scaleBox(box, scaledBox);
    auto* kernelParamsPtr              = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->current.boxVolume = scaledBox[XX][XX] * scaledBox[YY][YY] * scaledBox[ZZ][ZZ];
    GMX_ASSERT(kernelParamsPtr->current.boxVolume != 0.0F, "Zero volume of the unit cell");
    matrix recipBox;
    gmx::invertBoxMatrix(scaledBox, recipBox);

    /* The GPU recipBox is transposed as compared to the CPU recipBox.
     * Spread uses matrix columns (while solve and gather use rows).
     * There is no particular reason for this; it might be further rethought/optimized for better access patterns.
     */
    const real newRecipBox[DIM][DIM] = { { recipBox[XX][XX], recipBox[YY][XX], recipBox[ZZ][XX] },
                                         { 0.0, recipBox[YY][YY], recipBox[ZZ][YY] },
                                         { 0.0, 0.0, recipBox[ZZ][ZZ] } };
    memcpy(kernelParamsPtr->current.recipBox, newRecipBox, sizeof(matrix));
#endif
}
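
/* Illustrative note (example symbols are placeholders): for a triclinic box the CPU reciprocal box
 * is lower-triangular,
 *     recipBox    = [ rxx  0    0   ]
 *                   [ ryx  ryy  0   ]
 *                   [ rzx  rzy  rzz ],
 * and the transposed copy stored for the GPU above is upper-triangular,
 *     newRecipBox = [ rxx  ryx  rzx ]
 *                   [ 0    ryy  rzy ]
 *                   [ 0    0    rzz ],
 * so the spread kernel effectively reads the columns of the CPU-layout matrix as contiguous rows.
 */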
/*! \brief \libinternal
 * (Re-)initializes all the PME GPU data related to the grid size and cut-off.
 *
 * \param[in] pmeGpu            The PME GPU structure.
 */
static void pme_gpu_reinit_grids(PmeGpu* pmeGpu)
{
    auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->grid.ewaldFactor =
            (M_PI * M_PI) / (pmeGpu->common->ewaldcoeff_q * pmeGpu->common->ewaldcoeff_q);

    /* The grid size variants */
    for (int i = 0; i < DIM; i++)
    {
        kernelParamsPtr->grid.realGridSize[i] = pmeGpu->common->nk[i];
        kernelParamsPtr->grid.realGridSizeFP[i] =
                static_cast<float>(kernelParamsPtr->grid.realGridSize[i]);
        kernelParamsPtr->grid.realGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];

        // The complex grid currently uses no padding;
        // if it starts to do so, then another test should be added for that
        kernelParamsPtr->grid.complexGridSize[i]       = kernelParamsPtr->grid.realGridSize[i];
        kernelParamsPtr->grid.complexGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
    }
    /* FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
    if (!pme_gpu_settings(pmeGpu).performGPUFFT)
    {
        // This allows for GPU spreading grid and CPU fftgrid to have the same layout, so that we can copy the data directly
        kernelParamsPtr->grid.realGridSizePadded[ZZ] =
                (kernelParamsPtr->grid.realGridSize[ZZ] / 2 + 1) * 2;
    }

    /* GPU FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
    kernelParamsPtr->grid.complexGridSize[ZZ] /= 2;
    kernelParamsPtr->grid.complexGridSize[ZZ]++;
    kernelParamsPtr->grid.complexGridSizePadded[ZZ] = kernelParamsPtr->grid.complexGridSize[ZZ];

    pme_gpu_realloc_and_copy_fract_shifts(pmeGpu);
    pme_gpu_realloc_and_copy_bspline_values(pmeGpu);
    pme_gpu_realloc_grids(pmeGpu);
    pme_gpu_reinit_3dfft(pmeGpu);
}
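
/* Illustrative note (example numbers are assumptions): for a real grid of nz = 64 along the minor
 * dimension, the complex grid above gets 64 / 2 + 1 = 33 complex elements, and in the CPU-FFT case
 * the padded real grid gets (64 / 2 + 1) * 2 = 66 real elements, matching the layout of the CPU
 * fftgrid so the data can be copied without repacking.
 */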
/* Several GPU functions that refer to the CPU PME data live here.
 * We would like to keep these away from the GPU-framework specific code for clarity,
 * as well as compilation issues with MPI.
 */

/*! \brief \libinternal
 * Copies everything useful from the PME CPU to the PME GPU structure.
 * The goal is to minimize interaction with the PME CPU structure in the GPU code.
 *
 * \param[in] pme         The PME structure.
 */
static void pme_gpu_copy_common_data_from(const gmx_pme_t* pme)
{
    /* TODO: Consider refactoring the CPU PME code to use the same structure,
     * so that this function becomes 2 lines */
    PmeGpu* pmeGpu               = pme->gpu;
    pmeGpu->common->ngrids       = pme->ngrids;
    pmeGpu->common->epsilon_r    = pme->epsilon_r;
    pmeGpu->common->ewaldcoeff_q = pme->ewaldcoeff_q;
    pmeGpu->common->nk[XX]       = pme->nkx;
    pmeGpu->common->nk[YY]       = pme->nky;
    pmeGpu->common->nk[ZZ]       = pme->nkz;
    pmeGpu->common->pme_order    = pme->pme_order;
    if (pmeGpu->common->pme_order != c_pmeGpuOrder)
    {
        GMX_THROW(gmx::NotImplementedError("pme_order != 4 is not implemented!"));
    }
    for (int i = 0; i < DIM; i++)
    {
        pmeGpu->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGpu->common->nk[i]);
    }
    const int cellCount = c_pmeNeighborUnitcellCount;
    pmeGpu->common->fsh.resize(0);
    pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshx, pme->fshx + cellCount * pme->nkx);
    pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshy, pme->fshy + cellCount * pme->nky);
    pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshz, pme->fshz + cellCount * pme->nkz);
    pmeGpu->common->nn.resize(0);
    pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnx, pme->nnx + cellCount * pme->nkx);
    pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nny, pme->nny + cellCount * pme->nky);
    pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz);
    pmeGpu->common->runMode       = pme->runMode;
    pmeGpu->common->isRankPmeOnly = !pme->bPPnode;
    pmeGpu->common->boxScaler     = pme->boxScaler;
}
/*! \libinternal \brief
 * Uses heuristics to select the best performing PME spread and gather kernels.
 *
 * \param[in,out] pmeGpu         The PME GPU structure.
 */
static void pme_gpu_select_best_performing_pme_spreadgather_kernels(PmeGpu* pmeGpu)
{
    if (pmeGpu->kernelParams->atoms.nAtoms > c_pmeGpuPerformanceAtomLimit && (GMX_GPU == GMX_GPU_CUDA))
    {
        pmeGpu->settings.useOrderThreadsPerAtom = true;
        pmeGpu->settings.recalculateSplines     = true;
    }
    else
    {
        pmeGpu->settings.useOrderThreadsPerAtom = false;
        pmeGpu->settings.recalculateSplines     = false;
    }
}
/*! \libinternal \brief
 * Initializes the PME GPU data at the beginning of the run.
 * TODO: this should become PmeGpu::PmeGpu()
 *
 * \param[in,out] pme            The PME structure.
 * \param[in,out] gpuInfo        The GPU information structure.
 * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
 */
static void pme_gpu_init(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, const PmeGpuProgram* pmeGpuProgram)
{
    pme->gpu       = new PmeGpu();
    PmeGpu* pmeGpu = pme->gpu;
    changePinningPolicy(&pmeGpu->staging.h_forces, pme_get_pinning_policy());
    pmeGpu->common = std::make_shared<PmeShared>();

    /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */
    /* A convenience variable. */
    pmeGpu->settings.useDecomposition = (pme->nnodes != 1);
    /* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout. */
    pmeGpu->settings.performGPUGather = true;
    // By default GPU-side reduction is off (explicitly set here for tests, otherwise reset per-step)
    pmeGpu->settings.useGpuForceReduction = false;

    pme_gpu_set_testing(pmeGpu, false);

    pmeGpu->deviceInfo = gpuInfo;
    GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
    pmeGpu->programHandle_ = pmeGpuProgram;

    pmeGpu->initializedClfftLibrary_ = std::make_unique<gmx::ClfftInitializer>();

    pme_gpu_init_internal(pmeGpu);
    pme_gpu_alloc_energy_virial(pmeGpu);

    pme_gpu_copy_common_data_from(pme);

    GMX_ASSERT(pmeGpu->common->epsilon_r != 0.0F, "PME GPU: bad electrostatic coefficient");

    auto* kernelParamsPtr               = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGpu->common->epsilon_r;
}
void pme_gpu_transform_spline_atom_data(const PmeGpu*      pmeGpu,
                                        const PmeAtomComm* atc,
                                        PmeSplineDataType  type,
                                        int                dimIndex,
                                        PmeLayoutTransform transform)
{
    // The GPU atom spline data is laid out in a different way currently than the CPU one.
    // This function converts the data from GPU to CPU layout (in the host memory).
    // It is only intended for testing purposes so far.
    // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their
    // performance (e.g. spreading on GPU, gathering on CPU).
    GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
    const uintmax_t threadIndex  = 0;
    const auto      atomCount    = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->atoms.nAtoms;
    const auto      atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
    const auto      pmeOrder     = pmeGpu->common->pme_order;
    GMX_ASSERT(pmeOrder == c_pmeGpuOrder, "Only PME order 4 is implemented");

    real*  cpuSplineBuffer;
    float* h_splineBuffer;
    switch (type)
    {
        case PmeSplineDataType::Values:
            cpuSplineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
            h_splineBuffer  = pmeGpu->staging.h_theta;
            break;

        case PmeSplineDataType::Derivatives:
            cpuSplineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
            h_splineBuffer  = pmeGpu->staging.h_dtheta;
            break;

        default: GMX_THROW(gmx::InternalError("Unknown spline data type"));
    }

    for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
    {
        for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
        {
            const auto gpuValueIndex =
                    getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, atomIndex, atomsPerWarp);
            const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
            GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder,
                       "Atom spline data index out of bounds (while transforming GPU data layout "
                       "for host)");
            switch (transform)
            {
                case PmeLayoutTransform::GpuToHost:
                    cpuSplineBuffer[cpuValueIndex] = h_splineBuffer[gpuValueIndex];
                    break;

                case PmeLayoutTransform::HostToGpu:
                    h_splineBuffer[gpuValueIndex] = cpuSplineBuffer[cpuValueIndex];
                    break;

                default: GMX_THROW(gmx::InternalError("Unknown layout transform"));
            }
        }
    }
}
void pme_gpu_get_real_grid_sizes(const PmeGpu* pmeGpu, gmx::IVec* gridSize, gmx::IVec* paddedGridSize)
{
    GMX_ASSERT(gridSize != nullptr, "");
    GMX_ASSERT(paddedGridSize != nullptr, "");
    GMX_ASSERT(pmeGpu != nullptr, "");
    auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    for (int i = 0; i < DIM; i++)
    {
        (*gridSize)[i]       = kernelParamsPtr->grid.realGridSize[i];
        (*paddedGridSize)[i] = kernelParamsPtr->grid.realGridSizePadded[i];
    }
}
void pme_gpu_reinit(gmx_pme_t* pme, const gmx_device_info_t* gpuInfo, const PmeGpuProgram* pmeGpuProgram)
{
    GMX_ASSERT(pme != nullptr, "Need valid PME object");
    if (pme->runMode == PmeRunMode::CPU)
    {
        GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object");
        return;
    }

    if (!pme->gpu)
    {
        /* First-time initialization */
        pme_gpu_init(pme, gpuInfo, pmeGpuProgram);
    }
    else
    {
        /* After this call nothing in the GPU code should refer to the gmx_pme_t *pme itself - until the next pme_gpu_reinit */
        pme_gpu_copy_common_data_from(pme);
    }
    /* GPU FFT will only get used for a single rank. */
    pme->gpu->settings.performGPUFFT =
            (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme->gpu->settings.useDecomposition;
    pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU);

    /* Reinit active timers */
    pme_gpu_reinit_timings(pme->gpu);

    pme_gpu_reinit_grids(pme->gpu);
    // Note: if timing the reinit launch overhead becomes more relevant
    // (e.g. with regular PP-PME re-balancing), we should pass wcycle here.
    pme_gpu_reinit_computation(pme, nullptr);
    /* Clear the previous box - doesn't hurt, and forces the PME CPU recipbox
     * update for mixed mode on grid switch. TODO: use shared recipbox field.
     */
    std::memset(pme->gpu->common->previousBox, 0, sizeof(pme->gpu->common->previousBox));
    pme_gpu_select_best_performing_pme_spreadgather_kernels(pme->gpu);
}
void pme_gpu_destroy(PmeGpu* pmeGpu)
{
    /* Free lots of data */
    pme_gpu_free_energy_virial(pmeGpu);
    pme_gpu_free_bspline_values(pmeGpu);
    pme_gpu_free_forces(pmeGpu);
    pme_gpu_free_coefficients(pmeGpu);
    pme_gpu_free_spline_data(pmeGpu);
    pme_gpu_free_grid_indices(pmeGpu);
    pme_gpu_free_fract_shifts(pmeGpu);
    pme_gpu_free_grids(pmeGpu);

    pme_gpu_destroy_3dfft(pmeGpu);

    /* Free the GPU-framework specific data last */
    pme_gpu_destroy_specific(pmeGpu);

    delete pmeGpu;
}
void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* charges)
{
    auto* kernelParamsPtr         = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->atoms.nAtoms = nAtoms;
    const int alignment           = pme_gpu_get_atom_data_alignment(pmeGpu);
    pmeGpu->nAtomsPadded          = ((nAtoms + alignment - 1) / alignment) * alignment;
    const int  nAtomsAlloc        = c_usePadding ? pmeGpu->nAtomsPadded : nAtoms;
    const bool haveToRealloc =
            (pmeGpu->nAtomsAlloc < nAtomsAlloc); /* This check might be redundant, but is logical */
    pmeGpu->nAtomsAlloc = nAtomsAlloc;

#if GMX_DOUBLE
    GMX_RELEASE_ASSERT(false, "Only single precision supported");
    GMX_UNUSED_VALUE(charges);
#else
    pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, reinterpret_cast<const float*>(charges));
    /* Could also be checked for haveToRealloc, but the copy always needs to be performed */
#endif

    if (haveToRealloc)
    {
        pme_gpu_realloc_forces(pmeGpu);
        pme_gpu_realloc_spline_data(pmeGpu);
        pme_gpu_realloc_grid_indices(pmeGpu);
    }
}
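
/* Illustrative note (assumed example values): the round-up above computes the smallest multiple of
 * the atom data alignment that can hold nAtoms; e.g. with an alignment of 64 and nAtoms = 1000,
 * nAtomsPadded = ((1000 + 64 - 1) / 64) * 64 = 1024, and the 24 padding entries receive zero charge
 * through the buffer clearing in pme_gpu_realloc_and_copy_input_coefficients().
 */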
/*! \internal \brief
 * Returns raw timing event from the corresponding GpuRegionTimer (if timings are enabled).
 * In CUDA result can be nullptr stub, per GpuRegionTimer implementation.
 *
 * \param[in] pmeGpu         The PME GPU data structure.
 * \param[in] PMEStageId     The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h
 */
static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, size_t PMEStageId)
{
    CommandEvent* timingEvent = nullptr;
    if (pme_gpu_timings_enabled(pmeGpu))
    {
        GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
                   "Wrong PME GPU timing event index");
        timingEvent = pmeGpu->archSpecific->timingEvents[PMEStageId].fetchNextEvent();
    }
    return timingEvent;
}

void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, int grid_index)
{
    int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;

    pme_gpu_start_timing(pmeGpu, timerId);
    pmeGpu->archSpecific->fftSetup[grid_index]->perform3dFft(
            dir, pme_gpu_fetch_timing_event(pmeGpu, timerId));
    pme_gpu_stop_timing(pmeGpu, timerId);
}
/*! \brief
 * Given possibly large \p blockCount, returns a compact 1D or 2D grid for kernel scheduling,
 * to minimize number of unused blocks.
 */
std::pair<int, int> inline pmeGpuCreateGrid(const PmeGpu* pmeGpu, int blockCount)
{
    // How many maximum widths in X do we need (hopefully just one)
    const int minRowCount = (blockCount + pmeGpu->maxGridWidthX - 1) / pmeGpu->maxGridWidthX;
    // Trying to make things even
    const int colCount = (blockCount + minRowCount - 1) / minRowCount;
    GMX_ASSERT((colCount * minRowCount - blockCount) >= 0, "pmeGpuCreateGrid: totally wrong");
    GMX_ASSERT((colCount * minRowCount - blockCount) < minRowCount,
               "pmeGpuCreateGrid: excessive blocks");
    return std::pair<int, int>(colCount, minRowCount);
}
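
/* Illustrative note (assumed example values): if maxGridWidthX were 65535 and blockCount were
 * 100000, then minRowCount = (100000 + 65534) / 65535 = 2 and colCount = (100000 + 1) / 2 = 50000,
 * i.e. the launch grid becomes 50000 x 2 blocks, with 0 <= unused blocks < minRowCount.
 */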
/*! \brief
 * Returns a pointer to the appropriate spline and spread kernel based on the input bool values
 *
 * \param[in]  pmeGpu                   The PME GPU structure.
 * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
 * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
 *
 * \return Pointer to CUDA kernel
 */
static auto selectSplineAndSpreadKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool writeSplinesToGlobal)
{
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    if (writeSplinesToGlobal)
    {
        if (useOrderThreadsPerAtom)
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4;
        }
        else
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplines;
        }
    }
    else
    {
        if (useOrderThreadsPerAtom)
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
        }
        else
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
        }
    }

    return kernelPtr;
}

/*! \brief
 * Returns a pointer to the appropriate spline kernel based on the input bool values
 *
 * \param[in]  pmeGpu                   The PME GPU structure.
 * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
 * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
 *
 * \return Pointer to CUDA kernel
 */
static auto selectSplineKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool gmx_unused writeSplinesToGlobal)
{
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    GMX_ASSERT(
            writeSplinesToGlobal,
            "Spline data should always be written to global memory when just calculating splines");

    if (useOrderThreadsPerAtom)
    {
        kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4;
    }
    else
    {
        kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
    }
    return kernelPtr;
}

/*! \brief
 * Returns a pointer to the appropriate spread kernel based on the input bool values
 *
 * \param[in]  pmeGpu                   The PME GPU structure.
 * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
 * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
 *
 * \return Pointer to CUDA kernel
 */
static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool writeSplinesToGlobal)
{
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    if (writeSplinesToGlobal)
    {
        if (useOrderThreadsPerAtom)
        {
            kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4;
        }
        else
        {
            kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
        }
    }
    else
    {
        /* if we are not saving the spline data we need to recalculate it
           using the spline and spread Kernel */
        if (useOrderThreadsPerAtom)
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
        }
        else
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
        }
    }
    return kernelPtr;
}
void pme_gpu_spread(const PmeGpu*         pmeGpu,
                    GpuEventSynchronizer* xReadyOnDevice,
                    int gmx_unused        gridIndex,
                    real*                 h_grid,
                    bool                  computeSplines,
                    bool                  spreadCharges)
{
    GMX_ASSERT(computeSplines || spreadCharges,
               "PME spline/spread kernel has invalid input (nothing to do)");
    const auto* kernelParamsPtr = pmeGpu->kernelParams.get();
    GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread");

    const size_t blockSize = pmeGpu->programHandle_->impl_->spreadWorkGroupSize;

    const int order = pmeGpu->common->pme_order;
    GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
    const bool writeGlobal            = pmeGpu->settings.copyAllOutputs;
    const bool useOrderThreadsPerAtom = pmeGpu->settings.useOrderThreadsPerAtom;
    const bool recalculateSplines     = pmeGpu->settings.recalculateSplines;
#if GMX_GPU == GMX_GPU_OPENCL
    GMX_ASSERT(!useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
    GMX_ASSERT(!recalculateSplines, "Recalculating splines not supported in OpenCL");
#endif
    const int atomsPerBlock = useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
                                                     : blockSize / c_pmeSpreadGatherThreadsPerAtom;

    // TODO: pick smaller block size in runtime if needed
    // (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
    // If doing so, change atomsPerBlock in the kernels as well.
    // TODO: test varying block sizes on modern arch-s as well
    // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
    // (for spline data mostly)
    GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
               "inconsistent atom data padding vs. spreading block size");

    // Ensure that coordinates are ready on the device before launching spread;
    // only needed with CUDA on PP+PME ranks, not on separate PME ranks, in unit tests
    // nor in OpenCL as these cases use a single stream (hence xReadyOnDevice == nullptr).
    GMX_ASSERT(xReadyOnDevice != nullptr || (GMX_GPU != GMX_GPU_CUDA)
                       || pmeGpu->common->isRankPmeOnly || pme_gpu_settings(pmeGpu).copyAllOutputs,
               "Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
    if (xReadyOnDevice)
    {
        xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
    }

    const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
    auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);

    KernelLaunchConfig config;
    config.blockSize[0] = order;
    config.blockSize[1] = useOrderThreadsPerAtom ? 1 : order;
    config.blockSize[2] = atomsPerBlock;
    config.gridSize[0]  = dimGrid.first;
    config.gridSize[1]  = dimGrid.second;
    config.stream       = pmeGpu->archSpecific->pmeStream;

    int                                timingId;
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    if (computeSplines)
    {
        if (spreadCharges)
        {
            timingId  = gtPME_SPLINEANDSPREAD;
            kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu, useOrderThreadsPerAtom,
                                                       writeGlobal || (!recalculateSplines));
        }
        else
        {
            timingId  = gtPME_SPLINE;
            kernelPtr = selectSplineKernelPtr(pmeGpu, useOrderThreadsPerAtom,
                                              writeGlobal || (!recalculateSplines));
        }
    }
    else
    {
        timingId  = gtPME_SPREAD;
        kernelPtr = selectSpreadKernelPtr(pmeGpu, useOrderThreadsPerAtom,
                                          writeGlobal || (!recalculateSplines));
    }

    pme_gpu_start_timing(pmeGpu, timingId);
    auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
#if c_canEmbedBuffers
    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
    const auto kernelArgs = prepareGpuKernelArguments(
            kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_theta,
            &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
            &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->grid.d_fractShiftsTable,
            &kernelParamsPtr->grid.d_gridlineIndicesTable, &kernelParamsPtr->atoms.d_coefficients,
            &kernelParamsPtr->atoms.d_coordinates);
#endif

    launchGpuKernel(kernelPtr, config, timingEvent, "PME spline/spread", kernelArgs);
    pme_gpu_stop_timing(pmeGpu, timingId);

    const auto& settings     = pmeGpu->settings;
    const bool  copyBackGrid = spreadCharges && (!settings.performGPUFFT || settings.copyAllOutputs);
    if (copyBackGrid)
    {
        pme_gpu_copy_output_spread_grid(pmeGpu, h_grid);
    }
    const bool copyBackAtomData =
            computeSplines && (!settings.performGPUGather || settings.copyAllOutputs);
    if (copyBackAtomData)
    {
        pme_gpu_copy_output_spread_atom_data(pmeGpu);
    }
}
void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrdering, bool computeEnergyAndVirial)
{
    const auto& settings               = pmeGpu->settings;
    const bool  copyInputAndOutputGrid = !settings.performGPUFFT || settings.copyAllOutputs;

    auto* kernelParamsPtr = pmeGpu->kernelParams.get();

    float* h_gridFloat = reinterpret_cast<float*>(h_grid);
    if (copyInputAndOutputGrid)
    {
        copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, h_gridFloat, 0,
                           pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream,
                           pmeGpu->settings.transferKind, nullptr);
    }

    int majorDim = -1, middleDim = -1, minorDim = -1;
    switch (gridOrdering)
    {
        case GridOrdering::YZX:
            majorDim  = YY;
            middleDim = ZZ;
            minorDim  = XX;
            break;

        case GridOrdering::XYZ:
            majorDim  = XX;
            middleDim = YY;
            minorDim  = ZZ;
            break;

        default: GMX_ASSERT(false, "Implement grid ordering here and below for the kernel launch");
    }

    const int maxBlockSize = pmeGpu->programHandle_->impl_->solveMaxWorkGroupSize;

    const int gridLineSize      = pmeGpu->kernelParams->grid.complexGridSize[minorDim];
    const int gridLinesPerBlock = std::max(maxBlockSize / gridLineSize, 1);
    const int blocksPerGridLine = (gridLineSize + maxBlockSize - 1) / maxBlockSize;
    int       cellsPerBlock;
    if (blocksPerGridLine == 1)
    {
        cellsPerBlock = gridLineSize * gridLinesPerBlock;
    }
    else
    {
        cellsPerBlock = (gridLineSize + blocksPerGridLine - 1) / blocksPerGridLine;
    }
    const int warpSize  = pmeGpu->programHandle_->impl_->warpSize;
    const int blockSize = (cellsPerBlock + warpSize - 1) / warpSize * warpSize;

    static_assert(GMX_GPU != GMX_GPU_CUDA || c_solveMaxWarpsPerBlock / 2 >= 4,
                  "The CUDA solve energy kernels need at least 4 warps. "
                  "Here we launch at least half of the max warps.");

    KernelLaunchConfig config;
    config.blockSize[0] = blockSize;
    config.gridSize[0]  = blocksPerGridLine;
    // rounding up to full warps so that shuffle operations produce defined results
    config.gridSize[1] = (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1)
                         / gridLinesPerBlock;
    config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim];
    config.stream      = pmeGpu->archSpecific->pmeStream;

    int                                timingId  = gtPME_SOLVE;
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    if (gridOrdering == GridOrdering::YZX)
    {
        kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernel
                                           : pmeGpu->programHandle_->impl_->solveYZXKernel;
    }
    else if (gridOrdering == GridOrdering::XYZ)
    {
        kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernel
                                           : pmeGpu->programHandle_->impl_->solveXYZKernel;
    }

    pme_gpu_start_timing(pmeGpu, timingId);
    auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
#if c_canEmbedBuffers
    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
    const auto kernelArgs = prepareGpuKernelArguments(
            kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli,
            &kernelParamsPtr->constants.d_virialAndEnergy, &kernelParamsPtr->grid.d_fourierGrid);
#endif
    launchGpuKernel(kernelPtr, config, timingEvent, "PME solve", kernelArgs);
    pme_gpu_stop_timing(pmeGpu, timingId);

    if (computeEnergyAndVirial)
    {
        copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy,
                             &kernelParamsPtr->constants.d_virialAndEnergy, 0, c_virialAndEnergyCount,
                             pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
    }

    if (copyInputAndOutputGrid)
    {
        copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid, 0,
                             pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream,
                             pmeGpu->settings.transferKind, nullptr);
    }
}
/*! \brief
 * Returns a pointer to the appropriate gather kernel based on the input values
 *
 * \param[in]  pmeGpu                   The PME GPU structure.
 * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
 * \param[in]  readSplinesFromGlobal    bool controlling if we should read spline data from global memory
 * \param[in]  forceTreatment           Controls if the forces from the gather should increment or replace the input forces.
 *
 * \return Pointer to CUDA kernel
 */
inline auto selectGatherKernelPtr(const PmeGpu*          pmeGpu,
                                  bool                   useOrderThreadsPerAtom,
                                  bool                   readSplinesFromGlobal,
                                  PmeForceOutputHandling forceTreatment)
{
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;

    if (readSplinesFromGlobal)
    {
        if (useOrderThreadsPerAtom)
        {
            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set)
                                ? pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4
                                : pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelReadSplinesThPerAtom4;
        }
        else
        {
            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set)
                                ? pmeGpu->programHandle_->impl_->gatherKernelReadSplines
                                : pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelReadSplines;
        }
    }
    else
    {
        if (useOrderThreadsPerAtom)
        {
            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set)
                                ? pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4
                                : pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelThPerAtom4;
        }
        else
        {
            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set)
                                ? pmeGpu->programHandle_->impl_->gatherKernel
                                : pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
        }
    }
    return kernelPtr;
}
void pme_gpu_gather(PmeGpu* pmeGpu, PmeForceOutputHandling forceTreatment, const float* h_grid)
{
    /* Copying the input CPU forces for reduction */
    if (forceTreatment != PmeForceOutputHandling::Set)
    {
        pme_gpu_copy_input_forces(pmeGpu);
    }

    const auto& settings = pmeGpu->settings;
    if (!settings.performGPUFFT || settings.copyAllOutputs)
    {
        pme_gpu_copy_input_gather_grid(pmeGpu, const_cast<float*>(h_grid));
    }

    if (settings.copyAllOutputs)
    {
        pme_gpu_copy_input_gather_atom_data(pmeGpu);
    }

    /* Set if we have unit tests */
    const bool   readGlobal             = pmeGpu->settings.copyAllOutputs;
    const size_t blockSize              = pmeGpu->programHandle_->impl_->gatherWorkGroupSize;
    const bool   useOrderThreadsPerAtom = pmeGpu->settings.useOrderThreadsPerAtom;
    const bool   recalculateSplines     = pmeGpu->settings.recalculateSplines;
#if GMX_GPU == GMX_GPU_OPENCL
    GMX_ASSERT(!useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
    GMX_ASSERT(!recalculateSplines, "Recalculating splines not supported in OpenCL");
#endif
    const int atomsPerBlock = useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
                                                     : blockSize / c_pmeSpreadGatherThreadsPerAtom;

    GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
               "inconsistent atom data padding vs. gathering block size");

    const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
    auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);

    const int order = pmeGpu->common->pme_order;
    GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");

    KernelLaunchConfig config;
    config.blockSize[0] = order;
    config.blockSize[1] = useOrderThreadsPerAtom ? 1 : order;
    config.blockSize[2] = atomsPerBlock;
    config.gridSize[0]  = dimGrid.first;
    config.gridSize[1]  = dimGrid.second;
    config.stream       = pmeGpu->archSpecific->pmeStream;

    // TODO test different cache configs

    int                                timingId  = gtPME_GATHER;
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = selectGatherKernelPtr(
            pmeGpu, useOrderThreadsPerAtom, readGlobal || (!recalculateSplines), forceTreatment);
    // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl

    pme_gpu_start_timing(pmeGpu, timingId);
    auto*       timingEvent     = pme_gpu_fetch_timing_event(pmeGpu, timingId);
    const auto* kernelParamsPtr = pmeGpu->kernelParams.get();
#if c_canEmbedBuffers
    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
    const auto kernelArgs = prepareGpuKernelArguments(
            kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_coefficients,
            &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->atoms.d_theta,
            &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
            &kernelParamsPtr->atoms.d_forces);
#endif
    launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
    pme_gpu_stop_timing(pmeGpu, timingId);

    if (pmeGpu->settings.useGpuForceReduction)
    {
        pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream);
    }
    else
    {
        pme_gpu_copy_output_forces(pmeGpu);
    }
}
void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
{
    if (pmeGpu && pmeGpu->kernelParams)
    {
        return pmeGpu->kernelParams->atoms.d_forces;
    }
    else
    {
        return nullptr;
    }
}

/*! \brief Check the validity of the device buffer.
 *
 * Checks if the buffer is not nullptr and, when possible, if it is big enough.
 *
 * \todo Split and move this function to gpu_utils.
 *
 * \param[in] buffer        Device buffer to be checked.
 * \param[in] requiredSize  Number of elements that the buffer will have to accommodate.
 *
 * \returns If the device buffer can be set.
 */
template<typename T>
static bool checkDeviceBuffer(gmx_unused DeviceBuffer<T> buffer, gmx_unused int requiredSize)
{
#if GMX_GPU == GMX_GPU_CUDA
    GMX_ASSERT(buffer != nullptr, "The device pointer is nullptr");
    return buffer != nullptr;
#elif GMX_GPU == GMX_GPU_OPENCL
    size_t size;
    int    retval = clGetMemObjectInfo(buffer, CL_MEM_SIZE, sizeof(size), &size, nullptr);
    GMX_ASSERT(retval == CL_SUCCESS,
               gmx::formatString("clGetMemObjectInfo failed with error code #%d", retval).c_str());
    GMX_ASSERT(static_cast<int>(size) >= requiredSize,
               "Number of atoms in device buffer is smaller than the required size.");
    return retval == CL_SUCCESS && static_cast<int>(size) >= requiredSize;
#elif GMX_GPU == GMX_GPU_NONE
    GMX_ASSERT(false, "Setter for device-side coordinates was called in non-GPU build.");
    return false;
#endif
}
void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<float> d_x)
{
    GMX_ASSERT(pmeGpu && pmeGpu->kernelParams,
               "PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was "
               "initialized.");

    GMX_ASSERT(checkDeviceBuffer(d_x, pmeGpu->kernelParams->atoms.nAtoms),
               "The device-side buffer can not be set.");

    pmeGpu->kernelParams->atoms.d_coordinates = d_x;
}

void* pme_gpu_get_stream(const PmeGpu* pmeGpu)
{
    if (pmeGpu)
    {
        return static_cast<void*>(&pmeGpu->archSpecific->pmeStream);
    }
    else
    {
        return nullptr;
    }
}

void* pme_gpu_get_context(const PmeGpu* pmeGpu)
{
    if (pmeGpu)
    {
        return static_cast<void*>(&pmeGpu->archSpecific->context);
    }
    else
    {
        return nullptr;
    }
}

GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu)
{
    if (pmeGpu && pmeGpu->kernelParams)
    {
        return &pmeGpu->archSpecific->pmeForcesReady;
    }
    else
    {
        return nullptr;
    }
}