/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 * \brief This file contains internal function implementations
 * for performing the PME calculations on GPU.
 *
 * Note that this file is compiled as regular C++ source in OpenCL builds, but
 * it is treated as CUDA source in CUDA-enabled GPU builds.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 * \ingroup module_ewald
 */
#include "pme_gpu_internal.h"

#include "gromacs/ewald/ewald_utils.h"
#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/math/units.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"

#if GMX_GPU_CUDA
#    include "gromacs/gpu_utils/pmalloc_cuda.h"
#elif GMX_GPU_OPENCL
#    include "gromacs/gpu_utils/gmxopencl.h"
#endif

#include "gromacs/ewald/pme.h"

#include "pme_gpu_3dfft.h"
#include "pme_gpu_calculate_splines.h"
#include "pme_gpu_constants.h"
#include "pme_gpu_program_impl.h"
#include "pme_gpu_timings.h"
#include "pme_gpu_types.h"
#include "pme_gpu_types_host.h"
#include "pme_gpu_types_host_impl.h"
#include "pme_internal.h"
#include "pme_solve.h"
/*! \brief
 * Atom limit above which it is advantageous to turn on the
 * recalculation of the splines in the gather and to use fewer threads per atom in the spline and spread kernels.
 */
constexpr int c_pmeGpuPerformanceAtomLimit = 23000;
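/* Illustration of what this limit switches (see
 * pme_gpu_select_best_performing_pme_spreadgather_kernels() below): at or below the limit the
 * defaults are order*order (16) threads per atom with spline data written to and re-read from
 * global memory; above it (CUDA only) the spread/gather kernels use order (4) threads per atom
 * and recalculate the splines in the gather instead of re-reading them.
 */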
/*! \brief
 * Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure.
 *
 * \param[in] pmeGpu  The PME GPU structure.
 * \returns The pointer to the kernel parameters.
 */
static PmeGpuKernelParamsBase* pme_gpu_get_kernel_params_base_ptr(const PmeGpu* pmeGpu)
{
    // reinterpret_cast is needed because the derived CUDA structure is not known in this file
    auto* kernelParamsPtr = reinterpret_cast<PmeGpuKernelParamsBase*>(pmeGpu->kernelParams.get());
    return kernelParamsPtr;
}
/*! \brief
 * Atom data block size (in terms of number of atoms).
 * This is the least common multiple of number of atoms processed by
 * a single block/workgroup of the spread and gather kernels.
 * The GPU atom data buffers must be padded, which means that
 * the numbers of atoms used for determining the size of the memory
 * allocation must be divisible by this.
 */
constexpr int c_pmeAtomDataBlockSize = 64;

int pme_gpu_get_atom_data_block_size()
{
    return c_pmeAtomDataBlockSize;
}
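/* Illustration (hypothetical atom count): with c_pmeAtomDataBlockSize == 64, an input of
 * 1000 atoms leads to a padded allocation size of
 *     ((1000 + 64 - 1) / 64) * 64 == 1024
 * atoms, as computed in pme_gpu_reinit_atoms() below.
 */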
void pme_gpu_synchronize(const PmeGpu* pmeGpu)
{
    pmeGpu->archSpecific->pmeStream_.synchronize();
}
void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu)
{
    const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);

    GMX_ASSERT(pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
               "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU");

    for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
    {
        allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex],
                             c_virialAndEnergyCount, pmeGpu->archSpecific->deviceContext_);
        pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_virialAndEnergy[gridIndex]), energyAndVirialSize);
    }
}
void pme_gpu_free_energy_virial(PmeGpu* pmeGpu)
{
    for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
    {
        freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex]);
        pfree(pmeGpu->staging.h_virialAndEnergy[gridIndex]);
        pmeGpu->staging.h_virialAndEnergy[gridIndex] = nullptr;
    }
}
void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu)
{
    for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
    {
        clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex], 0,
                               c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_);
    }
}
void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, const int gridIndex)
{
    GMX_ASSERT(pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
               "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU");
    GMX_ASSERT(gridIndex < pmeGpu->common->ngrids,
               "Invalid combination of gridIndex and number of grids");

    const int splineValuesOffset[DIM] = { 0, pmeGpu->kernelParams->grid.realGridSize[XX],
                                          pmeGpu->kernelParams->grid.realGridSize[XX]
                                                  + pmeGpu->kernelParams->grid.realGridSize[YY] };
    memcpy(&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));

    const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX]
                                    + pmeGpu->kernelParams->grid.realGridSize[YY]
                                    + pmeGpu->kernelParams->grid.realGridSize[ZZ];
    const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize[gridIndex]);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex],
                           newSplineValuesSize, &pmeGpu->archSpecific->splineValuesSize[gridIndex],
                           &pmeGpu->archSpecific->splineValuesCapacity[gridIndex],
                           pmeGpu->archSpecific->deviceContext_);
    if (shouldRealloc)
    {
        /* Reallocate the host buffer */
        pfree(pmeGpu->staging.h_splineModuli[gridIndex]);
        pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_splineModuli[gridIndex]),
                newSplineValuesSize * sizeof(float));
    }
    for (int i = 0; i < DIM; i++)
    {
        memcpy(pmeGpu->staging.h_splineModuli[gridIndex] + splineValuesOffset[i],
               pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float));
    }
    /* TODO: pin original buffer instead! */
    copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex],
                       pmeGpu->staging.h_splineModuli[gridIndex], 0, newSplineValuesSize,
                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
}
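/* Layout note (hypothetical grid for illustration): the three B-spline moduli arrays (X, Y, Z)
 * are packed back to back into a single buffer, with splineValuesOffset giving the start of each
 * dimension. E.g. for a 64x48x60 real grid the offsets are { 0, 64, 112 } and the buffer holds
 * 64 + 48 + 60 == 172 floats per grid.
 */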
void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu)
{
    for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
    {
        pfree(pmeGpu->staging.h_splineModuli[gridIndex]);
        freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex]);
    }
}
void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
{
    const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
    GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc,
                           pmeGpu->archSpecific->deviceContext_);
    pmeGpu->staging.h_forces.reserveWithPadding(pmeGpu->nAtomsAlloc);
    pmeGpu->staging.h_forces.resizeWithPadding(pmeGpu->kernelParams->atoms.nAtoms);
}
void pme_gpu_free_forces(const PmeGpu* pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces);
}
void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
{
    GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
    float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
    copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, h_forcesFloat, 0,
                       DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_,
                       pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
{
    GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
    float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
    copyFromDeviceBuffer(h_forcesFloat, &pmeGpu->kernelParams->atoms.d_forces, 0,
                         DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_,
                         pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu,
                                                 const float*  h_coefficients,
                                                 const int     gridIndex)
{
    GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
    const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex],
                           newCoefficientsSize, &pmeGpu->archSpecific->coefficientsSize[gridIndex],
                           &pmeGpu->archSpecific->coefficientsCapacity[gridIndex],
                           pmeGpu->archSpecific->deviceContext_);
    copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex],
                       const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);

    const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
    const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
    if (paddingCount > 0)
    {
        clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex], paddingIndex,
                               paddingCount, pmeGpu->archSpecific->pmeStream_);
    }
}
void pme_gpu_free_coefficients(const PmeGpu* pmeGpu)
{
    for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
    {
        freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex]);
    }
}
void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu)
{
    const int order             = pmeGpu->common->pme_order;
    const int newSplineDataSize = DIM * order * pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
    /* Two arrays of the same size */
    const bool shouldRealloc        = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
    int        currentSizeTemp      = pmeGpu->archSpecific->splineDataSize;
    int        currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize, &currentSizeTemp,
                           &currentSizeTempAlloc, pmeGpu->archSpecific->deviceContext_);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc,
                           pmeGpu->archSpecific->deviceContext_);
    // the host side reallocation
    if (shouldRealloc)
    {
        pfree(pmeGpu->staging.h_theta);
        pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_theta), newSplineDataSize * sizeof(float));
        pfree(pmeGpu->staging.h_dtheta);
        pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_dtheta), newSplineDataSize * sizeof(float));
    }
}
void pme_gpu_free_spline_data(const PmeGpu* pmeGpu)
{
    /* Two arrays of the same size */
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta);
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta);
    pfree(pmeGpu->staging.h_theta);
    pfree(pmeGpu->staging.h_dtheta);
}
void pme_gpu_realloc_grid_indices(PmeGpu* pmeGpu)
{
    const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
                           &pmeGpu->archSpecific->gridlineIndicesSize,
                           &pmeGpu->archSpecific->gridlineIndicesSizeAlloc,
                           pmeGpu->archSpecific->deviceContext_);
    pfree(pmeGpu->staging.h_gridlineIndices);
    pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_gridlineIndices), newIndicesSize * sizeof(int));
}
void pme_gpu_free_grid_indices(const PmeGpu* pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices);
    pfree(pmeGpu->staging.h_gridlineIndices);
}
void pme_gpu_realloc_grids(PmeGpu* pmeGpu)
{
    auto* kernelParamsPtr = pmeGpu->kernelParams.get();

    const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX]
                                * kernelParamsPtr->grid.realGridSizePadded[YY]
                                * kernelParamsPtr->grid.realGridSizePadded[ZZ];
    // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float
    const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX]
                                   * kernelParamsPtr->grid.complexGridSizePadded[YY]
                                   * kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
    for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
    {
        if (pmeGpu->archSpecific->performOutOfPlaceFFT)
        {
            /* 2 separate grids */
            reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex], newComplexGridSize,
                                   &pmeGpu->archSpecific->complexGridSize[gridIndex],
                                   &pmeGpu->archSpecific->complexGridCapacity[gridIndex],
                                   pmeGpu->archSpecific->deviceContext_);
            reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex], newRealGridSize,
                                   &pmeGpu->archSpecific->realGridSize[gridIndex],
                                   &pmeGpu->archSpecific->realGridCapacity[gridIndex],
                                   pmeGpu->archSpecific->deviceContext_);
        }
        else
        {
            /* A single buffer so that any grid will fit */
            const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
            reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex], newGridsSize,
                                   &pmeGpu->archSpecific->realGridSize[gridIndex],
                                   &pmeGpu->archSpecific->realGridCapacity[gridIndex],
                                   pmeGpu->archSpecific->deviceContext_);
            kernelParamsPtr->grid.d_fourierGrid[gridIndex] = kernelParamsPtr->grid.d_realGrid[gridIndex];
            pmeGpu->archSpecific->complexGridSize[gridIndex] =
                    pmeGpu->archSpecific->realGridSize[gridIndex];
            // the size might get used later for copying the grid
        }
    }
}
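/* Sizing illustration (hypothetical 64x64x64 real grid): with out-of-place FFT, d_realGrid holds
 * 64*64*64 == 262144 floats, while d_fourierGrid holds 64*64*33 complex values, i.e.
 * 64*64*33*2 == 270336 floats (the minor dimension shrinks to nz/2 + 1 == 33, see
 * pme_gpu_reinit_grids()). With in-place FFT a single buffer of max(262144, 270336) floats is
 * shared by both the real and the Fourier views.
 */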
void pme_gpu_free_grids(const PmeGpu* pmeGpu)
{
    for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
    {
        if (pmeGpu->archSpecific->performOutOfPlaceFFT)
        {
            freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid[gridIndex]);
        }
        freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex]);
    }
}
void pme_gpu_clear_grids(const PmeGpu* pmeGpu)
{
    for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
    {
        clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex], 0,
                               pmeGpu->archSpecific->realGridSize[gridIndex],
                               pmeGpu->archSpecific->pmeStream_);
    }
}
void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu)
{
    pme_gpu_free_fract_shifts(pmeGpu);

    auto* kernelParamsPtr = pmeGpu->kernelParams.get();

    const int nx                  = kernelParamsPtr->grid.realGridSize[XX];
    const int ny                  = kernelParamsPtr->grid.realGridSize[YY];
    const int nz                  = kernelParamsPtr->grid.realGridSize[ZZ];
    const int cellCount           = c_pmeNeighborUnitcellCount;
    const int gridDataOffset[DIM] = { 0, cellCount * nx, cellCount * (nx + ny) };

    memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset));

    const int newFractShiftsSize = cellCount * (nx + ny + nz);

    initParamLookupTable(&kernelParamsPtr->grid.d_fractShiftsTable,
                         &kernelParamsPtr->fractShiftsTableTexture, pmeGpu->common->fsh.data(),
                         newFractShiftsSize, pmeGpu->archSpecific->deviceContext_);

    initParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable,
                         &kernelParamsPtr->gridlineIndicesTableTexture, pmeGpu->common->nn.data(),
                         newFractShiftsSize, pmeGpu->archSpecific->deviceContext_);
}
void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu)
{
    auto* kernelParamsPtr = pmeGpu->kernelParams.get();
#if GMX_GPU_CUDA
    destroyParamLookupTable(&kernelParamsPtr->grid.d_fractShiftsTable,
                            kernelParamsPtr->fractShiftsTableTexture);
    destroyParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable,
                            kernelParamsPtr->gridlineIndicesTableTexture);
#elif GMX_GPU_OPENCL
    freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
    freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
#endif
}
bool pme_gpu_stream_query(const PmeGpu* pmeGpu)
{
    return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream_);
}
void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, const float* h_grid, const int gridIndex)
{
    copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex], h_grid, 0,
                       pmeGpu->archSpecific->realGridSize[gridIndex],
                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid, const int gridIndex)
{
    copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid[gridIndex], 0,
                         pmeGpu->archSpecific->realGridSize[gridIndex],
                         pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
    pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream_);
}
void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
{
    const size_t splinesCount    = DIM * pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order;
    auto*        kernelParamsPtr = pmeGpu->kernelParams.get();
    copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount,
                         pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
    copyFromDeviceBuffer(pmeGpu->staging.h_theta, &kernelParamsPtr->atoms.d_theta, 0, splinesCount,
                         pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
    copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices, &kernelParamsPtr->atoms.d_gridlineIndices,
                         0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_,
                         pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu)
{
    const size_t splinesCount    = DIM * pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order;
    auto*        kernelParamsPtr = pmeGpu->kernelParams.get();

    // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0, pmeGpu->nAtomsAlloc * DIM,
                           pmeGpu->archSpecific->pmeStream_);
    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
                           pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
                           pmeGpu->archSpecific->pmeStream_);
    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
                           pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
                           pmeGpu->archSpecific->pmeStream_);

    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount,
                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount,
                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
                       0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_,
                       pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu)
{
    pmeGpu->archSpecific->syncSpreadGridD2H.waitForEvent();
}
/*! \brief Internal GPU initialization for PME.
 *
 * \param[in] pmeGpu         GPU PME data.
 * \param[in] deviceContext  GPU context.
 * \param[in] deviceStream   GPU stream.
 */
static void pme_gpu_init_internal(PmeGpu* pmeGpu, const DeviceContext& deviceContext, const DeviceStream& deviceStream)
{
#if GMX_GPU_CUDA
    // Prepare to use the device that this PME task was assigned earlier.
    // Other entities, such as CUDA timing events, are known to implicitly use the device context.
    CU_RET_ERR(cudaSetDevice(deviceContext.deviceInfo().id), "Switching to PME CUDA device");
#endif

    /* Allocate the target-specific structures */
    pmeGpu->archSpecific.reset(new PmeGpuSpecific(deviceContext, deviceStream));
    pmeGpu->kernelParams.reset(new PmeGpuKernelParams());

    pmeGpu->archSpecific->performOutOfPlaceFFT = true;
    /* This should give better performance, according to the cuFFT documentation.
     * The performance seems to be the same though.
     * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
     */

#if GMX_GPU_CUDA
    pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
#else
    // Use this path for any non-CUDA GPU acceleration
    // TODO: is there no really global work size limit in OpenCL?
    pmeGpu->maxGridWidthX = INT32_MAX / 2;
#endif
}
void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
{
    if (pme_gpu_settings(pmeGpu).performGPUFFT)
    {
        pmeGpu->archSpecific->fftSetup.resize(0);
        for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
        {
            pmeGpu->archSpecific->fftSetup.push_back(std::make_unique<GpuParallel3dFft>(pmeGpu, gridIndex));
        }
    }
}
void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu)
{
    pmeGpu->archSpecific->fftSetup.resize(0);
}
void pme_gpu_getEnergyAndVirial(const gmx_pme_t& pme, const float lambda, PmeOutput* output)
{
    const PmeGpu* pmeGpu = pme.gpu;

    GMX_ASSERT(lambda == 1.0 || pmeGpu->common->ngrids == 2,
               "Invalid combination of lambda and number of grids");

    for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
    {
        for (int j = 0; j < c_virialAndEnergyCount; j++)
        {
            GMX_ASSERT(std::isfinite(pmeGpu->staging.h_virialAndEnergy[gridIndex][j]),
                       "PME GPU produces incorrect energy/virial.");
        }
    }
    for (int dim1 = 0; dim1 < DIM; dim1++)
    {
        for (int dim2 = 0; dim2 < DIM; dim2++)
        {
            output->coulombVirial_[dim1][dim2] = 0;
        }
    }
    output->coulombEnergy_ = 0;
    float scale            = 1.0;
    for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
    {
        if (pmeGpu->common->ngrids == 2)
        {
            scale = gridIndex == 0 ? (1.0 - lambda) : lambda;
        }
        output->coulombVirial_[XX][XX] +=
                scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][0];
        output->coulombVirial_[YY][YY] +=
                scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][1];
        output->coulombVirial_[ZZ][ZZ] +=
                scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][2];
        output->coulombVirial_[XX][YY] +=
                scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][3];
        output->coulombVirial_[YY][XX] +=
                scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][3];
        output->coulombVirial_[XX][ZZ] +=
                scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][4];
        output->coulombVirial_[ZZ][XX] +=
                scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][4];
        output->coulombVirial_[YY][ZZ] +=
                scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][5];
        output->coulombVirial_[ZZ][YY] +=
                scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][5];
        output->coulombEnergy_ += scale * 0.5F * pmeGpu->staging.h_virialAndEnergy[gridIndex][6];
    }
    if (pmeGpu->common->ngrids > 1)
    {
        output->coulombDvdl_ = 0.5F
                               * (pmeGpu->staging.h_virialAndEnergy[FEP_STATE_B][6]
                                  - pmeGpu->staging.h_virialAndEnergy[FEP_STATE_A][6]);
    }
}
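/* Layout note: each h_virialAndEnergy staging array packs c_virialAndEnergyCount values per grid
 * as produced by the solve kernels; as read above, indices 0-5 hold the virial components
 * (XX, YY, ZZ, XY, XZ, YZ, accumulated with a factor of 0.25) and index 6 holds the energy
 * (factor 0.5). For FEP (two grids) the contributions are interpolated with weights (1 - lambda)
 * and lambda, and dV/dlambda is half the energy difference between the B and A grids.
 */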
/*! \brief Sets the force-related members in \p output
 *
 * \param[in]   pmeGpu      PME GPU data structure
 * \param[out]  output      Pointer to PME output data structure
 */
static void pme_gpu_getForceOutput(PmeGpu* pmeGpu, PmeOutput* output)
{
    output->haveForceOutput_ = !pmeGpu->settings.useGpuForceReduction;
    if (output->haveForceOutput_)
    {
        output->forces_ = pmeGpu->staging.h_forces;
    }
}
PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const bool computeEnergyAndVirial, const real lambdaQ)
{
    PmeGpu* pmeGpu = pme.gpu;

    PmeOutput output;

    pme_gpu_getForceOutput(pmeGpu, &output);

    if (computeEnergyAndVirial)
    {
        if (pme_gpu_settings(pmeGpu).performGPUSolve)
        {
            pme_gpu_getEnergyAndVirial(pme, lambdaQ, &output);
        }
        else
        {
            get_pme_ener_vir_q(pme.solve_work, pme.nthread, &output);
        }
    }
    return output;
}
void pme_gpu_update_input_box(PmeGpu gmx_unused* pmeGpu, const matrix gmx_unused box)
{
#if GMX_DOUBLE
    GMX_THROW(gmx::NotImplementedError("PME is implemented for single-precision only on GPU"));
#else
    matrix scaledBox;
    pmeGpu->common->boxScaler->scaleBox(box, scaledBox);
    auto* kernelParamsPtr              = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->current.boxVolume = scaledBox[XX][XX] * scaledBox[YY][YY] * scaledBox[ZZ][ZZ];
    GMX_ASSERT(kernelParamsPtr->current.boxVolume != 0.0F, "Zero volume of the unit cell");
    matrix recipBox;
    gmx::invertBoxMatrix(scaledBox, recipBox);

    /* The GPU recipBox is transposed as compared to the CPU recipBox.
     * Spread uses matrix columns (while solve and gather use rows).
     * There is no particular reason for this; it might be further rethought/optimized for better access patterns.
     */
    const real newRecipBox[DIM][DIM] = { { recipBox[XX][XX], recipBox[YY][XX], recipBox[ZZ][XX] },
                                         { 0.0, recipBox[YY][YY], recipBox[ZZ][YY] },
                                         { 0.0, 0.0, recipBox[ZZ][ZZ] } };
    memcpy(kernelParamsPtr->current.recipBox, newRecipBox, sizeof(matrix));
#endif
}
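/* Illustration (hypothetical rectangular box): for a box diag(a, b, c), recipBox is
 * diag(1/a, 1/b, 1/c) and the transposition above changes nothing. For a triclinic box, the
 * zero entries written above reflect that the CPU recipBox is lower-triangular, so the GPU copy
 * stores its columns as rows, e.g.
 *     newRecipBox[XX] = { recipBox[XX][XX], recipBox[YY][XX], recipBox[ZZ][XX] }.
 */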
/*! \brief \libinternal
 * (Re-)initializes all the PME GPU data related to the grid size and cut-off.
 *
 * \param[in] pmeGpu            The PME GPU structure.
 */
static void pme_gpu_reinit_grids(PmeGpu* pmeGpu)
{
    auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);

    GMX_ASSERT(pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
               "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU");

    kernelParamsPtr->grid.ewaldFactor =
            (M_PI * M_PI) / (pmeGpu->common->ewaldcoeff_q * pmeGpu->common->ewaldcoeff_q);
    /* The grid size variants */
    for (int i = 0; i < DIM; i++)
    {
        kernelParamsPtr->grid.realGridSize[i] = pmeGpu->common->nk[i];
        kernelParamsPtr->grid.realGridSizeFP[i] =
                static_cast<float>(kernelParamsPtr->grid.realGridSize[i]);
        kernelParamsPtr->grid.realGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];

        // The complex grid currently uses no padding;
        // if it starts to do so, then another test should be added for that
        kernelParamsPtr->grid.complexGridSize[i]       = kernelParamsPtr->grid.realGridSize[i];
        kernelParamsPtr->grid.complexGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
    }
    /* FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
    if (!pme_gpu_settings(pmeGpu).performGPUFFT)
    {
        // This allows for GPU spreading grid and CPU fftgrid to have the same layout, so that we can copy the data directly
        kernelParamsPtr->grid.realGridSizePadded[ZZ] =
                (kernelParamsPtr->grid.realGridSize[ZZ] / 2 + 1) * 2;
    }
    /* GPU FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
    kernelParamsPtr->grid.complexGridSize[ZZ] /= 2;
    kernelParamsPtr->grid.complexGridSize[ZZ]++;
    kernelParamsPtr->grid.complexGridSizePadded[ZZ] = kernelParamsPtr->grid.complexGridSize[ZZ];

    pme_gpu_realloc_and_copy_fract_shifts(pmeGpu);
    for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
    {
        pme_gpu_realloc_and_copy_bspline_values(pmeGpu, gridIndex);
    }

    pme_gpu_realloc_grids(pmeGpu);
    pme_gpu_reinit_3dfft(pmeGpu);
}
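/* Illustration (hypothetical sizes): a real grid with nz == 64 maps to nz/2 + 1 == 33 complex
 * elements in the minor dimension, so complexGridSize[ZZ] becomes 33. When the FFT runs on the
 * CPU (mixed mode), realGridSizePadded[ZZ] is enlarged to (64/2 + 1) * 2 == 66 floats so that the
 * GPU spreading grid matches the CPU fftgrid layout and can be copied directly.
 */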
/* Several GPU functions that refer to the CPU PME data live here.
 * We would like to keep these away from the GPU-framework specific code for clarity,
 * as well as compilation issues with MPI.
 */

/*! \brief \libinternal
 * Copies everything useful from the PME CPU to the PME GPU structure.
 * The goal is to minimize interaction with the PME CPU structure in the GPU code.
 *
 * \param[in] pme         The PME structure.
 */
static void pme_gpu_copy_common_data_from(const gmx_pme_t* pme)
{
    /* TODO: Consider refactoring the CPU PME code to use the same structure,
     * so that this function becomes 2 lines */
    PmeGpu* pmeGpu               = pme->gpu;
    pmeGpu->common->ngrids       = pme->bFEP_q ? 2 : 1;
    pmeGpu->common->epsilon_r    = pme->epsilon_r;
    pmeGpu->common->ewaldcoeff_q = pme->ewaldcoeff_q;
    pmeGpu->common->nk[XX]       = pme->nkx;
    pmeGpu->common->nk[YY]       = pme->nky;
    pmeGpu->common->nk[ZZ]       = pme->nkz;
    pmeGpu->common->pme_order    = pme->pme_order;
    if (pmeGpu->common->pme_order != c_pmeGpuOrder)
    {
        GMX_THROW(gmx::NotImplementedError("pme_order != 4 is not implemented!"));
    }
    for (int i = 0; i < DIM; i++)
    {
        pmeGpu->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGpu->common->nk[i]);
    }
    const int cellCount = c_pmeNeighborUnitcellCount;
    pmeGpu->common->fsh.resize(0);
    pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshx, pme->fshx + cellCount * pme->nkx);
    pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshy, pme->fshy + cellCount * pme->nky);
    pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshz, pme->fshz + cellCount * pme->nkz);
    pmeGpu->common->nn.resize(0);
    pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnx, pme->nnx + cellCount * pme->nkx);
    pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nny, pme->nny + cellCount * pme->nky);
    pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz);
    pmeGpu->common->runMode       = pme->runMode;
    pmeGpu->common->isRankPmeOnly = !pme->bPPnode;
    pmeGpu->common->boxScaler     = pme->boxScaler;
}
/*! \libinternal \brief
 * Uses heuristics to select the best-performing PME spread and gather kernels.
 *
 * \param[in,out] pmeGpu         The PME GPU structure.
 */
static void pme_gpu_select_best_performing_pme_spreadgather_kernels(PmeGpu* pmeGpu)
{
    if (GMX_GPU_CUDA && pmeGpu->kernelParams->atoms.nAtoms > c_pmeGpuPerformanceAtomLimit)
    {
        pmeGpu->settings.threadsPerAtom     = ThreadsPerAtom::Order;
        pmeGpu->settings.recalculateSplines = true;
    }
    else
    {
        pmeGpu->settings.threadsPerAtom     = ThreadsPerAtom::OrderSquared;
        pmeGpu->settings.recalculateSplines = false;
    }
}
/*! \libinternal \brief
 * Initializes the PME GPU data at the beginning of the run.
 * TODO: this should become PmeGpu::PmeGpu()
 *
 * \param[in,out] pme            The PME structure.
 * \param[in]     deviceContext  The GPU context.
 * \param[in]     deviceStream   The GPU stream.
 * \param[in,out] pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
 */
static void pme_gpu_init(gmx_pme_t*           pme,
                         const DeviceContext& deviceContext,
                         const DeviceStream&  deviceStream,
                         const PmeGpuProgram* pmeGpuProgram)
{
    pme->gpu       = new PmeGpu();
    PmeGpu* pmeGpu = pme->gpu;
    changePinningPolicy(&pmeGpu->staging.h_forces, pme_get_pinning_policy());
    pmeGpu->common = std::make_shared<PmeShared>();

    /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */
    /* A convenience variable. */
    pmeGpu->settings.useDecomposition = (pme->nnodes != 1);
    /* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout. */
    pmeGpu->settings.performGPUGather = true;
    // By default GPU-side reduction is off (explicitly set here for tests, otherwise reset per-step)
    pmeGpu->settings.useGpuForceReduction = false;

    pme_gpu_set_testing(pmeGpu, false);

    GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
    pmeGpu->programHandle_ = pmeGpuProgram;

    pmeGpu->initializedClfftLibrary_ = std::make_unique<gmx::ClfftInitializer>();

    pme_gpu_init_internal(pmeGpu, deviceContext, deviceStream);

    pme_gpu_copy_common_data_from(pme);
    pme_gpu_alloc_energy_virial(pmeGpu);

    GMX_ASSERT(pmeGpu->common->epsilon_r != 0.0F, "PME GPU: bad electrostatic coefficient");

    auto* kernelParamsPtr               = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGpu->common->epsilon_r;
}
void pme_gpu_get_real_grid_sizes(const PmeGpu* pmeGpu, gmx::IVec* gridSize, gmx::IVec* paddedGridSize)
{
    GMX_ASSERT(gridSize != nullptr, "");
    GMX_ASSERT(paddedGridSize != nullptr, "");
    GMX_ASSERT(pmeGpu != nullptr, "");
    auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    for (int i = 0; i < DIM; i++)
    {
        (*gridSize)[i]       = kernelParamsPtr->grid.realGridSize[i];
        (*paddedGridSize)[i] = kernelParamsPtr->grid.realGridSizePadded[i];
    }
}
void pme_gpu_reinit(gmx_pme_t*           pme,
                    const DeviceContext* deviceContext,
                    const DeviceStream*  deviceStream,
                    const PmeGpuProgram* pmeGpuProgram)
{
    GMX_ASSERT(pme != nullptr, "Need valid PME object");

    if (!pme->gpu)
    {
        GMX_RELEASE_ASSERT(deviceContext != nullptr,
                           "Device context can not be nullptr when setting up PME on GPU.");
        GMX_RELEASE_ASSERT(deviceStream != nullptr,
                           "Device stream can not be nullptr when setting up PME on GPU.");
        /* First-time initialization */
        pme_gpu_init(pme, *deviceContext, *deviceStream, pmeGpuProgram);
    }
    else
    {
        /* After this call nothing in the GPU code should refer to the gmx_pme_t *pme itself - until the next pme_gpu_reinit */
        pme_gpu_copy_common_data_from(pme);
    }
    /* GPU FFT will only get used for a single rank. */
    pme->gpu->settings.performGPUFFT =
            (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme->gpu->settings.useDecomposition;
    pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU);

    /* Reinit active timers */
    pme_gpu_reinit_timings(pme->gpu);

    pme_gpu_reinit_grids(pme->gpu);
    // Note: if timing the reinit launch overhead becomes more relevant
    // (e.g. with regular PP-PME re-balancing), we should pass wcycle here.
    pme_gpu_reinit_computation(pme, nullptr);
    /* Clear the previous box - doesn't hurt, and forces the PME CPU recipbox
     * update for mixed mode on grid switch. TODO: use shared recipbox field.
     */
    std::memset(pme->gpu->common->previousBox, 0, sizeof(pme->gpu->common->previousBox));
}
void pme_gpu_destroy(PmeGpu* pmeGpu)
{
    /* Free lots of data */
    pme_gpu_free_energy_virial(pmeGpu);
    pme_gpu_free_bspline_values(pmeGpu);
    pme_gpu_free_forces(pmeGpu);
    pme_gpu_free_coefficients(pmeGpu);
    pme_gpu_free_spline_data(pmeGpu);
    pme_gpu_free_grid_indices(pmeGpu);
    pme_gpu_free_fract_shifts(pmeGpu);
    pme_gpu_free_grids(pmeGpu);

    pme_gpu_destroy_3dfft(pmeGpu);

    delete pmeGpu;
}
void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* chargesA, const real* chargesB)
{
    auto* kernelParamsPtr         = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->atoms.nAtoms = nAtoms;
    const int  block_size         = pme_gpu_get_atom_data_block_size();
    const int  nAtomsNewPadded    = ((nAtoms + block_size - 1) / block_size) * block_size;
    const bool haveToRealloc      = (pmeGpu->nAtomsAlloc < nAtomsNewPadded);
    pmeGpu->nAtomsAlloc           = nAtomsNewPadded;

#if GMX_DOUBLE
    GMX_RELEASE_ASSERT(false, "Only single precision supported");
    GMX_UNUSED_VALUE(charges);
#else
    int gridIndex = 0;
    /* Could also be checked for haveToRealloc, but the copy always needs to be performed */
    pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, reinterpret_cast<const float*>(chargesA), gridIndex);
    gridIndex++;
    if (chargesB != nullptr)
    {
        pme_gpu_realloc_and_copy_input_coefficients(
                pmeGpu, reinterpret_cast<const float*>(chargesB), gridIndex);
    }
    else
    {
        /* Fill the second set of coefficients with chargesA as well to be able to avoid
         * conditionals in the GPU kernels */
        /* FIXME: This should be avoided by making a separate templated version of the
         * relevant kernel(s) (probably only pme_gather_kernel). That would require a
         * reduction of the current number of templated parameters of that kernel. */
        pme_gpu_realloc_and_copy_input_coefficients(
                pmeGpu, reinterpret_cast<const float*>(chargesA), gridIndex);
    }
#endif

    if (haveToRealloc)
    {
        pme_gpu_realloc_forces(pmeGpu);
        pme_gpu_realloc_spline_data(pmeGpu);
        pme_gpu_realloc_grid_indices(pmeGpu);
    }
    pme_gpu_select_best_performing_pme_spreadgather_kernels(pmeGpu);
}
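/* Padding note: the atoms in the padded region [nAtoms, nAtomsAlloc) are fictitious; their charges
 * are cleared to zero in pme_gpu_realloc_and_copy_input_coefficients(), so they spread nothing
 * onto the grid and gather zero force, and full blocks of c_pmeAtomDataBlockSize atoms can be
 * processed uniformly.
 */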
/*! \libinternal \brief
 * Returns raw timing event from the corresponding GpuRegionTimer (if timings are enabled).
 * In CUDA, the result can be a nullptr stub, per the GpuRegionTimer implementation.
 *
 * \param[in] pmeGpu         The PME GPU data structure.
 * \param[in] PMEStageId     The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h
 */
static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, size_t PMEStageId)
{
    CommandEvent* timingEvent = nullptr;
    if (pme_gpu_timings_enabled(pmeGpu))
    {
        GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
                   "Wrong PME GPU timing event index");
        timingEvent = pmeGpu->archSpecific->timingEvents[PMEStageId].fetchNextEvent();
    }
    return timingEvent;
}
void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, const int grid_index)
{
    int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;

    pme_gpu_start_timing(pmeGpu, timerId);
    pmeGpu->archSpecific->fftSetup[grid_index]->perform3dFft(
            dir, pme_gpu_fetch_timing_event(pmeGpu, timerId));
    pme_gpu_stop_timing(pmeGpu, timerId);
}
/*! \brief
 * Given possibly large \p blockCount, returns a compact 1D or 2D grid for kernel scheduling,
 * to minimize number of unused blocks.
 */
std::pair<int, int> inline pmeGpuCreateGrid(const PmeGpu* pmeGpu, int blockCount)
{
    // How many maximum widths in X do we need (hopefully just one)
    const int minRowCount = (blockCount + pmeGpu->maxGridWidthX - 1) / pmeGpu->maxGridWidthX;
    // Trying to make things even
    const int colCount = (blockCount + minRowCount - 1) / minRowCount;
    GMX_ASSERT((colCount * minRowCount - blockCount) >= 0, "pmeGpuCreateGrid: totally wrong");
    GMX_ASSERT((colCount * minRowCount - blockCount) < minRowCount,
               "pmeGpuCreateGrid: excessive blocks");
    return std::pair<int, int>(colCount, minRowCount);
}
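/* Worked example (hypothetical sizes): with maxGridWidthX == 65535 and blockCount == 100000,
 * minRowCount == (100000 + 65534) / 65535 == 2 and colCount == (100000 + 1) / 2 == 50000,
 * giving a 50000 x 2 launch grid with exactly zero unused blocks; both asserts above hold.
 */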
/*! \brief
 * Returns a pointer to appropriate spline and spread kernel based on the input bool values
 *
 * \param[in]  pmeGpu                   The PME GPU structure.
 * \param[in]  threadsPerAtom           Controls whether we should use order or order*order threads per atom
 * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
 * \param[in]  numGrids                 Number of grids to use. numGrids == 2 if Coulomb is perturbed.
 *
 * \return Pointer to CUDA kernel
 */
static auto selectSplineAndSpreadKernelPtr(const PmeGpu*  pmeGpu,
                                           ThreadsPerAtom threadsPerAtom,
                                           bool           writeSplinesToGlobal,
                                           const int      numGrids)
{
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    if (writeSplinesToGlobal)
    {
        if (threadsPerAtom == ThreadsPerAtom::Order)
        {
            if (numGrids == 2)
            {
                kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4Dual;
            }
            else
            {
                kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4Single;
            }
        }
        else
        {
            if (numGrids == 2)
            {
                kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesDual;
            }
            else
            {
                kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesSingle;
            }
        }
    }
    else
    {
        if (threadsPerAtom == ThreadsPerAtom::Order)
        {
            if (numGrids == 2)
            {
                kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Dual;
            }
            else
            {
                kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Single;
            }
        }
        else
        {
            if (numGrids == 2)
            {
                kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelDual;
            }
            else
            {
                kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelSingle;
            }
        }
    }

    return kernelPtr;
}
/*! \brief
 * Returns a pointer to appropriate spline kernel based on the input bool values
 *
 * \param[in]  pmeGpu                   The PME GPU structure.
 * \param[in]  threadsPerAtom           Controls whether we should use order or order*order threads per atom
 * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
 * \param[in]  numGrids                 Number of grids to use. numGrids == 2 if Coulomb is perturbed.
 *
 * \return Pointer to CUDA kernel
 */
static auto selectSplineKernelPtr(const PmeGpu*   pmeGpu,
                                  ThreadsPerAtom  threadsPerAtom,
                                  bool gmx_unused writeSplinesToGlobal,
                                  const int       numGrids)
{
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    GMX_ASSERT(writeSplinesToGlobal,
               "Spline data should always be written to global memory when just calculating splines");

    if (threadsPerAtom == ThreadsPerAtom::Order)
    {
        if (numGrids == 2)
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4Dual;
        }
        else
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4Single;
        }
    }
    else
    {
        if (numGrids == 2)
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineKernelDual;
        }
        else
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineKernelSingle;
        }
    }

    return kernelPtr;
}
/*! \brief
 * Returns a pointer to appropriate spread kernel based on the input bool values
 *
 * \param[in]  pmeGpu                   The PME GPU structure.
 * \param[in]  threadsPerAtom           Controls whether we should use order or order*order threads per atom
 * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
 * \param[in]  numGrids                 Number of grids to use. numGrids == 2 if Coulomb is perturbed.
 *
 * \return Pointer to CUDA kernel
 */
static auto selectSpreadKernelPtr(const PmeGpu*  pmeGpu,
                                  ThreadsPerAtom threadsPerAtom,
                                  bool           writeSplinesToGlobal,
                                  const int      numGrids)
{
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    if (writeSplinesToGlobal)
    {
        if (threadsPerAtom == ThreadsPerAtom::Order)
        {
            if (numGrids == 2)
            {
                kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4Dual;
            }
            else
            {
                kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4Single;
            }
        }
        else
        {
            if (numGrids == 2)
            {
                kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelDual;
            }
            else
            {
                kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelSingle;
            }
        }
    }
    else
    {
        /* If we are not saving the spline data, we need to recalculate it
           using the spline and spread kernel */
        if (threadsPerAtom == ThreadsPerAtom::Order)
        {
            if (numGrids == 2)
            {
                kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Dual;
            }
            else
            {
                kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Single;
            }
        }
        else
        {
            if (numGrids == 2)
            {
                kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelDual;
            }
            else
            {
                kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelSingle;
            }
        }
    }

    return kernelPtr;
}
void pme_gpu_spread(const PmeGpu*         pmeGpu,
                    GpuEventSynchronizer* xReadyOnDevice,
                    real**                h_grids,
                    bool                  computeSplines,
                    bool                  spreadCharges,
                    const real            lambda)
{
    GMX_ASSERT(pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
               "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU");

    GMX_ASSERT(computeSplines || spreadCharges,
               "PME spline/spread kernel has invalid input (nothing to do)");
    auto* kernelParamsPtr = pmeGpu->kernelParams.get();
    GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread");

    const size_t blockSize = pmeGpu->programHandle_->impl_->spreadWorkGroupSize;

    const int order = pmeGpu->common->pme_order;
    GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
    const bool writeGlobal = pmeGpu->settings.copyAllOutputs;
    const int  threadsPerAtom =
            (pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::Order ? order : order * order);
    const bool recalculateSplines = pmeGpu->settings.recalculateSplines;

    GMX_ASSERT(!GMX_GPU_OPENCL || pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::OrderSquared,
               "Only 16 threads per atom supported in OpenCL");
    GMX_ASSERT(!GMX_GPU_OPENCL || !recalculateSplines,
               "Recalculating splines not supported in OpenCL");

    const int atomsPerBlock = blockSize / threadsPerAtom;

    // TODO: pick smaller block size in runtime if needed
    // (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
    // If doing so, change atomsPerBlock in the kernels as well.
    // TODO: test varying block sizes on modern arch-s as well
    // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
    // (for spline data mostly)
    GMX_ASSERT(!(c_pmeAtomDataBlockSize % atomsPerBlock),
               "inconsistent atom data padding vs. spreading block size");

    // Ensure that coordinates are ready on the device before launching spread;
    // only needed with CUDA on PP+PME ranks, not on separate PME ranks, in unit tests
    // nor in OpenCL as these cases use a single stream (hence xReadyOnDevice == nullptr).
    GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice != nullptr || pmeGpu->common->isRankPmeOnly
                       || pme_gpu_settings(pmeGpu).copyAllOutputs,
               "Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");

    if (xReadyOnDevice)
    {
        xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
    }

    const int blockCount = pmeGpu->nAtomsAlloc / atomsPerBlock;
    auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);

    if (pmeGpu->common->ngrids == 1)
    {
        kernelParamsPtr->current.scale = 1.0;
    }
    else
    {
        kernelParamsPtr->current.scale = 1.0 - lambda;
    }

    KernelLaunchConfig config;
    config.blockSize[0] = order;
    config.blockSize[1] = (pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::Order ? 1 : order);
    config.blockSize[2] = atomsPerBlock;
    config.gridSize[0]  = dimGrid.first;
    config.gridSize[1]  = dimGrid.second;

    int                                timingId;
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    if (computeSplines)
    {
        if (spreadCharges)
        {
            timingId  = gtPME_SPLINEANDSPREAD;
            kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
                                                       writeGlobal || (!recalculateSplines),
                                                       pmeGpu->common->ngrids);
        }
        else
        {
            timingId  = gtPME_SPLINE;
            kernelPtr = selectSplineKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
                                              writeGlobal || (!recalculateSplines),
                                              pmeGpu->common->ngrids);
        }
    }
    else
    {
        timingId  = gtPME_SPREAD;
        kernelPtr = selectSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
                                          writeGlobal || (!recalculateSplines), pmeGpu->common->ngrids);
    }

    pme_gpu_start_timing(pmeGpu, timingId);
    auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
#if c_canEmbedBuffers
    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
    const auto kernelArgs = prepareGpuKernelArguments(
            kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_theta,
            &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
            &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
            &kernelParamsPtr->grid.d_fractShiftsTable, &kernelParamsPtr->grid.d_gridlineIndicesTable,
            &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
            &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], &kernelParamsPtr->atoms.d_coordinates);
#endif

    launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent,
                    "PME spline/spread", kernelArgs);
    pme_gpu_stop_timing(pmeGpu, timingId);

    const auto& settings     = pmeGpu->settings;
    const bool  copyBackGrid = spreadCharges && (!settings.performGPUFFT || settings.copyAllOutputs);
    if (copyBackGrid)
    {
        for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
        {
            float* h_grid = h_grids[gridIndex];
            pme_gpu_copy_output_spread_grid(pmeGpu, h_grid, gridIndex);
        }
    }
    const bool copyBackAtomData =
            computeSplines && (!settings.performGPUGather || settings.copyAllOutputs);
    if (copyBackAtomData)
    {
        pme_gpu_copy_output_spread_atom_data(pmeGpu);
    }
}
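/* Launch-geometry illustration (hypothetical work-group size): with spreadWorkGroupSize == 256 and
 * the default ThreadsPerAtom::OrderSquared (16 threads per atom), atomsPerBlock == 256 / 16 == 16,
 * which divides c_pmeAtomDataBlockSize (64); the block is then laid out as 4 x 4 x 16 threads and
 * blockCount == nAtomsAlloc / 16 blocks are launched.
 */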
void pme_gpu_solve(const PmeGpu* pmeGpu,
                   const int     gridIndex,
                   t_complex*    h_grid,
                   GridOrdering  gridOrdering,
                   bool          computeEnergyAndVirial)
{
    GMX_ASSERT(pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
               "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU");
    GMX_ASSERT(gridIndex < pmeGpu->common->ngrids,
               "Invalid combination of gridIndex and number of grids");

    const auto& settings               = pmeGpu->settings;
    const bool  copyInputAndOutputGrid = !settings.performGPUFFT || settings.copyAllOutputs;

    auto* kernelParamsPtr = pmeGpu->kernelParams.get();

    float* h_gridFloat = reinterpret_cast<float*>(h_grid);
    if (copyInputAndOutputGrid)
    {
        copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex], h_gridFloat, 0,
                           pmeGpu->archSpecific->complexGridSize[gridIndex],
                           pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
    }

    int majorDim = -1, middleDim = -1, minorDim = -1;
    switch (gridOrdering)
    {
        case GridOrdering::YZX:
            majorDim  = YY;
            middleDim = ZZ;
            minorDim  = XX;
            break;

        case GridOrdering::XYZ:
            majorDim  = XX;
            middleDim = YY;
            minorDim  = ZZ;
            break;

        default: GMX_ASSERT(false, "Implement grid ordering here and below for the kernel launch");
    }

    const int maxBlockSize = pmeGpu->programHandle_->impl_->solveMaxWorkGroupSize;

    const int gridLineSize      = pmeGpu->kernelParams->grid.complexGridSize[minorDim];
    const int gridLinesPerBlock = std::max(maxBlockSize / gridLineSize, 1);
    const int blocksPerGridLine = (gridLineSize + maxBlockSize - 1) / maxBlockSize;
    int       cellsPerBlock;
    if (blocksPerGridLine == 1)
    {
        cellsPerBlock = gridLineSize * gridLinesPerBlock;
    }
    else
    {
        cellsPerBlock = (gridLineSize + blocksPerGridLine - 1) / blocksPerGridLine;
    }
    const int warpSize = pmeGpu->programHandle_->warpSize();
    // rounding up to full warps so that shuffle operations produce defined results
    const int blockSize = (cellsPerBlock + warpSize - 1) / warpSize * warpSize;

    static_assert(!GMX_GPU_CUDA || c_solveMaxWarpsPerBlock / 2 >= 4,
                  "The CUDA solve energy kernels need at least 4 warps. "
                  "Here we launch at least half of the max warps.");

    KernelLaunchConfig config;
    config.blockSize[0] = blockSize;
    config.gridSize[0]  = blocksPerGridLine;
    config.gridSize[1] = (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1)
                         / gridLinesPerBlock;
    config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim];

    int                                timingId  = gtPME_SOLVE;
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    if (gridOrdering == GridOrdering::YZX)
    {
        if (gridIndex == 0)
        {
            kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernelA
                                               : pmeGpu->programHandle_->impl_->solveYZXKernelA;
        }
        else
        {
            kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernelB
                                               : pmeGpu->programHandle_->impl_->solveYZXKernelB;
        }
    }
    else if (gridOrdering == GridOrdering::XYZ)
    {
        if (gridIndex == 0)
        {
            kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernelA
                                               : pmeGpu->programHandle_->impl_->solveXYZKernelA;
        }
        else
        {
            kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernelB
                                               : pmeGpu->programHandle_->impl_->solveXYZKernelB;
        }
    }

    pme_gpu_start_timing(pmeGpu, timingId);
    auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
#if c_canEmbedBuffers
    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
    const auto kernelArgs = prepareGpuKernelArguments(
            kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli[gridIndex],
            &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex],
            &kernelParamsPtr->grid.d_fourierGrid[gridIndex]);
#endif
    launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME solve",
                    kernelArgs);
    pme_gpu_stop_timing(pmeGpu, timingId);

    if (computeEnergyAndVirial)
    {
        copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy[gridIndex],
                             &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex], 0,
                             c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_,
                             pmeGpu->settings.transferKind, nullptr);
    }

    if (copyInputAndOutputGrid)
    {
        copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid[gridIndex], 0,
                             pmeGpu->archSpecific->complexGridSize[gridIndex],
                             pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
    }
}
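/* Launch-geometry illustration (hypothetical sizes): for a complex grid line of 33 cells in the
 * minor dimension and solveMaxWorkGroupSize == 256, gridLinesPerBlock == 256 / 33 == 7 and
 * blocksPerGridLine == 1, so cellsPerBlock == 33 * 7 == 231; with a warp size of 32 this is
 * rounded up to a block of 256 threads, and the launch grid spans blocksPerGridLine x
 * ceil(complexGridSize[middleDim] / 7) x complexGridSize[majorDim] blocks.
 */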
/*! \brief
 * Returns a pointer to appropriate gather kernel based on the input values
 *
 * \param[in]  pmeGpu                   The PME GPU structure.
 * \param[in]  threadsPerAtom           Controls whether we should use order or order*order threads per atom
 * \param[in]  readSplinesFromGlobal    bool controlling if we should read spline data from global memory
 * \param[in]  numGrids                 Number of grids to use. numGrids == 2 if Coulomb is perturbed.
 *
 * \return Pointer to CUDA kernel
 */
inline auto selectGatherKernelPtr(const PmeGpu*  pmeGpu,
                                  ThreadsPerAtom threadsPerAtom,
                                  bool           readSplinesFromGlobal,
                                  const int      numGrids)
{
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;

    if (readSplinesFromGlobal)
    {
        if (threadsPerAtom == ThreadsPerAtom::Order)
        {
            if (numGrids == 2)
            {
                kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4Dual;
            }
            else
            {
                kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4Single;
            }
        }
        else
        {
            if (numGrids == 2)
            {
                kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesDual;
            }
            else
            {
                kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesSingle;
            }
        }
    }
    else
    {
        if (threadsPerAtom == ThreadsPerAtom::Order)
        {
            if (numGrids == 2)
            {
                kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4Dual;
            }
            else
            {
                kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4Single;
            }
        }
        else
        {
            if (numGrids == 2)
            {
                kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelDual;
            }
            else
            {
                kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelSingle;
            }
        }
    }

    return kernelPtr;
}
void pme_gpu_gather(PmeGpu* pmeGpu, real** h_grids, const float lambda)
{
    GMX_ASSERT(pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
               "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU");

    const auto& settings = pmeGpu->settings;

    if (!settings.performGPUFFT || settings.copyAllOutputs)
    {
        for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
        {
            float* h_grid = const_cast<float*>(h_grids[gridIndex]);
            pme_gpu_copy_input_gather_grid(pmeGpu, h_grid, gridIndex);
        }
    }

    if (settings.copyAllOutputs)
    {
        pme_gpu_copy_input_gather_atom_data(pmeGpu);
    }

    /* Set if we have unit tests */
    const bool   readGlobal = pmeGpu->settings.copyAllOutputs;
    const size_t blockSize  = pmeGpu->programHandle_->impl_->gatherWorkGroupSize;
    const int    order      = pmeGpu->common->pme_order;
    GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
    const int threadsPerAtom =
            (pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::Order ? order : order * order);
    const bool recalculateSplines = pmeGpu->settings.recalculateSplines;

    GMX_ASSERT(!GMX_GPU_OPENCL || pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::OrderSquared,
               "Only 16 threads per atom supported in OpenCL");
    GMX_ASSERT(!GMX_GPU_OPENCL || !recalculateSplines,
               "Recalculating splines not supported in OpenCL");

    const int atomsPerBlock = blockSize / threadsPerAtom;

    GMX_ASSERT(!(c_pmeAtomDataBlockSize % atomsPerBlock),
               "inconsistent atom data padding vs. gathering block size");

    const int blockCount = pmeGpu->nAtomsAlloc / atomsPerBlock;
    auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);

    KernelLaunchConfig config;
    config.blockSize[0] = order;
    config.blockSize[1] = (pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::Order ? 1 : order);
    config.blockSize[2] = atomsPerBlock;
    config.gridSize[0]  = dimGrid.first;
    config.gridSize[1]  = dimGrid.second;

    // TODO test different cache configs

    int                                timingId = gtPME_GATHER;
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr =
            selectGatherKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
                                  readGlobal || (!recalculateSplines), pmeGpu->common->ngrids);
    // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl

    pme_gpu_start_timing(pmeGpu, timingId);
    auto* timingEvent     = pme_gpu_fetch_timing_event(pmeGpu, timingId);
    auto* kernelParamsPtr = pmeGpu->kernelParams.get();
    if (pmeGpu->common->ngrids == 1)
    {
        kernelParamsPtr->current.scale = 1.0;
    }
    else
    {
        kernelParamsPtr->current.scale = 1.0 - lambda;
    }

#if c_canEmbedBuffers
    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
    const auto kernelArgs = prepareGpuKernelArguments(
            kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
            &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
            &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
            &kernelParamsPtr->atoms.d_theta, &kernelParamsPtr->atoms.d_dtheta,
            &kernelParamsPtr->atoms.d_gridlineIndices, &kernelParamsPtr->atoms.d_forces);
#endif
    launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME gather",
                    kernelArgs);
    pme_gpu_stop_timing(pmeGpu, timingId);

    if (pmeGpu->settings.useGpuForceReduction)
    {
        pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream_);
    }
    else
    {
        pme_gpu_copy_output_forces(pmeGpu);
    }
}
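/* Note on force hand-off: when useGpuForceReduction is set, the gathered forces stay in d_forces
 * on the device and consumers access them via pme_gpu_get_kernelparam_forces() below,
 * synchronizing on the pmeForcesReady event from pme_gpu_get_forces_ready_synchronizer();
 * otherwise they are copied back to the pinned h_forces staging buffer right after the gather.
 */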
void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
{
    if (pmeGpu && pmeGpu->kernelParams)
    {
        return pmeGpu->kernelParams->atoms.d_forces;
    }
    else
    {
        return nullptr;
    }
}
void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx::RVec> d_x)
{
    GMX_ASSERT(pmeGpu && pmeGpu->kernelParams,
               "PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was "
               "initialized.");

    GMX_ASSERT(checkDeviceBuffer(d_x, pmeGpu->kernelParams->atoms.nAtoms),
               "The device-side buffer can not be set.");

    pmeGpu->kernelParams->atoms.d_coordinates = d_x;
}
GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu)
{
    if (pmeGpu && pmeGpu->kernelParams)
    {
        return &pmeGpu->archSpecific->pmeForcesReady;
    }
    else
    {
        return nullptr;
    }
}