2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 * \brief This file contains internal function definitions for performing the PME calculations on GPU.
39 * These are not meant to be exposed outside of the PME GPU code.
40 * As of now, their bodies are still in the common pme_gpu.cpp files.
42 * \author Aleksei Iupinov <a.yupinov@gmail.com>
43 * \ingroup module_ewald
46 #ifndef GMX_EWALD_PME_GPU_INTERNAL_H
47 #define GMX_EWALD_PME_GPU_INTERNAL_H
49 #include "gromacs/fft/fft.h" // for the gmx_fft_direction enum
50 #include "gromacs/gpu_utils/gpu_macros.h" // for the GPU_FUNC_ macros
51 #include "gromacs/utility/arrayref.h"
53 #include "pme_gpu_types_host.h" // for the inline functions accessing PmeGpu members
57 struct gmx_pme_t
; // only used in pme_gpu_reinit
58 struct gmx_wallclock_gpu_pme_t
;
67 //! Type of spline data
68 enum class PmeSplineDataType
71 Derivatives
, // dtheta
72 }; //TODO move this into new and shiny pme.h (pme-types.h?)
74 //! PME grid dimension ordering (from major to minor)
75 enum class GridOrdering
81 /*! \libinternal \brief
82 * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
83 * Depends on CUDA-specific block sizes, needed for the atom data padding.
85 * \param[in] pmeGpu The PME GPU structure.
86 * \returns Number of atoms in a single GPU atom data chunk.
88 int pme_gpu_get_atom_data_alignment(const PmeGpu
*pmeGpu
);
90 /*! \libinternal \brief
91 * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
93 * \param[in] pmeGpu The PME GPU structure.
94 * \returns Number of atoms in a single GPU atom spline data chunk.
96 int pme_gpu_get_atoms_per_warp(const PmeGpu
*pmeGpu
);
98 /*! \libinternal \brief
99 * Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
101 * \param[in] pmeGpu The PME GPU structure.
103 GPU_FUNC_QUALIFIER
void pme_gpu_synchronize(const PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
)) GPU_FUNC_TERM
;
105 /*! \libinternal \brief
106 * Allocates the fixed size energy and virial buffer both on GPU and CPU.
108 * \param[in,out] pmeGpu The PME GPU structure.
110 void pme_gpu_alloc_energy_virial(PmeGpu
*pmeGpu
);
112 /*! \libinternal \brief
113 * Frees the energy and virial memory both on GPU and CPU.
115 * \param[in] pmeGpu The PME GPU structure.
117 void pme_gpu_free_energy_virial(PmeGpu
*pmeGpu
);
119 /*! \libinternal \brief
120 * Clears the energy and virial memory on GPU with 0.
121 * Should be called at the end of PME computation which returned energy/virial.
123 * \param[in] pmeGpu The PME GPU structure.
125 void pme_gpu_clear_energy_virial(const PmeGpu
*pmeGpu
);
127 /*! \libinternal \brief
128 * Reallocates and copies the pre-computed B-spline values to the GPU.
130 * \param[in,out] pmeGpu The PME GPU structure.
132 void pme_gpu_realloc_and_copy_bspline_values(PmeGpu
*pmeGpu
);
134 /*! \libinternal \brief
135 * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
137 * \param[in] pmeGpu The PME GPU structure.
139 void pme_gpu_free_bspline_values(const PmeGpu
*pmeGpu
);
141 /*! \libinternal \brief
142 * Reallocates the GPU buffer for the PME forces.
144 * \param[in] pmeGpu The PME GPU structure.
146 void pme_gpu_realloc_forces(PmeGpu
*pmeGpu
);
148 /*! \libinternal \brief
149 * Frees the GPU buffer for the PME forces.
151 * \param[in] pmeGpu The PME GPU structure.
153 void pme_gpu_free_forces(const PmeGpu
*pmeGpu
);
155 /*! \libinternal \brief
156 * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces).
157 * To be called e.g. after the bonded calculations.
159 * \param[in] pmeGpu The PME GPU structure.
161 void pme_gpu_copy_input_forces(PmeGpu
*pmeGpu
);
163 /*! \libinternal \brief
164 * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
166 * \param[in] pmeGpu The PME GPU structure.
168 void pme_gpu_copy_output_forces(PmeGpu
*pmeGpu
);
170 /*! \libinternal \brief
171 * Checks whether work in the PME GPU stream has completed.
173 * \param[in] pmeGpu The PME GPU structure.
175 * \returns True if work in the PME stream has completed.
177 bool pme_gpu_stream_query(const PmeGpu
*pmeGpu
);
179 /*! \libinternal \brief
180 * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
182 * \param[in] pmeGpu The PME GPU structure.
184 * Needs to be called on every DD step/in the beginning.
186 void pme_gpu_realloc_coordinates(const PmeGpu
*pmeGpu
);
188 /*! \libinternal \brief
189 * Frees the coordinates on the GPU.
191 * \param[in] pmeGpu The PME GPU structure.
193 void pme_gpu_free_coordinates(const PmeGpu
*pmeGpu
);
195 /*! \libinternal \brief
196 * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer.
197 * Clears the padded part if needed.
199 * \param[in] pmeGpu The PME GPU structure.
200 * \param[in] h_coefficients The input atom charges/coefficients.
202 * Does not need to be done for every PME computation, only whenever the local charges change.
203 * (So, in the beginning of the run, or on DD step).
205 void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu
*pmeGpu
,
206 const float *h_coefficients
);
208 /*! \libinternal \brief
209 * Frees the charges/coefficients on the GPU.
211 * \param[in] pmeGpu The PME GPU structure.
213 void pme_gpu_free_coefficients(const PmeGpu
*pmeGpu
);
215 /*! \libinternal \brief
216 * Reallocates the buffers on the GPU and the host for the atoms spline data.
218 * \param[in,out] pmeGpu The PME GPU structure.
220 void pme_gpu_realloc_spline_data(PmeGpu
*pmeGpu
);
222 /*! \libinternal \brief
223 * Frees the buffers on the GPU for the atoms spline data.
225 * \param[in] pmeGpu The PME GPU structure.
227 void pme_gpu_free_spline_data(const PmeGpu
*pmeGpu
);
229 /*! \libinternal \brief
230 * Reallocates the buffers on the GPU and the host for the particle gridline indices.
232 * \param[in,out] pmeGpu The PME GPU structure.
234 void pme_gpu_realloc_grid_indices(PmeGpu
*pmeGpu
);
236 /*! \libinternal \brief
237 * Frees the buffer on the GPU for the particle gridline indices.
239 * \param[in] pmeGpu The PME GPU structure.
241 void pme_gpu_free_grid_indices(const PmeGpu
*pmeGpu
);
243 /*! \libinternal \brief
244 * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
246 * \param[in] pmeGpu The PME GPU structure.
248 void pme_gpu_realloc_grids(PmeGpu
*pmeGpu
);
250 /*! \libinternal \brief
251 * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
253 * \param[in] pmeGpu The PME GPU structure.
255 void pme_gpu_free_grids(const PmeGpu
*pmeGpu
);
257 /*! \libinternal \brief
258 * Clears the real space grid on the GPU.
259 * Should be called at the end of each computation.
261 * \param[in] pmeGpu The PME GPU structure.
263 void pme_gpu_clear_grids(const PmeGpu
*pmeGpu
);
265 /*! \libinternal \brief
266 * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
268 * \param[in] pmeGpu The PME GPU structure.
270 void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu
*pmeGpu
);
272 /*! \libinternal \brief
273 * Frees the pre-computed fractional coordinates' shifts on the GPU.
275 * \param[in] pmeGpu The PME GPU structure.
277 void pme_gpu_free_fract_shifts(const PmeGpu
*pmeGpu
);
279 /*! \libinternal \brief
280 * Copies the input real-space grid from the host to the GPU.
282 * \param[in] pmeGpu The PME GPU structure.
283 * \param[in] h_grid The host-side grid buffer.
285 void pme_gpu_copy_input_gather_grid(const PmeGpu
*pmeGpu
,
288 /*! \libinternal \brief
289 * Copies the output real-space grid from the GPU to the host.
291 * \param[in] pmeGpu The PME GPU structure.
292 * \param[out] h_grid The host-side grid buffer.
294 void pme_gpu_copy_output_spread_grid(const PmeGpu
*pmeGpu
,
297 /*! \libinternal \brief
298 * Copies the spread output spline data and gridline indices from the GPU to the host.
300 * \param[in] pmeGpu The PME GPU structure.
302 void pme_gpu_copy_output_spread_atom_data(const PmeGpu
*pmeGpu
);
304 /*! \libinternal \brief
305 * Copies the gather input spline data and gridline indices from the host to the GPU.
307 * \param[in] pmeGpu The PME GPU structure.
309 void pme_gpu_copy_input_gather_atom_data(const PmeGpu
*pmeGpu
);
311 /*! \libinternal \brief
312 * Waits for the grid copying to the host-side buffer after spreading to finish.
314 * \param[in] pmeGpu The PME GPU structure.
316 void pme_gpu_sync_spread_grid(const PmeGpu
*pmeGpu
);
318 /*! \libinternal \brief
319 * Does the one-time GPU-framework specific PME initialization.
320 * For CUDA, the PME stream is created with the highest priority.
322 * \param[in] pmeGpu The PME GPU structure.
324 void pme_gpu_init_internal(PmeGpu
*pmeGpu
);
326 /*! \libinternal \brief
327 * Destroys the PME GPU-framework specific data.
328 * Should be called last in the PME GPU destructor.
330 * \param[in] pmeGpu The PME GPU structure.
332 void pme_gpu_destroy_specific(const PmeGpu
*pmeGpu
);
334 /*! \libinternal \brief
335 * Initializes the CUDA FFT structures.
337 * \param[in] pmeGpu The PME GPU structure.
339 void pme_gpu_reinit_3dfft(const PmeGpu
*pmeGpu
);
341 /*! \libinternal \brief
342 * Destroys the CUDA FFT structures.
344 * \param[in] pmeGpu The PME GPU structure.
346 void pme_gpu_destroy_3dfft(const PmeGpu
*pmeGpu
);
348 /* Several GPU event-based timing functions that live in pme_gpu_timings.cpp */
350 /*! \libinternal \brief
351 * Finalizes all the active PME GPU stage timings for the current computation. Should be called at the end of every computation.
353 * \param[in] pmeGpu The PME GPU structure.
355 void pme_gpu_update_timings(const PmeGpu
*pmeGpu
);
357 /*! \libinternal \brief
358 * Updates the internal list of active PME GPU stages (if timings are enabled).
360 * \param[in] pmeGpu The PME GPU data structure.
362 void pme_gpu_reinit_timings(const PmeGpu
*pmeGpu
);
365 * Resets the PME GPU timings. To be called at the reset MD step.
367 * \param[in] pmeGpu The PME GPU structure.
369 void pme_gpu_reset_timings(const PmeGpu
*pmeGpu
);
371 /*! \libinternal \brief
372 * Copies the PME GPU timings to the gmx_wallclock_gpu_t structure (for log output). To be called at the run end.
374 * \param[in] pmeGpu The PME GPU structure.
375 * \param[in] timings The gmx_wallclock_gpu_pme_t structure.
377 void pme_gpu_get_timings(const PmeGpu
*pmeGpu
,
378 gmx_wallclock_gpu_pme_t
*timings
);
380 /* The PME stages themselves */
382 /*! \libinternal \brief
383 * A GPU spline computation and charge spreading function.
385 * \param[in] pmeGpu The PME GPU structure.
386 * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0.
387 * \param[out] h_grid The host-side grid buffer (used only if the result of the spread is expected on the host,
388 * e.g. testing or host-side FFT)
389 * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed.
390 * \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
392 GPU_FUNC_QUALIFIER
void pme_gpu_spread(const PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
),
393 int GPU_FUNC_ARGUMENT(gridIndex
),
394 real
*GPU_FUNC_ARGUMENT(h_grid
),
395 bool GPU_FUNC_ARGUMENT(computeSplines
),
396 bool GPU_FUNC_ARGUMENT(spreadCharges
)) GPU_FUNC_TERM
;
398 /*! \libinternal \brief
399 * 3D FFT R2C/C2R routine.
401 * \param[in] pmeGpu The PME GPU structure.
402 * \param[in] direction Transform direction (real-to-complex or complex-to-real)
403 * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0.
405 void pme_gpu_3dfft(const PmeGpu
*pmeGpu
,
406 enum gmx_fft_direction direction
,
409 /*! \libinternal \brief
410 * A GPU Fourier space solving function.
412 * \param[in] pmeGpu The PME GPU structure.
413 * \param[in,out] h_grid The host-side input and output Fourier grid buffer (used only with testing or host-side FFT)
414 * \param[in] gridOrdering Specifies the dimenion ordering of the complex grid. TODO: store this information?
415 * \param[in] computeEnergyAndVirial Tells if the energy and virial computation should also be performed.
417 GPU_FUNC_QUALIFIER
void pme_gpu_solve(const PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
),
418 t_complex
*GPU_FUNC_ARGUMENT(h_grid
),
419 GridOrdering
GPU_FUNC_ARGUMENT(gridOrdering
),
420 bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial
)) GPU_FUNC_TERM
;
422 /*! \libinternal \brief
423 * A GPU force gathering function.
425 * \param[in] pmeGpu The PME GPU structure.
426 * \param[in] forceTreatment Tells how data in h_forces should be treated.
427 * TODO: determine efficiency/balance of host/device-side reductions.
428 * \param[in] h_grid The host-side grid buffer (used only in testing mode)
430 GPU_FUNC_QUALIFIER
void pme_gpu_gather(PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
),
431 PmeForceOutputHandling
GPU_FUNC_ARGUMENT(forceTreatment
),
432 const float *GPU_FUNC_ARGUMENT(h_grid
)) GPU_FUNC_TERM
;
434 /*! \brief Return pointer to device copy of coordinate data.
435 * \param[in] pmeGpu The PME GPU structure.
436 * \returns Pointer to coordinate data
438 GPU_FUNC_QUALIFIER DeviceBuffer
<float> pme_gpu_get_kernelparam_coordinates(const PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
)) GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer
<float> {});
440 /*! \brief Sets the device pointer to coordinate data
441 * \param[in] pmeGpu The PME GPU structure.
442 * \param[in] d_x Pointer to coordinate data
444 GPU_FUNC_QUALIFIER
void pme_gpu_set_kernelparam_coordinates(const PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
),
445 DeviceBuffer
<float> GPU_FUNC_ARGUMENT(d_x
)) GPU_FUNC_TERM
;
447 /*! \brief Return pointer to device copy of force data.
448 * \param[in] pmeGpu The PME GPU structure.
449 * \returns Pointer to force data
451 GPU_FUNC_QUALIFIER
void * pme_gpu_get_kernelparam_forces(const PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
453 /*! \brief Return pointer to GPU stream.
454 * \param[in] pmeGpu The PME GPU structure.
455 * \returns Pointer to stream object.
457 GPU_FUNC_QUALIFIER
void * pme_gpu_get_stream(const PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
459 /*! \brief Return pointer to GPU context (for OpenCL builds).
460 * \param[in] pmeGpu The PME GPU structure.
461 * \returns Pointer to context object.
463 GPU_FUNC_QUALIFIER
void * pme_gpu_get_context(const PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
465 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
466 * \param[in] pmeGpu The PME GPU structure.
467 * \returns Pointer to sync object
469 GPU_FUNC_QUALIFIER GpuEventSynchronizer
*pme_gpu_get_forces_ready_synchronizer(const PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
471 /* The inlined convenience PME GPU status getters */
473 /*! \libinternal \brief
474 * Tells if PME runs on multiple GPUs with the decomposition.
476 * \param[in] pmeGpu The PME GPU structure.
477 * \returns True if PME runs on multiple GPUs, false otherwise.
479 inline bool pme_gpu_uses_dd(const PmeGpu
*pmeGpu
)
481 return !pmeGpu
->settings
.useDecomposition
;
484 /*! \libinternal \brief
485 * Tells if PME performs the gathering stage on GPU.
487 * \param[in] pmeGpu The PME GPU structure.
488 * \returns True if the gathering is performed on GPU, false otherwise.
490 inline bool pme_gpu_performs_gather(const PmeGpu
*pmeGpu
)
492 return pmeGpu
->settings
.performGPUGather
;
495 /*! \libinternal \brief
496 * Tells if PME performs the FFT stages on GPU.
498 * \param[in] pmeGpu The PME GPU structure.
499 * \returns True if FFT is performed on GPU, false otherwise.
501 inline bool pme_gpu_performs_FFT(const PmeGpu
*pmeGpu
)
503 return pmeGpu
->settings
.performGPUFFT
;
506 /*! \libinternal \brief
507 * Tells if PME performs the grid (un-)wrapping on GPU.
509 * \param[in] pmeGpu The PME GPU structure.
510 * \returns True if (un-)wrapping is performed on GPU, false otherwise.
512 inline bool pme_gpu_performs_wrapping(const PmeGpu
*pmeGpu
)
514 return pmeGpu
->settings
.useDecomposition
;
517 /*! \libinternal \brief
518 * Tells if PME performs the grid solving on GPU.
520 * \param[in] pmeGpu The PME GPU structure.
521 * \returns True if solving is performed on GPU, false otherwise.
523 inline bool pme_gpu_performs_solve(const PmeGpu
*pmeGpu
)
525 return pmeGpu
->settings
.performGPUSolve
;
528 /*! \libinternal \brief
529 * Enables or disables the testing mode.
530 * Testing mode only implies copying all the outputs, even the intermediate ones, to the host,
531 * and also makes the copies synchronous.
533 * \param[in] pmeGpu The PME GPU structure.
534 * \param[in] testing Should the testing mode be enabled, or disabled.
536 inline void pme_gpu_set_testing(PmeGpu
*pmeGpu
, bool testing
)
540 pmeGpu
->settings
.copyAllOutputs
= testing
;
541 pmeGpu
->settings
.transferKind
= testing
? GpuApiCallBehavior::Sync
: GpuApiCallBehavior::Async
;
545 /*! \libinternal \brief
546 * Tells if PME is in the testing mode.
548 * \param[in] pmeGpu The PME GPU structure.
549 * \returns true if testing mode is enabled, false otherwise.
551 inline bool pme_gpu_is_testing(const PmeGpu
*pmeGpu
)
553 return pmeGpu
->settings
.copyAllOutputs
;
556 /* A block of C++ functions that live in pme_gpu_internal.cpp */
558 /*! \libinternal \brief
559 * Returns the energy and virial GPU outputs, useful for testing.
561 * It is the caller's responsibility to be aware of whether the GPU
562 * handled the solve stage.
564 * \param[in] pme The PME structure.
565 * \param[out] output Pointer to output where energy and virial should be stored.
567 GPU_FUNC_QUALIFIER
void
568 pme_gpu_getEnergyAndVirial(const gmx_pme_t
&GPU_FUNC_ARGUMENT(pme
),
569 PmeOutput
*GPU_FUNC_ARGUMENT(output
)) GPU_FUNC_TERM
;
571 /*! \libinternal \brief
572 * Returns the GPU outputs (forces, energy and virial)
574 * \param[in] pme The PME structure.
575 * \param[in] flags The combination of flags that affected this PME computation.
576 * The flags are the GMX_PME_ flags from pme.h.
577 * \returns The output object.
579 GPU_FUNC_QUALIFIER PmeOutput
580 pme_gpu_getOutput(const gmx_pme_t
&GPU_FUNC_ARGUMENT(pme
),
581 int GPU_FUNC_ARGUMENT(flags
)) GPU_FUNC_TERM_WITH_RETURN(PmeOutput
{});
583 /*! \libinternal \brief
584 * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
586 * \param[in] pmeGpu The PME GPU structure.
587 * \param[in] box The unit cell box.
589 GPU_FUNC_QUALIFIER
void pme_gpu_update_input_box(PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
),
590 const matrix
GPU_FUNC_ARGUMENT(box
)) GPU_FUNC_TERM
;
592 /*! \libinternal \brief
593 * Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host.
594 * If forces were computed, they will have arrived at the external host buffer provided to gather.
595 * If virial/energy were computed, they will have arrived into the internal staging buffer
596 * (even though that should have already happened before even launching the gather).
597 * Finally, cudaEvent_t based GPU timers get updated if enabled. They also need stream synchronization for correctness.
598 * Additionally, device-side buffers are cleared asynchronously for the next computation.
600 * \param[in] pmeGpu The PME GPU structure.
602 void pme_gpu_finish_computation(const PmeGpu
*pmeGpu
);
604 //! A binary enum for spline data layout transformation
605 enum class PmeLayoutTransform
611 /*! \libinternal \brief
612 * Rearranges the atom spline data between the GPU and host layouts.
613 * Only used for test purposes so far, likely to be horribly slow.
615 * \param[in] pmeGpu The PME GPU structure.
616 * \param[out] atc The PME CPU atom data structure (with a single-threaded layout).
617 * \param[in] type The spline data type (values or derivatives).
618 * \param[in] dimIndex Dimension index.
619 * \param[in] transform Layout transform type
621 GPU_FUNC_QUALIFIER
void pme_gpu_transform_spline_atom_data(const PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
),
622 const PmeAtomComm
*GPU_FUNC_ARGUMENT(atc
),
623 PmeSplineDataType
GPU_FUNC_ARGUMENT(type
),
624 int GPU_FUNC_ARGUMENT(dimIndex
),
625 PmeLayoutTransform
GPU_FUNC_ARGUMENT(transform
)) GPU_FUNC_TERM
;
627 /*! \libinternal \brief
628 * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
629 * which is laid out for GPU spread/gather kernels. The index is wrt the execution block,
630 * in range(0, atomsPerBlock * order * DIM).
631 * This is a wrapper, only used in unit tests.
632 * \param[in] order PME order
633 * \param[in] splineIndex Spline contribution index (from 0 to \p order - 1)
634 * \param[in] dimIndex Dimension index (from 0 to 2)
635 * \param[in] atomIndex Atom index wrt the block.
636 * \param[in] atomsPerWarp Number of atoms processed by a warp.
638 * \returns Index into theta or dtheta array using GPU layout.
640 int getSplineParamFullIndex(int order
,
646 /*! \libinternal \brief
647 * Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
649 * \param[in] pmeGpu The PME GPU structure.
650 * \param[out] gridSize Pointer to the grid dimensions to fill in.
651 * \param[out] paddedGridSize Pointer to the padded grid dimensions to fill in.
653 GPU_FUNC_QUALIFIER
void pme_gpu_get_real_grid_sizes(const PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
),
654 gmx::IVec
*GPU_FUNC_ARGUMENT(gridSize
),
655 gmx::IVec
*GPU_FUNC_ARGUMENT(paddedGridSize
)) GPU_FUNC_TERM
;
657 /*! \libinternal \brief
658 * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
660 * \param[in,out] pme The PME structure.
661 * \param[in] gpuInfo The GPU information structure.
662 * \param[in] pmeGpuProgram The PME GPU program data
663 * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
665 GPU_FUNC_QUALIFIER
void pme_gpu_reinit(gmx_pme_t
*GPU_FUNC_ARGUMENT(pme
),
666 const gmx_device_info_t
*GPU_FUNC_ARGUMENT(gpuInfo
),
667 PmeGpuProgramHandle
GPU_FUNC_ARGUMENT(pmeGpuProgram
)) GPU_FUNC_TERM
;
669 /*! \libinternal \brief
670 * Destroys the PME GPU data at the end of the run.
672 * \param[in] pmeGpu The PME GPU structure.
674 GPU_FUNC_QUALIFIER
void pme_gpu_destroy(PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
)) GPU_FUNC_TERM
;
676 /*! \libinternal \brief
677 * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
679 * \param[in] pmeGpu The PME GPU structure.
680 * \param[in] nAtoms The number of particles.
681 * \param[in] charges The pointer to the host-side array of particle charges.
683 * This is a function that should only be called in the beginning of the run and on domain decomposition.
684 * Should be called before the pme_gpu_set_io_ranges.
686 GPU_FUNC_QUALIFIER
void pme_gpu_reinit_atoms(PmeGpu
*GPU_FUNC_ARGUMENT(pmeGpu
),
687 int GPU_FUNC_ARGUMENT(nAtoms
),
688 const real
*GPU_FUNC_ARGUMENT(charges
)) GPU_FUNC_TERM
;
690 /*! \brief \libinternal
691 * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
693 * This clears the device-side working buffers in preparation for new computation.
695 * \param[in] pmeGpu The PME GPU structure.
697 void pme_gpu_reinit_computation(const PmeGpu
*pmeGpu
);
700 * Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy
701 * (if they were to be computed).
703 * \param[in] pme The PME data structure.
704 * \param[in] flags The combination of flags to affect this PME computation.
705 * The flags are the GMX_PME_ flags from pme.h.
706 * \param[out] wcycle The wallclock counter.
707 * \return The output forces, energy and virial
709 GPU_FUNC_QUALIFIER PmeOutput
710 pme_gpu_wait_finish_task(gmx_pme_t
*GPU_FUNC_ARGUMENT(pme
),
711 int GPU_FUNC_ARGUMENT(flags
),
712 gmx_wallcycle
*GPU_FUNC_ARGUMENT(wcycle
)) GPU_FUNC_TERM_WITH_RETURN(PmeOutput
{}