/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief Implements Leap-Frog using CUDA
 *
 * This file contains the implementation of the basic Leap-Frog integrator
 * using CUDA, including class initialization, data-structure management
 * and the GPU kernel.
 *
 * \author Artem Zhmurov <zhmurov@gmail.com>
 *
 * \ingroup module_mdlib
 */
#include "gmxpre.h"

#include "leapfrog_gpu.cuh"

#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "gromacs/math/vec.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
#include "gromacs/utility/arrayref.h"

namespace gmx
{
/*! \brief Number of CUDA threads in a block.
 *
 * \todo Check if using a smaller block size will lead to better performance.
 */
constexpr static int c_threadsPerBlock = 256;
//! Maximum number of threads in a block (for __launch_bounds__)
constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
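
// Note: __launch_bounds__(c_maxThreadsPerBlock) below tells the compiler the largest
// block size the kernel is ever launched with, letting it balance register usage
// against the guaranteed occupancy.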
/*! \brief Sets the number of different temperature coupling values.
 *
 * This is needed to template the kernel.
 * \todo Unify with the similar enum in the CPU update module.
 */
enum class NumTempScaleValues
{
    None,    //!< No temperature coupling
    Single,  //!< Single T-scaling value (one group)
    Multiple //!< Multiple T-scaling values, need to use T-group indices
};
/*! \brief Different variants of the Parrinello-Rahman velocity scaling.
 *
 * This is needed to template the kernel.
 * \todo Unify with the similar enum in the CPU update module.
 */
enum class VelocityScalingType
{
    None,     //!< Do not apply velocity scaling (not a PR-coupling run or step)
    Diagonal, //!< Apply velocity scaling using a diagonal matrix
    Full      //!< Apply velocity scaling using a full matrix
};
/*! \brief Main kernel for the Leap-Frog integrator.
 *
 * The coordinates and velocities are updated on the GPU. Also saves the intermediate
 * values of the coordinates for further use in constraints.
 *
 * Each GPU thread works with a single particle. The empty declaration is needed to
 * avoid a "no previous prototype for function" clang warning.
 *
 * \todo Check if the force should be set to zero here.
 * \todo This kernel can also accumulate incidental temperatures for each atom.
 *
 * \tparam        numTempScaleValues               The number of different T-couple values.
 * \tparam        velocityScaling                  Type of the Parrinello-Rahman velocity rescaling.
 * \param[in]     numAtoms                         Total number of atoms.
 * \param[in,out] gm_x                             Coordinates to update upon integration.
 * \param[out]    gm_xp                            A copy of the coordinates before the integration (for constraints).
 * \param[in,out] gm_v                             Velocities to update.
 * \param[in]     gm_f                             Atomic forces.
 * \param[in]     gm_inverseMasses                 Reciprocal masses.
 * \param[in]     dt                               Timestep.
 * \param[in]     gm_lambdas                       Temperature scaling factors (one per group).
 * \param[in]     gm_tempScaleGroups               Mapping of atoms into groups.
 * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of the Parrinello-Rahman velocity
 *                                                 scaling matrix, pre-multiplied by the pressure-coupling
 *                                                 timestep on the host.
 */
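/* Per-atom update performed by the kernel (a sketch; the lambda and Parrinello-Rahman
 * terms are applied only when enabled by the corresponding template parameters):
 *
 *   v' = lambda * v - M_PR * v + f * dt / m
 *   x' = x + v' * dt
 *
 * where M_PR denotes the pre-multiplied scaling-matrix diagonal passed in as
 * prVelocityScalingMatrixDiagonal.
 */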
template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
__launch_bounds__(c_maxThreadsPerBlock) __global__
        void leapfrog_kernel(const int numAtoms,
                             float3* __restrict__ gm_x,
                             float3* __restrict__ gm_xp,
                             float3* __restrict__ gm_v,
                             const float3* __restrict__ gm_f,
                             const float* __restrict__ gm_inverseMasses,
                             const float dt,
                             const float* __restrict__ gm_lambdas,
                             const unsigned short* __restrict__ gm_tempScaleGroups,
                             const float3 prVelocityScalingMatrixDiagonal);
template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
__launch_bounds__(c_maxThreadsPerBlock) __global__
        void leapfrog_kernel(const int numAtoms,
                             float3* __restrict__ gm_x,
                             float3* __restrict__ gm_xp,
                             float3* __restrict__ gm_v,
                             const float3* __restrict__ gm_f,
                             const float* __restrict__ gm_inverseMasses,
                             const float dt,
                             const float* __restrict__ gm_lambdas,
                             const unsigned short* __restrict__ gm_tempScaleGroups,
                             const float3 prVelocityScalingMatrixDiagonal)
{
    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadIndex < numAtoms)
    {
        float3 x    = gm_x[threadIndex];
        float3 v    = gm_v[threadIndex];
        float3 f    = gm_f[threadIndex];
        float  im   = gm_inverseMasses[threadIndex];
        float  imdt = im * dt;

        // Swapping places for xp and x, so that x will contain the updated coordinates and xp
        // the coordinates before the update. This should be taken into account when (if)
        // constraints are applied after the update: x and xp have to be passed to constraints
        // in the 'wrong' order.
        gm_xp[threadIndex] = x;

        if (numTempScaleValues != NumTempScaleValues::None || velocityScaling != VelocityScalingType::None)
        {
            // The scaled velocity is accumulated in vp and written back into v below.
            float3 vp = v;

            if (numTempScaleValues != NumTempScaleValues::None)
            {
                float lambda = 1.0F;
                if (numTempScaleValues == NumTempScaleValues::Single)
                {
                    lambda = gm_lambdas[0];
                }
                else if (numTempScaleValues == NumTempScaleValues::Multiple)
                {
                    int tempScaleGroup = gm_tempScaleGroups[threadIndex];
                    lambda             = gm_lambdas[tempScaleGroup];
                }
                vp *= lambda;
            }

            if (velocityScaling == VelocityScalingType::Diagonal)
            {
                vp.x -= prVelocityScalingMatrixDiagonal.x * v.x;
                vp.y -= prVelocityScalingMatrixDiagonal.y * v.y;
                vp.z -= prVelocityScalingMatrixDiagonal.z * v.z;
            }

            v = vp;
        }

        // Leap-frog update: kick the velocity by f*dt/m, then drift the coordinates by v*dt.
        v += f * imdt;
        x += v * dt;

        gm_v[threadIndex] = v;
        gm_x[threadIndex] = x;
    }
}
/*! \brief Select templated kernel.
 *
 * Returns a pointer to the CUDA kernel based on the number of temperature coupling groups
 * and on whether temperature and/or pressure coupling is enabled.
 *
 * \param[in] doTemperatureScaling   Whether the kernel with temperature coupling velocity
 *                                   scaling should be selected.
 * \param[in] numTempScaleValues     Number of temperature coupling groups in the system.
 * \param[in] prVelocityScalingType  Type of the Parrinello-Rahman velocity scaling.
 *
 * \return                           Pointer to the CUDA kernel.
 */
inline auto selectLeapFrogKernelPtr(bool                doTemperatureScaling,
                                    int                 numTempScaleValues,
                                    VelocityScalingType prVelocityScalingType)
{
    // Check input for consistency: if there is temperature coupling, at least one coupling group should be defined.
    GMX_ASSERT(!doTemperatureScaling || (numTempScaleValues > 0),
               "Temperature coupling was requested with no temperature coupling groups.");
    auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;

    if (prVelocityScalingType == VelocityScalingType::None)
    {
        if (!doTemperatureScaling)
        {
            kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
        }
        else if (numTempScaleValues == 1)
        {
            kernelPtr = leapfrog_kernel<NumTempScaleValues::Single, VelocityScalingType::None>;
        }
        else if (numTempScaleValues > 1)
        {
            kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::None>;
        }
    }
    else if (prVelocityScalingType == VelocityScalingType::Diagonal)
    {
        if (!doTemperatureScaling)
        {
            kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::Diagonal>;
        }
        else if (numTempScaleValues == 1)
        {
            kernelPtr = leapfrog_kernel<NumTempScaleValues::Single, VelocityScalingType::Diagonal>;
        }
        else if (numTempScaleValues > 1)
        {
            kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::Diagonal>;
        }
    }
    else
    {
        GMX_RELEASE_ASSERT(false,
                           "Only isotropic Parrinello-Rahman pressure coupling is supported.");
    }
    return kernelPtr;
}
void LeapFrogGpu::integrate(const float3*                     d_x,
                            float3*                           d_xp,
                            float3*                           d_v,
                            const float3*                     d_f,
                            const float                       dt,
                            const bool                        doTemperatureScaling,
                            gmx::ArrayRef<const t_grp_tcstat> tcstat,
                            const bool                        doParrinelloRahman,
                            const float                       dtPressureCouple,
                            const matrix                      prVelocityScalingMatrix)
{
    ensureNoPendingCudaError("In CUDA version of Leap-Frog integrator");

    auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
    if (doTemperatureScaling || doParrinelloRahman)
    {
        if (doTemperatureScaling)
        {
            GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
                       "Number of temperature scaling factors changed since it was set for the "
                       "last time.");
            for (int i = 0; i < numTempScaleValues_; i++)
            {
                h_lambdas_[i] = tcstat[i].lambda;
            }
            copyToDeviceBuffer(&d_lambdas_, h_lambdas_.data(), 0, numTempScaleValues_,
                               commandStream_, GpuApiCallBehavior::Async, nullptr);
        }
        VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
        if (doParrinelloRahman)
        {
            prVelocityScalingType = VelocityScalingType::Diagonal;
            GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
                               && prVelocityScalingMatrix[ZZ][YY] == 0
                               && prVelocityScalingMatrix[XX][YY] == 0
                               && prVelocityScalingMatrix[XX][ZZ] == 0
                               && prVelocityScalingMatrix[YY][ZZ] == 0,
                       "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
                       "in GPU version of Leap-Frog integrator.");
            // The kernel expects the diagonal pre-multiplied by the pressure-coupling timestep.
            prVelocityScalingMatrixDiagonal_ =
                    make_float3(dtPressureCouple * prVelocityScalingMatrix[XX][XX],
                                dtPressureCouple * prVelocityScalingMatrix[YY][YY],
                                dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ]);
        }
        kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType);
    }

    const auto kernelArgs = prepareGpuKernelArguments(
            kernelPtr, kernelLaunchConfig_, &numAtoms_, &d_x, &d_xp, &d_v, &d_f, &d_inverseMasses_,
            &dt, &d_lambdas_, &d_tempScaleGroups_, &prVelocityScalingMatrixDiagonal_);
    launchGpuKernel(kernelPtr, kernelLaunchConfig_, nullptr, "leapfrog_kernel", kernelArgs);
}
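
/* A minimal host-side usage sketch (assumes an initialized CUDA stream and allocated
 * device buffers; the variable names here are hypothetical, only the LeapFrogGpu
 * calls are real):
 *
 *   gmx::LeapFrogGpu integrator(stream);
 *   integrator.set(mdatoms, numTempScaleValues, tempScaleGroups);
 *   // Every MD step, after the forces d_f have been computed:
 *   integrator.integrate(d_x, d_xp, d_v, d_f, dt, doTempCouple, tcstat,
 *                        doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
 */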
LeapFrogGpu::LeapFrogGpu(CommandStream commandStream) : commandStream_(commandStream)
{
    changePinningPolicy(&h_lambdas_, gmx::PinningPolicy::PinnedIfSupported);

    kernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
    kernelLaunchConfig_.blockSize[1]     = 1;
    kernelLaunchConfig_.blockSize[2]     = 1;
    kernelLaunchConfig_.sharedMemorySize = 0;
    kernelLaunchConfig_.stream           = commandStream_;
}
LeapFrogGpu::~LeapFrogGpu()
{
    freeDeviceBuffer(&d_inverseMasses_);
}
void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const unsigned short* tempScaleGroups)
{
    numAtoms_ = md.nr;
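    // Round-up integer division so that a partially filled final block still covers all
    // atoms, e.g. 1000 atoms with 256 threads per block launch 4 blocks.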
    kernelLaunchConfig_.gridSize[0] = (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
    numTempScaleValues_ = numTempScaleValues;

    reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_,
                           &numInverseMassesAlloc_, nullptr);
    copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, commandStream_,
                       GpuApiCallBehavior::Sync, nullptr);
    // The temperature scale group map is only used if there is more than one group.
    if (numTempScaleValues > 1)
    {
        reallocateDeviceBuffer(&d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_,
                               &numTempScaleGroupsAlloc_, nullptr);
        copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, commandStream_,
                           GpuApiCallBehavior::Sync, nullptr);
    }
    // If temperature coupling is enabled, we need to make space for the scaling factors.
    if (numTempScaleValues_ > 0)
    {
        h_lambdas_.resize(numTempScaleValues);
        reallocateDeviceBuffer(&d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, nullptr);
    }
}

} // namespace gmx