 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 *
 * \brief Implements LINCS using CUDA
 *
 * This file contains the implementation of the LINCS constraints algorithm
 * using CUDA, including class initialization and data-structure management.
 *
 * \author Artem Zhmurov <zhmurov@gmail.com>
 * \author Alan Gray <alang@nvidia.com>
 *
 * \ingroup module_mdlib

#include "lincs_gpu.cuh"

#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/devicebuffer.cuh"
#include "gromacs/gpu_utils/gputraits.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
#include "gromacs/topology/forcefieldparameters.h"
#include "gromacs/topology/ifunc.h"
#include "gromacs/topology/topology.h"

//! Number of CUDA threads in a block
constexpr static int c_threadsPerBlock = 256;
//! Maximum number of threads in a block (for __launch_bounds__)
constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;

/*! \brief Main kernel for LINCS constraints.
 *
 * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm.
 *
 * In the CUDA version, one thread is responsible for all computations for one constraint. The blocks
 * are filled in a way that no constraint is coupled to a constraint from the next block. This is
 * achieved by moving active threads to the next block if the corresponding group of coupled
 * constraints is too big to fit into the current thread block. This may leave some 'dummy' threads
 * at the end of the thread block, i.e. threads that are not required to do actual work. Since
 * constraints from different blocks are not coupled, there is no need to synchronize across the
 * device. However, extensive communication within a thread block is still needed.
 *
 * \todo Reduce synchronization overhead. Some ideas are:
 *       1. Consider going to warp-level synchronization for the coupled constraints.
 *       2. Move more data to local/shared memory and try to get rid of atomic operations (at least
 *          on the device level).
 *       3. Use an analytical solution for matrix A inversion.
 *       4. Introduce mapping of thread id to both single constraint and single atom, thus designating
 *          Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints.
 *       See Issue #2885 for details (https://gitlab.com/gromacs/gromacs/-/issues/2885).
 * \todo The use of __restrict__ for gm_xp and gm_v causes failure, probably because of the atomic
 *       operations. Investigate this issue further.
 *
 * \param[in,out] kernelParams  All parameters and pointers for the kernel condensed in a single struct.
 * \param[in]     invdt         Inverse timestep (needed to update velocities).
 */
template<bool updateVelocities, bool computeVirial>
__launch_bounds__(c_maxThreadsPerBlock) __global__
        void lincs_kernel(LincsGpuKernelParameters kernelParams,
                          const float3* __restrict__ gm_x,
    const PbcAiuc pbcAiuc               = kernelParams.pbcAiuc;
    const int     numConstraintsThreads = kernelParams.numConstraintsThreads;
    const int     numIterations         = kernelParams.numIterations;
    const int     expansionOrder        = kernelParams.expansionOrder;
    const int2* __restrict__ gm_constraints               = kernelParams.d_constraints;
    const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths;
    const int* __restrict__ gm_coupledConstraintsCounts   = kernelParams.d_coupledConstraintsCounts;
    const int* __restrict__ gm_coupledConstraintsIdxes    = kernelParams.d_coupledConstraintsIndices;
    const float* __restrict__ gm_massFactors              = kernelParams.d_massFactors;
    float* __restrict__ gm_matrixA                        = kernelParams.d_matrixA;
    const float* __restrict__ gm_inverseMasses            = kernelParams.d_inverseMasses;
    float* __restrict__ gm_virialScaled                   = kernelParams.d_virialScaled;

    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;

    // numConstraintsThreads should be an integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize).
    // This is to ensure proper synchronization and reduction. All arrays are padded to the required size.
    assert(threadIndex < numConstraintsThreads);
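
    // Illustrative example of the padding scheme (numbers are hypothetical): with c_threadsPerBlock = 256
    // and 300 constraints whose last coupled group of 20 constraints starts at index 250, that group is
    // moved to the second block. Indices 250-255 of the first block then become dummy threads (i == -1
    // below), and numConstraintsThreads is rounded up to 512.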

    // Vectors connecting constrained atoms before the algorithm was applied.
    // Needed to construct the constraint matrix A.
    extern __shared__ float3 sm_r[];

    int2 pair = gm_constraints[threadIndex];

    // Mass-scaled Lagrange multiplier
    float lagrangeScaled = 0.0f;

    float sqrtReducedMass;

    // i == -1 indicates a dummy constraint at the end of the thread block.
    bool isDummyThread = (i == -1);

    // Everything computed for these dummies will be equal to zero
        sqrtReducedMass = 0.0f;

        xi = make_float3(0.0f, 0.0f, 0.0f);
        xj = make_float3(0.0f, 0.0f, 0.0f);
        rc = make_float3(0.0f, 0.0f, 0.0f);

        targetLength    = gm_constraintsTargetLengths[threadIndex];
        inverseMassi    = gm_inverseMasses[i];
        inverseMassj    = gm_inverseMasses[j];
        sqrtReducedMass = rsqrt(inverseMassi + inverseMassj);

        float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);

        float rlen = rsqrtf(dx.x * dx.x + dx.y * dx.y + dx.z * dx.z);

    sm_r[threadIdx.x] = rc;
    // Make sure that all r's are saved into shared memory
    // before they are accessed in the loop below

    /*
     * Constructing LINCS matrix (A)
     */

    // Only non-zero values are saved (for coupled constraints)
    int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex];
    for (int n = 0; n < coupledConstraintsCount; n++)
        int index = n * numConstraintsThreads + threadIndex;
        int c1    = gm_coupledConstraintsIdxes[index];

        float3 rc1        = sm_r[c1 - blockIdx.x * blockDim.x];
        gm_matrixA[index] = gm_massFactors[index] * (rc.x * rc1.x + rc.y * rc1.y + rc.z * rc1.z);
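
        // Note (illustrative): rc and rc1 are unit vectors along the two coupled constraints, so the
        // dot product above is the cosine of the angle between them and each stored element is
        // massFactor * cos(theta), i.e. an off-diagonal element of the LINCS coupling matrix.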

    // Skipping in dummy threads

    float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);

    float sol = sqrtReducedMass * ((rc.x * dx.x + rc.y * dx.y + rc.z * dx.z) - targetLength);

    /*
     * Invert the matrix using a set of expansionOrder matrix multiplications
     */

    // This will use the same memory space as sm_r, which is no longer needed.
    extern __shared__ float sm_rhs[];
    // Save the current right-hand-side vector in the shared memory
    sm_rhs[threadIdx.x] = sol;

    for (int rec = 0; rec < expansionOrder; rec++)
        // Making sure that all sm_rhs are saved before they are accessed in the loop below

        for (int n = 0; n < coupledConstraintsCount; n++)
            int index = n * numConstraintsThreads + threadIndex;
            int c1    = gm_coupledConstraintsIdxes[index];
            // Multiply the current right-hand side by A.
            // Different, non-overlapping parts of sm_rhs[..] are read during odd and even iterations.
            mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
        // 'Switch' the rhs vectors, save the current result.
        // These values will be accessed in the loop above during the next iteration.
        sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
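
        // Illustrative indexing (assuming blockDim.x == 256): on even iterations (rec % 2 == 0) the
        // loop above reads right-hand sides from sm_rhs[0..255] and the line above writes the result
        // to sm_rhs[256..511]; on odd iterations the two halves swap roles. This double buffering
        // avoids overwriting values that other threads of the block still need to read.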

    // Current mass-scaled Lagrange multiplier
    lagrangeScaled = sqrtReducedMass * sol;

    // Save updated coordinates before the correction for the rotational lengthening
    float3 tmp = rc * lagrangeScaled;

    // Writing for all but dummy constraints
        atomicAdd(&gm_xp[i], -tmp * inverseMassi);
        atomicAdd(&gm_xp[j], tmp * inverseMassj);

    /*
     * Correction for centripetal effects
     */
    for (int iter = 0; iter < numIterations; iter++)
        // Make sure that all xp's are saved: the atomic operation calls before are
        // communicating current xp[..] values across the thread block.

        float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);

        float len2  = targetLength * targetLength;
        float dlen2 = 2.0f * len2 - norm2(dx);

        // TODO A slightly more efficient but less readable version of the code below would be:
        //      float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
            proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2));
            proj = sqrtReducedMass * targetLength;

        sm_rhs[threadIdx.x] = proj;
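
        // Sketch of the correction above: with d0 = targetLength and the current constraint
        // vector dx, the projection is
        //     proj = sqrtReducedMass * (d0 - sqrt(max(2 * d0^2 - |dx|^2, 0)))
        // which vanishes when the constraint already has its target length and grows with the
        // rotational lengthening accumulated during the step.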

        /*
         * The same matrix inversion as above is used for the updated data
         */
        for (int rec = 0; rec < expansionOrder; rec++)
            // Make sure that all elements of rhs are saved into shared memory

            for (int n = 0; n < coupledConstraintsCount; n++)
                int index = n * numConstraintsThreads + threadIndex;
                int c1    = gm_coupledConstraintsIdxes[index];

                mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
            sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;

        // Add corrections to the Lagrange multipliers
        float sqrtmu_sol = sqrtReducedMass * sol;
        lagrangeScaled += sqrtmu_sol;

        // Save updated coordinates for the next iteration
        // Dummy constraints are skipped
            float3 tmp = rc * sqrtmu_sol;
            atomicAdd(&gm_xp[i], -tmp * inverseMassi);
            atomicAdd(&gm_xp[j], tmp * inverseMassj);

    // Updating particle velocities for all but dummy threads
    if (updateVelocities && !isDummyThread)
        float3 tmp = rc * invdt * lagrangeScaled;
        atomicAdd(&gm_v[i], -tmp * inverseMassi);
        atomicAdd(&gm_v[j], tmp * inverseMassj);

        // The virial is computed from the Lagrange multiplier (lagrangeScaled), the target constraint
        // length (targetLength) and the normalized vector connecting the constrained atoms before
        // the algorithm was applied (rc). The evaluation of the virial in each thread is
        // followed by a basic reduction of the values inside a single thread block.
        // Then, the values are reduced across the grid by atomicAdd(...).

        // TODO Shuffle reduction.
        // TODO Should be unified and/or done once when the virial is actually needed.
        // TODO A recursive version that removes the atomicAdd(...)'s entirely is needed. Ideally,
        //      one that works for any datatype.

        // Save the virial for each thread into shared memory. The tensor is symmetric, hence only
        // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
        // lagrangeScaled and rc are all set to zero for them at the beginning of the kernel.
        // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
        // two are no longer in use.
        extern __shared__ float sm_threadVirial[];
        float mult = targetLength * lagrangeScaled;
        sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x;
        sm_threadVirial[1 * blockDim.x + threadIdx.x] = mult * rc.x * rc.y;
        sm_threadVirial[2 * blockDim.x + threadIdx.x] = mult * rc.x * rc.z;
        sm_threadVirial[3 * blockDim.x + threadIdx.x] = mult * rc.y * rc.y;
        sm_threadVirial[4 * blockDim.x + threadIdx.x] = mult * rc.y * rc.z;
        sm_threadVirial[5 * blockDim.x + threadIdx.x] = mult * rc.z * rc.z;

        // Reduce up to one virial per thread block. The block is divided in half, and the first
        // half of the threads sums pairs of virials. Then the first half is halved again and its
        // first half sums pairs of values. This procedure is repeated until only one thread is left.
        // It only works if the number of threads per block is a power of two (hence the static_assert
        // in the LincsGpu constructor).
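
        // Worked example of the reduction below (assuming blockDim.x == 256 and warpSize == 32):
        // divideBy = 2 gives dividedAt = 128, so threads 0..127 each add the virial of thread
        // (threadIdx.x + 128); the following rounds use dividedAt = 64, 32, 16, ..., 1.
        // __syncthreads() is only called while dividedAt > warpSize / 2 (i.e. for 128, 64 and 32);
        // the remaining additions involve threads of a single warp only. After the last round the
        // block total for component d is left in sm_threadVirial[d * blockDim.x].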
        for (int divideBy = 2; divideBy <= static_cast<int>(blockDim.x); divideBy *= 2)
            int dividedAt = blockDim.x / divideBy;
            if (static_cast<int>(threadIdx.x) < dividedAt)
                for (int d = 0; d < 6; d++)
                    sm_threadVirial[d * blockDim.x + threadIdx.x] +=
                            sm_threadVirial[d * blockDim.x + (threadIdx.x + dividedAt)];
            // Synchronize if not within one warp
            if (dividedAt > warpSize / 2)

        // The first 6 threads in the block add the results of the 6 tensor components to the global memory address.
            atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]);

/*! \brief Select templated kernel.
 *
 * Returns a pointer to a CUDA kernel based on the provided booleans.
 *
 * \param[in] updateVelocities  If the velocities should be constrained.
 * \param[in] computeVirial     If the virial should be updated.
 *
 * \return Pointer to the CUDA kernel.
 */
inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVirial)
    auto kernelPtr = lincs_kernel<true, true>;
    if (updateVelocities && computeVirial)
        kernelPtr = lincs_kernel<true, true>;
    else if (updateVelocities && !computeVirial)
        kernelPtr = lincs_kernel<true, false>;
    else if (!updateVelocities && computeVirial)
        kernelPtr = lincs_kernel<false, true>;
    else if (!updateVelocities && !computeVirial)
        kernelPtr = lincs_kernel<false, false>;
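
    // For example (illustrative): getLincsKernelPtr(true, false) returns lincs_kernel<true, false>
    // (constrain velocities, no virial), so the branching on the two booleans happens once per
    // launch on the host instead of in every GPU thread.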

void LincsGpu::apply(const float3* d_x,
                     const bool    updateVelocities,
                     const bool    computeVirial,
                     const PbcAiuc pbcAiuc)
    ensureNoPendingDeviceError("In CUDA version of LINCS");

    // Early exit if no constraints
    if (kernelParams_.numConstraintsThreads == 0)

        // Fill with zeros so the values can be reduced into it
        // Only 6 values are needed because the virial is symmetric
        clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, deviceStream_);

    auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);

    KernelLaunchConfig config;
    config.blockSize[0] = c_threadsPerBlock;
    config.blockSize[1] = 1;
    config.blockSize[2] = 1;
    config.gridSize[0]  = (kernelParams_.numConstraintsThreads + c_threadsPerBlock - 1) / c_threadsPerBlock;
    config.gridSize[1]  = 1;
    config.gridSize[2]  = 1;

    // Shared memory is used to store:
    // -- Current coordinates (3 floats per thread)
    // -- Right-hand sides for matrix inversion (2 floats per thread)
    // -- Virial tensor components (6 floats per thread)
    // Since none of these three are needed simultaneously, they can be saved at the same shared memory
    // address (i.e. the corresponding arrays are intentionally overlapped in address space). Consequently,
    // only max{3, 2, 6} = 6 floats per thread are needed in case the virial is computed, or max{3, 2} = 3 if not.
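    // With c_threadsPerBlock = 256 this amounts to 256 * 6 * 4 = 6144 bytes of shared memory per
    // block when the virial is computed, and 256 * 3 * 4 = 3072 bytes otherwise (4 bytes per float).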
        config.sharedMemorySize = c_threadsPerBlock * 6 * sizeof(float);
        config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float);

    kernelParams_.pbcAiuc = pbcAiuc;

    const auto kernelArgs =
            prepareGpuKernelArguments(kernelPtr, config, &kernelParams_, &d_x, &d_xp, &d_v, &invdt);

    launchGpuKernel(kernelPtr, config, deviceStream_, nullptr,
                    "lincs_kernel<updateVelocities, computeVirial>", kernelArgs);

        // Copy LINCS virial data and add it to the common virial
        copyFromDeviceBuffer(h_virialScaled_.data(), &kernelParams_.d_virialScaled, 0, 6,
                             deviceStream_, GpuApiCallBehavior::Sync, nullptr);

        // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
        virialScaled[XX][XX] += h_virialScaled_[0];
        virialScaled[XX][YY] += h_virialScaled_[1];
        virialScaled[XX][ZZ] += h_virialScaled_[2];

        virialScaled[YY][XX] += h_virialScaled_[1];
        virialScaled[YY][YY] += h_virialScaled_[3];
        virialScaled[YY][ZZ] += h_virialScaled_[4];

        virialScaled[ZZ][XX] += h_virialScaled_[2];
        virialScaled[ZZ][YY] += h_virialScaled_[4];
        virialScaled[ZZ][ZZ] += h_virialScaled_[5];

LincsGpu::LincsGpu(int numIterations,
                   const DeviceContext& deviceContext,
                   const DeviceStream&  deviceStream) :
    deviceContext_(deviceContext),
    deviceStream_(deviceStream)
    kernelParams_.numIterations  = numIterations;
    kernelParams_.expansionOrder = expansionOrder;

    static_assert(sizeof(real) == sizeof(float),
                  "Real numbers should be in single precision in GPU code.");
    static_assert(c_threadsPerBlock > 0 && ((c_threadsPerBlock & (c_threadsPerBlock - 1)) == 0),
                  "Number of threads per block should be a power of two in order for reduction to work.");

    allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, deviceContext_);
    h_virialScaled_.resize(6);

    // The data arrays should be expanded/reallocated on the first call of the set() function.
    numConstraintsThreadsAlloc_ = 0;

LincsGpu::~LincsGpu()
    freeDeviceBuffer(&kernelParams_.d_virialScaled);

    if (numConstraintsThreadsAlloc_ > 0)
        freeDeviceBuffer(&kernelParams_.d_constraints);
        freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);

        freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
        freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
        freeDeviceBuffer(&kernelParams_.d_massFactors);
        freeDeviceBuffer(&kernelParams_.d_matrixA);

    if (numAtomsAlloc_ > 0)
        freeDeviceBuffer(&kernelParams_.d_inverseMasses);

//! Helper type for discovering coupled constraints
struct AtomsAdjacencyListElement
    AtomsAdjacencyListElement(const int indexOfSecondConstrainedAtom,
                              const int indexOfConstraint,
                              const int signFactor) :
        indexOfSecondConstrainedAtom_(indexOfSecondConstrainedAtom),
        indexOfConstraint_(indexOfConstraint),
        signFactor_(signFactor)

    //! The index of the other atom constrained to this atom.
    int indexOfSecondConstrainedAtom_;
    //! The index of this constraint in the container of constraints.
    int indexOfConstraint_;
    /*! \brief A multiplicative factor that indicates the relative
     * order of the atoms in the atom list.
     *
     * Used for computing the mass factor of this constraint
     * relative to any coupled constraints. */

//! Constructs and returns an atom constraint adjacency list
static std::vector<std::vector<AtomsAdjacencyListElement>>
constructAtomsAdjacencyList(const int numAtoms, ArrayRef<const int> iatoms)
    const int stride         = 1 + NRAL(F_CONSTR);
    const int numConstraints = iatoms.ssize() / stride;

    std::vector<std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList(numAtoms);
    for (int c = 0; c < numConstraints; c++)
        int a1 = iatoms[stride * c + 1];
        int a2 = iatoms[stride * c + 2];

        // Each constraint will be represented as a tuple containing the index of the second
        // constrained atom, the index of the constraint, and a sign that indicates the order in
        // which the atoms are listed. The sign is needed to compute the mass factors.
        atomsAdjacencyList[a1].emplace_back(a2, c, +1);
        atomsAdjacencyList[a2].emplace_back(a1, c, -1);

    return atomsAdjacencyList;
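
// Illustrative example (hypothetical constraint list): for two constraints c0 = (0, 1) and
// c1 = (0, 2), the adjacency list is
//     atomsAdjacencyList[0] = { (1, c0, +1), (2, c1, +1) }
//     atomsAdjacencyList[1] = { (0, c0, -1) }
//     atomsAdjacencyList[2] = { (0, c1, -1) }
// i.e. atom 0 participates in both constraints and carries the +1 sign because it is listed
// first in each of them.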

/*! \brief Helper function to go through constraints recursively.
 *
 * For each constraint, counts the number of coupled constraints and stores the value in the \p numCoupledConstraints array.
 * This information is used to split the array of constraints between thread blocks on a GPU so that there is no
 * coupling between constraints from different thread blocks. After the \p numCoupledConstraints array is filled, the
 * value \p numCoupledConstraints[c] should be equal to the number of constraints that are coupled to \p c and located
 * after it in the constraints array.
 *
 * \param[in]     a                      Atom index.
 * \param[in,out] numCoupledConstraints  Indicates if the constraint was already counted and stores
 *                                       the number of constraints (i) connected to it and (ii) located
 *                                       after it in memory. This array is filled by this recursive function.
 *                                       For a set of coupled constraints, the number of consecutive coupled
 *                                       constraints is only needed for the first constraint of the set: if there is
 *                                       not enough space for this set of constraints in the thread block,
 *                                       the group has to be moved to the next one.
 * \param[in]     atomsAdjacencyList     Stores information about connections between atoms.
 */
inline int countCoupled(int a,
                        ArrayRef<int> numCoupledConstraints,
                        ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
    for (const auto& adjacentAtom : atomsAdjacencyList[a])
        const int c2 = adjacentAtom.indexOfConstraint_;
        if (numCoupledConstraints[c2] == -1)
            numCoupledConstraints[c2] = 0; // To indicate we've been here
                       + countCoupled(adjacentAtom.indexOfSecondConstrainedAtom_,
                                      numCoupledConstraints, atomsAdjacencyList);
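
// Note on the recursion above (illustrative): countCoupled() walks the whole group of mutually
// coupled constraints reachable from atom 'a', marks every visited constraint by setting its entry
// in numCoupledConstraints to 0, and returns how many previously unvisited constraints it
// encountered. As a result, only the first constraint of each coupled group (in the order used by
// countNumCoupledConstraints() below) keeps a positive count for the group, which is what the
// splitting logic in LincsGpu::set() relies on.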

/*! \brief Add a constraint to \p splitMap with all constraints coupled to it.
 *
 * Adds the constraint \p c from the constraint list \p iatoms to the map \p splitMap
 * if it was not yet added. Then goes through all the constraints coupled to \p c
 * and calls itself recursively. This ensures that all the coupled constraints will
 * be added to neighboring locations in the final data structures on the device,
 * hence mapping all coupled constraints to the same thread block. A value of -1 in
 * the \p splitMap is used to flag that a constraint was not yet added to the \p splitMap.
 *
 * \param[in]     iatoms              The list of constraints.
 * \param[in]     stride              Number of elements per constraint in \p iatoms.
 * \param[in]     atomsAdjacencyList  Information about connections between atoms.
 * \param[out]    splitMap            Map of sequential constraint indexes to the indexes used on the device.
 * \param[in]     c                   Sequential index of the constraint to consider adding.
 * \param[in,out] currentMapIndex     The rolling index for the constraints mapping.
 */
inline void addWithCoupled(ArrayRef<const int> iatoms,
                           ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList,
                           ArrayRef<int> splitMap,
                           int* currentMapIndex)
    if (splitMap[c] == -1)
        splitMap[c] = *currentMapIndex;
        (*currentMapIndex)++;

        // Constraints coupled through both atoms.
        for (int atomIndexInConstraint = 0; atomIndexInConstraint < 2; atomIndexInConstraint++)
            const int a1 = iatoms[stride * c + 1 + atomIndexInConstraint];
            for (const auto& adjacentAtom : atomsAdjacencyList[a1])
                const int c2 = adjacentAtom.indexOfConstraint_;

                    addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c2, currentMapIndex);

/*! \brief Computes and returns how many constraints are coupled to each constraint.
 *
 * Needed to introduce splits in the data so that all coupled constraints will be computed in a
 * single GPU block. Position \p c of the returned vector \p numCoupledConstraints should have the number
 * of constraints that are coupled to constraint \p c and are after \p c in the vector. Only the
 * first index of each connected group of constraints is needed later in the code, hence the
 * \p numCoupledConstraints vector is also used to keep track of whether a constraint was already counted.
 */
static std::vector<int> countNumCoupledConstraints(ArrayRef<const int> iatoms,
                                                   ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
    const int stride         = 1 + NRAL(F_CONSTR);
    const int numConstraints = iatoms.ssize() / stride;

    std::vector<int> numCoupledConstraints(numConstraints, -1);
    for (int c = 0; c < numConstraints; c++)
        const int a1 = iatoms[stride * c + 1];
        const int a2 = iatoms[stride * c + 2];
        if (numCoupledConstraints[c] == -1)
            numCoupledConstraints[c] = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
                                       + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);

    return numCoupledConstraints;

bool LincsGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop)
    for (const gmx_moltype_t& molType : mtop.moltype)
        ArrayRef<const int> iatoms    = molType.ilist[F_CONSTR].iatoms;
        const auto atomsAdjacencyList = constructAtomsAdjacencyList(molType.atoms.nr, iatoms);
        // Compute how many constraints are coupled to each constraint
        const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
        for (const int numCoupled : numCoupledConstraints)
            if (numCoupled > c_threadsPerBlock)

void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const real* invmass)
    // List of constrained atoms (CPU memory)
    std::vector<int2> constraintsHost;
    // Equilibrium distances for the constraints (CPU)
    std::vector<float> constraintsTargetLengthsHost;
    // Number of constraints coupled to the current one (CPU)
    std::vector<int> coupledConstraintsCountsHost;
    // List of constraints coupled to the current one (CPU)
    std::vector<int> coupledConstraintsIndicesHost;
    // Mass factors (CPU)
    std::vector<float> massFactorsHost;

    // List of constrained atoms in the local topology
    ArrayRef<const int> iatoms = idef.il[F_CONSTR].iatoms;

    const int stride         = NRAL(F_CONSTR) + 1;
    const int numConstraints = idef.il[F_CONSTR].size() / stride;

    // Early exit if no constraints
    if (numConstraints == 0)
        kernelParams_.numConstraintsThreads = 0;

    // Construct the adjacency list, a useful intermediate structure
    const auto atomsAdjacencyList = constructAtomsAdjacencyList(numAtoms, iatoms);

    // Compute how many constraints are coupled to each constraint
    const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);

    // Map of splits in the constraints data. For each 'old' constraint index it gives the 'new' one,
    // which takes into account the empty spaces that might be needed at the end of each thread block.
    std::vector<int> splitMap(numConstraints, -1);
    int              currentMapIndex = 0;
    for (int c = 0; c < numConstraints; c++)
        // Check if the coupled constraints all fit in one block
        if (numCoupledConstraints.at(c) > c_threadsPerBlock)
                      "Maximum number of coupled constraints (%d) exceeds the size of the CUDA "
                      "thread block (%d). Most likely, you are trying to use the GPU version of "
                      "LINCS with constraints on all-bonds, which is not supported for large "
                      "molecules. When compatible with the force field and integration settings, "
                      "consider using constraints on H-bonds only.",
                      numCoupledConstraints.at(c), c_threadsPerBlock);
        if (currentMapIndex / c_threadsPerBlock
            != (currentMapIndex + numCoupledConstraints.at(c)) / c_threadsPerBlock)
            currentMapIndex = ((currentMapIndex / c_threadsPerBlock) + 1) * c_threadsPerBlock;
        addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c, &currentMapIndex);

    kernelParams_.numConstraintsThreads =
            currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
    GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0,
                       "Number of threads should be a multiple of the block size");

    // Initialize the constraints and their target lengths, taking into account the splits in the data arrays.
    constraintsHost.resize(kernelParams_.numConstraintsThreads, pair);
    std::fill(constraintsHost.begin(), constraintsHost.end(), pair);
    constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0);
    std::fill(constraintsTargetLengthsHost.begin(), constraintsTargetLengthsHost.end(), 0.0);
    for (int c = 0; c < numConstraints; c++)
        int a1   = iatoms[stride * c + 1];
        int a2   = iatoms[stride * c + 2];
        int type = iatoms[stride * c];

        constraintsHost.at(splitMap.at(c))              = pair;
        constraintsTargetLengthsHost.at(splitMap.at(c)) = idef.iparams[type].constr.dA;

    // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint).
    // We map a single thread to a single constraint, hence each thread 'c' will be using one
    // element from the coupledConstraintsCountsHost array, which is the number of constraints coupled
    // to the constraint 'c'. The indexes of the coupled constraints are placed into the
    // coupledConstraintsIndicesHost array. The latter is organized as a one-dimensional array to ensure
    // good memory alignment. It is addressed as [c + i*numConstraintsThreads], where 'i' goes from
    // zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of the
    // array --- a number greater than the total number of constraints, taking into account the splits
    // in the constraints array due to the GPU block borders. This number can be adjusted to improve
    // the memory access pattern. Mass factors are saved in a similar data structure.
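
    // Illustrative layout (hypothetical sizes): with numConstraintsThreads = 512 and
    // maxCoupledConstraints = 3, coupledConstraintsIndicesHost holds 3 * 512 entries, and the i-th
    // constraint coupled to constraint 'c' is stored at [c + i * 512]. Consecutive threads in the
    // kernel therefore read consecutive elements for each 'i', which favours coalesced global
    // memory accesses.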
    int  maxCoupledConstraints             = 0;
    bool maxCoupledConstraintsHasIncreased = false;
    for (int c = 0; c < numConstraints; c++)
        int a1 = iatoms[stride * c + 1];
        int a2 = iatoms[stride * c + 2];

        // Constraint 'c' is counted twice, but it should be excluded altogether. Hence '-2'.
        int nCoupledConstraints = atomsAdjacencyList.at(a1).size() + atomsAdjacencyList.at(a2).size() - 2;

        if (nCoupledConstraints > maxCoupledConstraints)
            maxCoupledConstraints             = nCoupledConstraints;
            maxCoupledConstraintsHasIncreased = true;

    coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0);
    coupledConstraintsIndicesHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
    massFactorsHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);

    for (int c1 = 0; c1 < numConstraints; c1++)
        coupledConstraintsCountsHost.at(splitMap.at(c1)) = 0;
        int c1a1                                         = iatoms[stride * c1 + 1];
        int c1a2                                         = iatoms[stride * c1 + 2];

        // Constraints coupled through the first atom.
        for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a1])
            int c2 = atomAdjacencyList.indexOfConstraint_;

                int c2a2 = atomAdjacencyList.indexOfSecondConstrainedAtom_;
                int sign = atomAdjacencyList.signFactor_;
                int index = kernelParams_.numConstraintsThreads
                                    * coupledConstraintsCountsHost.at(splitMap.at(c1))

                coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);

                float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
                float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);

                massFactorsHost.at(index) = -sign * invmass[center] * sqrtmu1 * sqrtmu2;

                coupledConstraintsCountsHost.at(splitMap.at(c1))++;

        // Constraints coupled through the second atom.
        for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a2])
            int c2 = atomAdjacencyList.indexOfConstraint_;

                int c2a2 = atomAdjacencyList.indexOfSecondConstrainedAtom_;
                int sign = atomAdjacencyList.signFactor_;
                int index = kernelParams_.numConstraintsThreads
                                    * coupledConstraintsCountsHost.at(splitMap.at(c1))

                coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);

                float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
                float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);

                massFactorsHost.at(index) = sign * invmass[center] * sqrtmu1 * sqrtmu2;

                coupledConstraintsCountsHost.at(splitMap.at(c1))++;
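
                // Sketch of the mass factor above (in the notation of this loop): for constraint
                // c1 = (c1a1, c1a2) coupled to constraint c2 through the shared atom 'center',
                //     massFactor = +/- invmass[center] / (sqrt(invmass[c1a1] + invmass[c1a2]) * sqrt(invmass[c2a1] + invmass[c2a2]))
                // where the sign comes from signFactor_ and encodes the relative order of the atoms.
                // These are the coupling coefficients that the kernel multiplies by the direction
                // cosines of the two constraints when it builds matrix A.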

    // (Re)allocate the memory if the number of constraints has increased.
    if ((kernelParams_.numConstraintsThreads > numConstraintsThreadsAlloc_) || maxCoupledConstraintsHasIncreased)
        // Free the memory if it was allocated before (i.e. if this is not the first time here).
        if (numConstraintsThreadsAlloc_ > 0)
            freeDeviceBuffer(&kernelParams_.d_constraints);
            freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);

            freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
            freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
            freeDeviceBuffer(&kernelParams_.d_massFactors);
            freeDeviceBuffer(&kernelParams_.d_matrixA);

        numConstraintsThreadsAlloc_ = kernelParams_.numConstraintsThreads;

        allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads,
        allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
                             kernelParams_.numConstraintsThreads, deviceContext_);

        allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
                             kernelParams_.numConstraintsThreads, deviceContext_);
        allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
                             maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_);
        allocateDeviceBuffer(&kernelParams_.d_massFactors,
                             maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_);
        allocateDeviceBuffer(&kernelParams_.d_matrixA,
                             maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_);

    // (Re)allocate the memory if the number of atoms has increased.
    if (numAtoms > numAtomsAlloc_)
        if (numAtomsAlloc_ > 0)
            freeDeviceBuffer(&kernelParams_.d_inverseMasses);
        numAtomsAlloc_ = numAtoms;
        allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, deviceContext_);

    copyToDeviceBuffer(&kernelParams_.d_constraints, constraintsHost.data(), 0,
                       kernelParams_.numConstraintsThreads, deviceStream_, GpuApiCallBehavior::Sync,
    copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
                       constraintsTargetLengthsHost.data(), 0, kernelParams_.numConstraintsThreads,
                       deviceStream_, GpuApiCallBehavior::Sync, nullptr);
    copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
                       coupledConstraintsCountsHost.data(), 0, kernelParams_.numConstraintsThreads,
                       deviceStream_, GpuApiCallBehavior::Sync, nullptr);
    copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, coupledConstraintsIndicesHost.data(),
                       0, maxCoupledConstraints * kernelParams_.numConstraintsThreads,
                       deviceStream_, GpuApiCallBehavior::Sync, nullptr);
    copyToDeviceBuffer(&kernelParams_.d_massFactors, massFactorsHost.data(), 0,
                       maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceStream_,
                       GpuApiCallBehavior::Sync, nullptr);

    GMX_RELEASE_ASSERT(invmass != nullptr, "Masses of atoms should be specified.\n");
    copyToDeviceBuffer(&kernelParams_.d_inverseMasses, invmass, 0, numAtoms, deviceStream_,
                       GpuApiCallBehavior::Sync, nullptr);