/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief
 *  CUDA non-bonded prune-only kernel.
 *
 *  Unlike the non-bonded interaction kernels, this is not preprocessor-generated;
 *  the two flavors are achieved by templating instead.
 *
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Berk Hess <hess@kth.se>
 *  \ingroup module_nbnxm
 */
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
#include "gromacs/math/utilities.h"
#include "gromacs/pbcutil/ishift.h"

#include "nbnxm_cuda_kernel_utils.cuh"
#include "nbnxm_cuda_types.h"
/* Note that floating-point constants in CUDA code should be suffixed
 * with f (e.g. 0.5f), to stop the compiler producing intermediate
 * code that is in double precision.
 */
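/* Illustrative example of the above (not part of this kernel): with a float x,
 *     float slow = 0.5 * x;  // 0.5 is a double literal, so the multiply is done in double
 *     float fast = 0.5f * x; // stays entirely in single precision
 */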
/*! \brief Compute capability dependent definition of kernel launch configuration parameters.
 *
 * Kernel launch bounds for different compute capabilities. The value of NTHREAD_Z
 * represents the j-concurrency, hence it determines the number of threads per block.
 * It is chosen such that 100% occupancy is maintained (on Maxwell and later this holds
 * for any NTHREAD_Z; Kepler requires >= 4 warps/block, i.e. NTHREAD_Z >= 2).
 *
 * Hence, values NTHREAD_Z >= 2 trade inter- for intra-block parallelism, which has
 * the advantage of lowering the overhead of starting up a block, filling shmem
 * and registers, etc. Ideally we'd want to expose as much intra-block work as possible.
 * However, as we also split lists to cater for the block-parallelization needed by the
 * register-limited non-bonded kernels, for very short j-loops a large NTHREAD_Z causes
 * slowdown as it leads to intra-block warp imbalance. Ideally, we'd want to auto-tune
 * the choice of NTHREAD_Z, but for now we instead pick a reasonable tradeoff value.
 *
 * Note that given the above input-size tradeoffs, and that performance also depends on
 * additional factors including the GPU architecture and the number of SMs, we accept the
 * performance tradeoffs of using a fixed NTHREAD_Z=4. The following outliers have been observed:
 *   - up to 25% faster (rolling) prune kernels with NTHREAD_Z=8 in the regime where lists
 *     are not split (much), but the rolling chunks are small;
 *   - with large inputs NTHREAD_Z=1 is 2-3% faster (on CC >= 5.0).
 */
#define NTHREAD_Z (GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY)
#define THREADS_PER_BLOCK (c_clSize * c_clSize * NTHREAD_Z)
// we want 100% occupancy, so max threads/block
#define MIN_BLOCKS_PER_MP (GMX_CUDA_MAX_THREADS_PER_MP / THREADS_PER_BLOCK)
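/* Illustrative numbers only (assuming c_clSize == 8 and GMX_CUDA_MAX_THREADS_PER_MP == 2048
 * from the headers included above, and the default NTHREAD_Z == 4; other configurations differ):
 *     THREADS_PER_BLOCK = 8 * 8 * 4  = 256 threads/block (8 warps)
 *     MIN_BLOCKS_PER_MP = 2048 / 256 = 8 blocks/SM
 * i.e. the launch bounds ask the compiler to keep register/shmem use low enough for 8
 * resident blocks per SM, which is what "100% occupancy" above refers to.
 */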
/*! \brief Nonbonded list pruning kernel.
 *
 *  The \p haveFreshList template parameter defines the two flavors of the kernel; when
 *  true a new list from immediately after pair-list generation is pruned using rlistOuter,
 *  the pruned masks are stored in a separate buffer and the outer-list is pruned
 *  using the rlistInner distance; when false only the pruning with rlistInner is performed.
 *
 *  Kernel launch parameters:
 *   - #blocks   = #pair lists, blockId = pair list Id
 *   - #threads  = NTHREAD_Z * c_clSize^2
 *   - shmem     = see nbnxn_cuda.cu:calc_shmem_required_prune()
 *
 *  Each thread calculates an i-j atom distance; warp-wide votes on these distances
 *  decide whether a cluster pair is kept in the pruned list.
 */
template<bool haveFreshList>
__launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) __global__
        void nbnxn_kernel_prune_cuda(const cu_atomdata_t    atdat,
                                     const NBParamGpu       nbparam,
                                     const Nbnxm::gpu_plist plist,
                                     int                    numParts,
                                     int                    part)
#ifdef FUNCTION_DECLARATION_ONLY
        ; /* Only do function declaration, omit the function body. */
// Add extern declarations so each translation unit understands that
// there will be a definition provided.
extern template __global__ void
nbnxn_kernel_prune_cuda<true>(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
extern template __global__ void
nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int);

#else
{
    /* convenience variables */
    const nbnxn_sci_t* pl_sci    = plist.sci;
    nbnxn_cj4_t*       pl_cj4    = plist.cj4;
    const float4*      xq        = atdat.xq;
    const float3*      shift_vec = atdat.shift_vec;

    float rlistOuter_sq = nbparam.rlistOuter_sq;
    float rlistInner_sq = nbparam.rlistInner_sq;
    /* thread/block/warp id-s */
    unsigned int tidxi = threadIdx.x;
    unsigned int tidxj = threadIdx.y;
#if NTHREAD_Z == 1
    unsigned int tidxz = 0;
#else
    unsigned int tidxz = threadIdx.z;
#endif
    unsigned int bidx = blockIdx.x;
    unsigned int widx = (threadIdx.y * c_clSize) / warp_size; /* warp index */
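    /* Illustrative mapping (assuming c_clSize == 8 and warp_size == 32, as in the headers
     * included above): threads with tidxj 0-3 get widx == 0 and threads with tidxj 4-7 get
     * widx == 1, i.e. each 8x8 i-j thread tile is split into a pair of warps. This matches
     * c_nbnxnGpuClusterpairSplit == 2 used below for the per-warp cj buffers and imask entries. */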
    /*********************************************************************
     * Set up shared memory pointers.
     * sm_nextSlotPtr should always be updated to point to the "next slot",
     * that is past the last point where data has been stored.
     */
    extern __shared__ char sm_dynamicShmem[];
    char*                  sm_nextSlotPtr = sm_dynamicShmem;
    static_assert(sizeof(char) == 1,
                  "The shared memory offset calculation assumes that char is 1 byte");
    /* shmem buffer for i x+q pre-loading */
    float4* xib = (float4*)sm_nextSlotPtr;
    sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xib));

    /* shmem buffer for cj, for each warp separately */
    int* cjs = (int*)(sm_nextSlotPtr);
    /* offset the base pointer so that each warp-pair (z-slice) of the j-concurrent
       execution works on its own chunk of the cjs buffer */
    cjs += tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize;
    sm_nextSlotPtr += (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(*cjs));
    /*********************************************************************/
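    /* Rough size check (illustrative; assumes c_nbnxnGpuNumClusterPerSupercluster == 8,
     * c_clSize == 8, c_nbnxnGpuClusterpairSplit == 2, c_nbnxnGpuJgroupSize == 4 and
     * NTHREAD_Z == 4, the defaults in the included headers):
     *     xib: 8 * 8 * sizeof(float4)    = 1024 bytes
     *     cjs: 4 * 2 * 4 * sizeof(int)   =  128 bytes
     * i.e. about 1152 bytes of dynamic shared memory per block; the authoritative value
     * is what calc_shmem_required_prune() computes on the host. */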
    nbnxn_sci_t nb_sci =
            pl_sci[bidx * numParts + part]; /* my i super-cluster's index = current bidx * numParts + part */
    int sci        = nb_sci.sci;           /* super-cluster index */
    int cij4_start = nb_sci.cj4_ind_start; /* first ...                                   */
    int cij4_end   = nb_sci.cj4_ind_end;   /* ... and last index of the j-cluster groups */
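    /* Illustrative example of the rolling-prune indexing above: with numParts == 2, the
     * launch with part == 0 covers sci entries 0, 2, 4, ... (blocks 0, 1, 2, ...) and the
     * launch with part == 1 covers entries 1, 3, 5, ..., so the whole list is re-pruned
     * over numParts successive invocations. */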
    if (tidxz == 0)
    {
        /* Pre-load i-atom x and q into shared memory */
        int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj;
        int ai = ci * c_clSize + tidxi;

        /* We don't need q, but using float4 in shmem avoids bank conflicts.
           (but it also wastes L2 bandwidth). */
        float4 tmp = xq[ai];
        float4 xi  = tmp + shift_vec[nb_sci.shift];
        xib[tidxj * c_clSize + tidxi] = xi;
    }
    __syncthreads();
    /* loop over the j clusters seen by any of the atoms in the current super-cluster;
     * the loop stride NTHREAD_Z ensures that consecutive warp-pairs are assigned
     * consecutive j4 entries.
     */
    for (int j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
    {
        unsigned int imaskFull, imaskCheck, imaskNew;

        if (haveFreshList)
        {
            /* Read the mask from the list transferred from the CPU */
            imaskFull = pl_cj4[j4].imei[widx].imask;
            /* We attempt to prune all pairs present in the original list */
            imaskCheck = imaskFull;
            imaskNew   = 0;
        }
        else
        {
            /* Read the mask from the "warp-pruned" by rlistOuter mask array */
            imaskFull = plist.imask[j4 * c_nbnxnGpuClusterpairSplit + widx];
            /* Read the old rolling pruned mask, use as a base for new */
            imaskNew = pl_cj4[j4].imei[widx].imask;
            /* We only need to check pairs with different mask */
            imaskCheck = (imaskNew ^ imaskFull);
        }
        if (imaskCheck)
        {
            /* Pre-load cj into shared memory on both warps separately */
            if ((tidxj == 0 || tidxj == 4) && tidxi < c_nbnxnGpuJgroupSize)
            {
                cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize / c_splitClSize] = pl_cj4[j4].cj[tidxi];
            }
            __syncwarp(c_fullWarpMask);
            for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
            {
                if (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
                {
                    unsigned int mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
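                    /* Bit-layout sketch, following from the shifts above and below (assuming
                     * c_nbnxnGpuNumClusterPerSupercluster == 8): bits [8*jm, 8*jm + 7] of an
                     * imask describe j-cluster jm of this cj4 paired with i-clusters 0..7 of
                     * the super-cluster; mask_ji starts at bit 8*jm and is shifted up by one
                     * per i-cluster iteration. */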
                    int cj = cjs[jm + (tidxj & 4) * c_nbnxnGpuJgroupSize / c_splitClSize];
                    int aj = cj * c_clSize + tidxj;

                    /* load j atom data */
                    float4 tmp = xq[aj];
                    float3 xj  = make_float3(tmp.x, tmp.y, tmp.z);
                    for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
                    {
                        if (imaskCheck & mask_ji)
                        {
                            /* load i-cluster coordinates from shmem */
                            float4 xi = xib[i * c_clSize + tidxi];

                            /* distance between i and j atoms */
                            float3 rv = make_float3(xi.x, xi.y, xi.z) - xj;
                            float  r2 = norm2(rv);

                            /* If _none_ of the atom pairs are in rlistOuter
                               range, the bit corresponding to the current
                               cluster-pair in imask gets set to 0. */
                            if (haveFreshList && !__any_sync(c_fullWarpMask, r2 < rlistOuter_sq))
                            {
                                imaskFull &= ~mask_ji;
                            }
                            /* If any atom pair is within range, set the bit
                               corresponding to the current cluster-pair. */
                            if (__any_sync(c_fullWarpMask, r2 < rlistInner_sq))
                            {
                                imaskNew |= mask_ji;
                            }
                        }

                        /* shift the mask bit by 1 */
                        mask_ji += mask_ji;
                    }
                }
            }
            if (haveFreshList)
            {
                /* copy the list pruned to rlistOuter to a separate buffer */
                plist.imask[j4 * c_nbnxnGpuClusterpairSplit + widx] = imaskFull;
            }
            /* update the imask with only the pairs up to rlistInner */
            plist.cj4[j4].imei[widx].imask = imaskNew;
        }
        // avoid shared memory WAR hazards between loop iterations
        __syncwarp(c_fullWarpMask);
    }
}
#endif /* FUNCTION_DECLARATION_ONLY */
#undef MIN_BLOCKS_PER_MP
#undef THREADS_PER_BLOCK
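/* Minimal host-side launch sketch (illustrative only; numSciInPart, shmemSize and stream are
 * placeholder names, and the real launch code in the NBNXM module obtains shmemSize from
 * calc_shmem_required_prune() as noted in the kernel documentation above):
 *
 *     dim3 blockSize(c_clSize, c_clSize, NTHREAD_Z);   // NTHREAD_Z as defined above
 *     dim3 gridSize(numSciInPart, 1, 1);               // one block per sci entry of this part
 *     nbnxn_kernel_prune_cuda<true><<<gridSize, blockSize, shmemSize, stream>>>(
 *             atdat, nbparam, plist, numParts, part);
 *
 * The <false> instantiation is launched the same way for the rolling-prune passes.
 */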