/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/gpu_utils/device_utils.clh"
#include "gromacs/gpu_utils/vectype_ops.clh"
#include "gromacs/nbnxm/constants.h"
#include "gromacs/pbcutil/ishift.h"

#include "nbnxm_ocl_consts.h"
#define CL_SIZE         (NBNXN_GPU_CLUSTER_SIZE)
#define NCL_PER_SUPERCL c_nbnxnGpuNumClusterPerSupercluster

#define WARP_SIZE (CL_SIZE*CL_SIZE/2) // Currently only c_nbnxnGpuClusterpairSplit=2 supported
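/* With the default CL_SIZE of 8 this gives WARP_SIZE = 32; with CL_SIZE = 4 it gives 8. */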
#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors.
 * Note that this should precede the kernel_utils include.
 */
#define USE_CJ_PREFETCH 1
#else
#define USE_CJ_PREFETCH 0
#endif
#if defined cl_intel_subgroups || defined cl_khr_subgroups || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
#define HAVE_SUBGROUP 1
#else
#define HAVE_SUBGROUP 0
#endif

#ifdef cl_intel_subgroups
#define HAVE_INTEL_SUBGROUP 1
#else
#define HAVE_INTEL_SUBGROUP 0
#endif

#if defined _INTEL_SOURCE_
#define SUBGROUP_SIZE 8
#elif defined _AMD_SOURCE_
#define SUBGROUP_SIZE 64
#else
#define SUBGROUP_SIZE 32
#endif

#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
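/* Derived switches:
 * - REDUCE_SHUFFLE:       use subgroup shuffles/reductions for the force and energy reductions
 *                         (requires a subgroup that maps exactly onto a warp, i.e. CL_SIZE == 4 on Intel)
 * - USE_SUBGROUP_ANY:     use the native sub_group_any() instead of the local-memory emulation below
 * - USE_SUBGROUP_PRELOAD: preload cj4 into registers with subgroup block reads (Intel only)
 */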
/* 1.0 / sqrt(M_PI) */
#define M_FLOAT_1_SQRTPI 0.564189583547756f
#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
#define NBNXN_OPENCL_KERNEL_UTILS_CLH

#if CL_SIZE == 8
#define WARP_SIZE_LOG2 (5)
#define CL_SIZE_LOG2   (3)
#elif CL_SIZE == 4
#define WARP_SIZE_LOG2 (3)
#define CL_SIZE_LOG2   (2)
#else
#error unsupported CL_SIZE
#endif

#define CL_SIZE_SQ  (CL_SIZE * CL_SIZE)
#define FBUF_STRIDE (CL_SIZE_SQ)

#define ONE_SIXTH_F    0.16666667f
#define ONE_TWELVETH_F 0.08333333f
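/* 1/6 and 1/12 prefactors used by the LJ energy terms (e.g. the ONE_SIXTH_F factor in the
 * LJ-PME grid energy below and the C12/12 - C6/6 form of the LJ energy in the kernels). */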
#ifdef __GNUC__
/* GCC, clang, and some ICC pretending to be GCC */
#  define gmx_unused __attribute__ ((unused))
#else
#  define gmx_unused
#endif
// Data structures shared between OpenCL device code and OpenCL host code
// TODO: review, improve
// Replaced real by float for now, to avoid including any other header
typedef struct
{
    float c2, c3, cpot;
} shift_consts_t;

/* Used with potential switching:
 * rsw   = max(r - r_switch, 0)
 * sw    = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
 * dsw   = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
 * force = force*dsw - potential*sw
 */
typedef struct
{
    float c3, c4, c5;
} switch_consts_t;
// Data structure shared between the OpenCL device code and OpenCL host code
// Must not contain OpenCL objects (buffers)
typedef struct cl_nbparam_params
{
    int             eeltype;          /**< type of electrostatics, takes values from #eelCu */
    int             vdwtype;          /**< type of VdW impl., takes values from #evdwCu */

    float           epsfac;           /**< charge multiplication factor */
    float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
    float           two_k_rf;         /**< Reaction-field electrostatics constant */
    float           ewald_beta;       /**< Ewald/PME parameter */
    float           sh_ewald;         /**< Ewald/PME correction term subtracted from the direct-space potential */
    float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential */
    float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient */

    float           rcoulomb_sq;      /**< Coulomb cut-off squared */

    float           rvdw_sq;          /**< VdW cut-off squared */
    float           rvdw_switch;      /**< VdW switched cut-off */
    float           rlistOuter_sq;    /**< Full, outer pair-list cut-off squared */
    float           rlistInner_sq;    /**< Inner, dynamically pruned pair-list cut-off squared; only needed by the pruning kernels, but for now also passed to the nonbonded kernels */

    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants */
    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants */
    switch_consts_t vdw_switch;       /**< VdW switch constants */

    /* Ewald Coulomb force table data - accessed through texture memory */
    float           coulomb_tab_scale; /**< table scale/spacing */
} cl_nbparam_params_t;

typedef struct nbnxn_sci
{
    int sci;           /* i-super-cluster */
    int shift;         /* Shift vector index plus possible flags */
    int cj4_ind_start; /* Start index into cj4 */
    int cj4_ind_end;   /* End index into cj4 */
} nbnxn_sci_t;

typedef struct
{
    unsigned int imask;    /* The i-cluster interactions mask for 1 warp */
    int          excl_ind; /* Index into the exclusion array for 1 warp */
} nbnxn_im_ei_t;

typedef struct
{
    int           cj[4];   /* The 4 j-clusters */
    nbnxn_im_ei_t imei[2]; /* The i-cluster mask data for 2 warps */
} nbnxn_cj4_t;

typedef struct
{
    unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
                                           * each unsigned has bits for 4*8 i clusters */
} nbnxn_excl_t;
/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
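/* For the default NCL_PER_SUPERCL of 8 this evaluates to 0xff (the lowest 8 bits set). */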
gmx_opencl_inline
void preloadCj4Generic(__local int        *sm_cjPreload,
                       const __global int *gm_cj,
                       int                 tidxi,
                       int                 tidxj,
                       bool gmx_unused     iMaskCond)
{
    /* Pre-load cj into shared memory */
#if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
    if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
    {
        sm_cjPreload[tidxi] = gm_cj[tidxi];
    }
#else
    const int c_clSize                   = CL_SIZE;
    const int c_nbnxnGpuClusterpairSplit = 2;
    const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
    if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
    {
        sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
    }
#endif
}
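/* In the non-AMD branch above each half-warp stores its own copy of the c_nbnxnGpuJgroupSize cj
 * values, so the two halves of a cluster pair can read them independently; loadCjPreload() below
 * picks the copy that belongs to the calling half-warp. */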
#if USE_SUBGROUP_PRELOAD
gmx_opencl_inline
int preloadCj4Subgroup(const __global int *gm_cj)
{
    // loads a subgroup-size number of elements (8) instead of the 4 required
    // equivalent to *cjs = *gm_cj
    return intel_sub_group_block_read((const __global uint *)gm_cj);
}
#endif //USE_SUBGROUP_PRELOAD
#if USE_SUBGROUP_PRELOAD
typedef size_t CjType;
#else
typedef __local int* CjType;
#endif
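/* CjType is what the kernels carry around for the preloaded cj values: a per-thread register
 * value filled by a subgroup block read on the Intel path, or a pointer into local memory
 * otherwise. */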
/*! \brief Preload cj4
 *
 * - For AMD we load once per wavefront of 64 threads (on 4 threads * NTHREAD_Z)
 * - For NVIDIA we load once per warp (on 2x4 threads * NTHREAD_Z)
 * - For Intel (USE_SUBGROUP_PRELOAD) we load into private memory (registers) instead of local memory
 *
 * It is the caller's responsibility to make sure that data is consumed only when
 * it's ready. This function does not call a barrier.
 */
gmx_opencl_inline
void preloadCj4(CjType gmx_unused             *cjs,
                const __global int gmx_unused *gm_cj,
                int gmx_unused                 tidxi,
                int gmx_unused                 tidxj,
                bool gmx_unused                iMaskCond)
{
#if USE_SUBGROUP_PRELOAD
    *cjs = preloadCj4Subgroup(gm_cj);
#elif USE_CJ_PREFETCH
    preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
#endif
}
gmx_opencl_inline
int loadCjPreload(__local int   *sm_cjPreload,
                  int            jm,
                  int gmx_unused tidxi,
                  int gmx_unused tidxj)
{
#if defined _AMD_SOURCE_
    int warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
#else
    const int c_clSize                   = CL_SIZE;
    const int c_nbnxnGpuClusterpairSplit = 2;
    const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
    int       warpLoadOffset             = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
#endif
    return sm_cjPreload[jm + warpLoadOffset];
}
/*! \brief Load a cj given a jm index.
 *
 * If cj4 preloading is enabled, it loads from local memory, otherwise from global memory.
 */
gmx_opencl_inline
int loadCj(CjType cjs, const __global int gmx_unused *gm_cj,
           int jm, int gmx_unused tidxi, int gmx_unused tidxj)
{
#if USE_SUBGROUP_PRELOAD
    return sub_group_broadcast(cjs, jm);
#elif USE_CJ_PREFETCH
    return loadCjPreload(cjs, jm, tidxi, tidxj);
#else
    return gm_cj[jm];
#endif
}
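/* Intended calling pattern (illustrative): the kernels call preloadCj4() once per cj4 entry and
 * then call loadCj() for each of the c_nbnxnGpuJgroupSize j-clusters inside the inner loop. */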
/*! Convert LJ sigma,epsilon parameters to C6,C12. */
gmx_opencl_inline
void convert_sigma_epsilon_to_c6_c12(const float sigma, const float epsilon,
                                     float *c6, float *c12)
{
    float sigma2 = sigma * sigma;
    float sigma6 = sigma2 * sigma2 * sigma2;

    *c6  = epsilon * sigma6;
    *c12 = *c6 * sigma6;
}
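/* i.e. C6 = epsilon*sigma^6 and C12 = epsilon*sigma^12 = C6*sigma^6 */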
/*! Apply force switch, force-only version. */
gmx_opencl_inline
void calculate_force_switch_F(const cl_nbparam_params_t *nbparam,
                              float c6, float c12, float inv_r, float r2,
                              float *F_invr)
{
    /* force switch constants */
    float disp_shift_V2 = nbparam->dispersion_shift.c2;
    float disp_shift_V3 = nbparam->dispersion_shift.c3;
    float repu_shift_V2 = nbparam->repulsion_shift.c2;
    float repu_shift_V3 = nbparam->repulsion_shift.c3;

    float r        = r2 * inv_r;
    float r_switch = r - nbparam->rvdw_switch;
    r_switch       = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr +=
        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
}
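/* Both force-switch variants add the correction
 *   F += (-c6*(c2_d + c3_d*rsw) + c12*(c2_r + c3_r*rsw)) * rsw^2 / r,   rsw = max(r - rvdw_switch, 0),
 * with c2/c3 taken from the dispersion/repulsion shift constants in cl_nbparam_params_t. */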
/*! Apply force switch, force + energy version. */
gmx_opencl_inline
void calculate_force_switch_F_E(const cl_nbparam_params_t *nbparam,
                                float c6, float c12, float inv_r, float r2,
                                float *F_invr, float *E_lj)
{
    /* force switch constants */
    float disp_shift_V2 = nbparam->dispersion_shift.c2;
    float disp_shift_V3 = nbparam->dispersion_shift.c3;
    float repu_shift_V2 = nbparam->repulsion_shift.c2;
    float repu_shift_V3 = nbparam->repulsion_shift.c3;

    float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
    float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
    float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
    float repu_shift_F3 = nbparam->repulsion_shift.c3/4;

    float r        = r2 * inv_r;
    float r_switch = r - nbparam->rvdw_switch;
    r_switch       = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr +=
        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
    *E_lj   +=
        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
}
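/* The energy terms above are the integral of the force-switch correction over rsw, which is
 * where the c2/3 and c3/4 prefactors come from. */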
/*! Apply potential switch, force-only version. */
gmx_opencl_inline
void calculate_potential_switch_F(const cl_nbparam_params_t *nbparam,
                                  float inv_r, float r2,
                                  float *F_invr, const float *E_lj)
{
    /* potential switch constants */
    float switch_V3 = nbparam->vdw_switch.c3;
    float switch_V4 = nbparam->vdw_switch.c4;
    float switch_V5 = nbparam->vdw_switch.c5;
    float switch_F2 = 3*nbparam->vdw_switch.c3;
    float switch_F3 = 4*nbparam->vdw_switch.c4;
    float switch_F4 = 5*nbparam->vdw_switch.c5;

    float r        = r2 * inv_r;
    float r_switch = r - nbparam->rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
    if (r_switch > 0.0f)
    {
        float sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
        float dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;

        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
    }
}
/*! Apply potential switch, force + energy version. */
gmx_opencl_inline
void calculate_potential_switch_F_E(const cl_nbparam_params_t *nbparam,
                                    float inv_r, float r2,
                                    float *F_invr, float *E_lj)
{
    /* potential switch constants */
    float switch_V3 = nbparam->vdw_switch.c3;
    float switch_V4 = nbparam->vdw_switch.c4;
    float switch_V5 = nbparam->vdw_switch.c5;
    float switch_F2 = 3*nbparam->vdw_switch.c3;
    float switch_F3 = 4*nbparam->vdw_switch.c4;
    float switch_F4 = 5*nbparam->vdw_switch.c5;

    float r        = r2 * inv_r;
    float r_switch = r - nbparam->rvdw_switch;
    r_switch       = r_switch >= 0.0f ? r_switch : 0.0f;

    /* Unlike in the F-only kernel, masking is faster here */
    float sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
    float dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;

    *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
    *E_lj  *= sw;
}
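/* In both potential-switch variants sw and dsw are the switching function and its derivative
 * from the switch_consts_t description at the top of this file; the force update is the product
 * rule applied to V(r)*sw(r). */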
/*! Calculate LJ-PME grid force contribution with
 *  geometric combination rule.
 */
gmx_opencl_inline
void calculate_lj_ewald_comb_geom_F(__constant const float *nbfp_comb_climg2d,
                                    int typei, int typej,
                                    float r2, float inv_r2,
                                    float lje_coeff2, float lje_coeff6_6,
                                    float *F_invr)
{
    float c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];

    /* Recalculate inv_r6 without exclusion mask */
    float inv_r6_nm = inv_r2*inv_r2*inv_r2;
    float cr2       = lje_coeff2*r2;
    float expmcr2   = exp(-cr2);
    float poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
}
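/* expmcr2*poly = exp(-x)*(1 + x + x^2/2) with x = (lje_coeff*r)^2 is the damping factor of the
 * real-space LJ-PME grid correction (lje_coeff6_6 is, as the name suggests, lje_coeff^6/6). */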
/*! Calculate LJ-PME grid force + energy contribution with
 *  geometric combination rule.
 */
gmx_opencl_inline
void calculate_lj_ewald_comb_geom_F_E(__constant const float    *nbfp_comb_climg2d,
                                      const cl_nbparam_params_t *nbparam,
                                      int typei, int typej,
                                      float r2, float inv_r2,
                                      float lje_coeff2, float lje_coeff6_6,
                                      float int_bit,
                                      float *F_invr, float *E_lj)
{
    float c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];

    /* Recalculate inv_r6 without exclusion mask */
    float inv_r6_nm = inv_r2*inv_r2*inv_r2;
    float cr2       = lje_coeff2*r2;
    float expmcr2   = exp(-cr2);
    float poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;

    /* Shift should be applied only to real LJ pairs */
    float sh_mask = nbparam->sh_lj_ewald*int_bit;
    *E_lj        += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
}
/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 *  Lorentz-Berthelot combination rule.
 *  We use a single F+E kernel with conditional because the performance impact
 *  of this is pretty small and LB on the CPU is anyway very slow.
 */
gmx_opencl_inline
void calculate_lj_ewald_comb_LB_F_E(__constant const float    *nbfp_comb_climg2d,
                                    const cl_nbparam_params_t *nbparam,
                                    int typei, int typej,
                                    float r2, float inv_r2,
                                    float lje_coeff2, float lje_coeff6_6,
                                    float int_bit, bool with_E_lj,
                                    float *F_invr, float *E_lj)
{
    /* sigma and epsilon are scaled to give 6*C6 */
    float sigma   = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
    float epsilon = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];

    float sigma2  = sigma*sigma;
    float c6grid  = epsilon*sigma2*sigma2*sigma2;

    /* Recalculate inv_r6 without exclusion mask */
    float inv_r6_nm = inv_r2*inv_r2*inv_r2;
    float cr2       = lje_coeff2*r2;
    float expmcr2   = exp(-cr2);
    float poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;

    if (with_E_lj)
    {
        /* Shift should be applied only to real LJ pairs */
        float sh_mask = nbparam->sh_lj_ewald*int_bit;
        *E_lj        += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
    }
}
/*! Interpolate Ewald coulomb force using the Coulomb force table (coulomb_tab_climg2d).
 *  Original idea: from the OpenMM project
 */
gmx_opencl_inline float
interpolate_coulomb_force_r(__constant const float *coulomb_tab_climg2d,
                            float r, float scale)
{
    float normalized = scale * r;
    int   index      = (int) normalized;
    float fract2     = normalized - index;
    float fract1     = 1.0f - fract2;

    return fract1*coulomb_tab_climg2d[index] +
           fract2*coulomb_tab_climg2d[index + 1];
}
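/* Plain linear interpolation: with scale = 1/spacing, r maps to table index floor(scale*r) and
 * the two bracketing entries are blended with weights (1 - fract2) and fract2. */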
/*! Calculate analytical Ewald correction term. */
gmx_opencl_inline
float pmecorrF(float z2)
{
    const float FN6 = -1.7357322914161492954e-8f;
    const float FN5 = 1.4703624142580877519e-6f;
    const float FN4 = -0.000053401640219807709149f;
    const float FN3 = 0.0010054721316683106153f;
    const float FN2 = -0.019278317264888380590f;
    const float FN1 = 0.069670166153766424023f;
    const float FN0 = -0.75225204789749321333f;

    const float FD4 = 0.0011193462567257629232f;
    const float FD3 = 0.014866955030185295499f;
    const float FD2 = 0.11583842382862377919f;
    const float FD1 = 0.50736591960530292870f;
    const float FD0 = 1.0f;

    float z4 = z2*z2;

    float polyFN0, polyFN1, polyFD0, polyFD1;

    polyFD0 = FD4*z4 + FD2;
    polyFD1 = FD3*z4 + FD1;
    polyFD0 = polyFD0*z4 + FD0;
    polyFD0 = polyFD1*z2 + polyFD0;

    polyFD0 = 1.0f/polyFD0;

    polyFN0 = FN6*z4 + FN4;
    polyFN1 = FN5*z4 + FN3;
    polyFN0 = polyFN0*z4 + FN2;
    polyFN1 = polyFN1*z4 + FN1;
    polyFN0 = polyFN0*z4 + FN0;
    polyFN0 = polyFN1*z2 + polyFN0;

    return polyFN0*polyFD0;
}
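/* The two Horner-style chains above evaluate a numerator polynomial FN(z2) and a denominator
 * polynomial FD(z2) (split into even/odd parts in z4 = z2^2), so the function returns the
 * rational approximation FN(z2)/FD(z2) used by the analytical Ewald correction kernels. */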
#if REDUCE_SHUFFLE
gmx_opencl_inline
void reduce_force_j_shfl(float3 fin, __global float *fout,
                         int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
{
    /* Only does reduction over 4 elements in cluster. Needs to be changed
     * for CL_SIZE>4. See CUDA code for required code */
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
    fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, 1);
    fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
    if ((tidxi & 1) == 1)
    {
        fin.x = fin.y;
    }
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
    fin.z += intel_sub_group_shuffle_up  (fin.z, fin.z, 2);
    if (tidxi == 2)
    {
        fin.x = fin.z;
    }
    if (tidxi < 3)
    {
        atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
    }
}
#endif

gmx_opencl_inline
void reduce_force_j_generic(__local float *f_buf, float3 fcj_buf, __global float *fout,
                            int tidxi, int tidxj, int aidx)
{
    int tidx = tidxi + tidxj*CL_SIZE;
    f_buf[                  tidx] = fcj_buf.x;
    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;

    /* Split the reduction between the first 3 column threads
       Threads with column id 0 will do the reduction for (float3).x components
       Threads with column id 1 will do the reduction for (float3).y components
       Threads with column id 2 will do the reduction for (float3).z components.
       The reduction is performed for each line tidxj of f_buf. */
    if (tidxi < 3)
    {
        float f = 0.0f;
        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
        {
            f += f_buf[FBUF_STRIDE * tidxi + j];
        }

        atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
    }
}

/*! Final j-force reduction
 */
gmx_opencl_inline
void reduce_force_j(__local float gmx_unused *f_buf, float3 fcj_buf, __global float *fout,
                    int tidxi, int tidxj, int aidx)
{
#if REDUCE_SHUFFLE
    reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
#else
    reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
#endif
}
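/* fcj_buf holds one thread's partial force on a j atom; the reduction sums it over the CL_SIZE
 * threads in the same row (same tidxj), either with subgroup shuffles or via local memory, and
 * accumulates the result into fout with atomic adds. */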
#if REDUCE_SHUFFLE
gmx_opencl_inline
void reduce_force_i_and_shift_shfl(float3* fci_buf, __global float *fout,
                                   bool bCalcFshift, int tidxi, int tidxj,
                                   int sci, int shift, __global float *fshift)
{
    /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
     * for CL_SIZE>4. */
    float2 fshift_buf = 0;
    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
    {
        int    aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
        float3 fin  = fci_buf[ci_offset];
        fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
        fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, CL_SIZE);
        fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
        if ((tidxj & 1) == 1)
        {
            fin.x = fin.y;
        }
        /* Threads 0,1 and 2,3 increment x,y for their warp */
        atomicAdd_g_f(&fout[3*aidx + (tidxj & 1)], fin.x);
        if (bCalcFshift)
        {
            fshift_buf[0] += fin.x;
        }
        /* Threads 0 and 2 increment z for their warp */
        if ((tidxj & 1) == 0)
        {
            atomicAdd_g_f(&fout[3*aidx+2], fin.z);
            if (bCalcFshift)
            {
                fshift_buf[1] += fin.z;
            }
        }
    }
    /* add up local shift forces into global mem */
    if (bCalcFshift)
    {
        //Threads 0,1 and 2,3 update x,y
        atomicAdd_g_f(&(fshift[3 * shift + (tidxj & 1)]), fshift_buf[0]);
        //Threads 0 and 2 update z
        if ((tidxj & 1) == 0)
        {
            atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
        }
    }
}
#endif
/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
gmx_opencl_inline
void reduce_force_i_and_shift_pow2(volatile __local float *f_buf, float3* fci_buf,
                                   __global float *fout,
                                   bool bCalcFshift,
                                   int tidxi, int tidxj,
                                   int sci, int shift, __global float *fshift)
{
    float fshift_buf = 0;
    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
    {
        int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
        int tidx = tidxi + tidxj*CL_SIZE;
        /* store i forces in shmem */
        f_buf[                  tidx] = fci_buf[ci_offset].x;
        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
        barrier(CLK_LOCAL_MEM_FENCE);

        int i, j;
        /* Reduce the initial CL_SIZE values for each i atom to half
         * every step by using CL_SIZE * i threads.
         * Can't just use i as loop variable because then nvcc refuses to unroll.
         */
        i = CL_SIZE/2;
        for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
        {
            if (tidxj < i)
            {
                f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
                f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
                f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
            }
            i >>= 1;
        }

        /* This barrier is needed because:
         * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
         * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
         * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
         */
        barrier(CLK_LOCAL_MEM_FENCE);

        /* i == 1, last reduction step, writing to global mem */
        /* Split the reduction between the first 3 line threads
           Threads with line id 0 will do the reduction for (float3).x components
           Threads with line id 1 will do the reduction for (float3).y components
           Threads with line id 2 will do the reduction for (float3).z components. */
        if (tidxj < 3)
        {
            float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];

            atomicAdd_g_f(&fout[3 * aidx + tidxj], f);

            if (bCalcFshift)
            {
                fshift_buf += f;
            }
        }
    }
    /* add up local shift forces into global mem */
    if (bCalcFshift)
    {
        /* Only threads with tidxj < 3 will update fshift.
           The threads performing the update must be the same as the threads
           storing the reduction result above.
         */
        if (tidxj < 3)
        {
            atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
        }
    }
}
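/* The loop above is a tree reduction in local memory: each step folds the upper i rows onto the
 * lower i rows, leaving rows 0 and 1; the final i == 1 step is done by threads with tidxj < 3
 * (one spatial component each) and fused with the atomic accumulation into global memory. */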
/*! Final i-force reduction
 */
gmx_opencl_inline
void reduce_force_i_and_shift(__local float gmx_unused *f_buf, float3* fci_buf, __global float *f,
                              bool bCalcFshift, int tidxi, int tidxj, int sci,
                              int shift, __global float *fshift)
{
#if REDUCE_SHUFFLE
    reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj,
                                  sci, shift, fshift);
#else
    reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj,
                                  sci, shift, fshift);
#endif
}

#if REDUCE_SHUFFLE
gmx_opencl_inline
void reduce_energy_shfl(float E_lj, float E_el,
                        volatile __global float *e_lj,
                        volatile __global float *e_el,
                        unsigned int tidx)
{
    E_lj = sub_group_reduce_add(E_lj);
    E_el = sub_group_reduce_add(E_el);
    /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
     * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
     * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
     * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
     * (by supporting c_nbnxnGpuClusterpairSplit=1). */
    if (tidx == 0 || tidx == WARP_SIZE)
    {
        atomicAdd_g_f(e_lj, E_lj);
        atomicAdd_g_f(e_el, E_el);
    }
}
#endif
/*! Energy reduction; this implementation works only with power of two
 *  array sizes. */
gmx_opencl_inline
void reduce_energy_pow2(volatile __local float  *buf,
                        volatile __global float *e_lj,
                        volatile __global float *e_el,
                        unsigned int             tidx)
{
    int j;
    unsigned int i = WARP_SIZE/2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
    for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
    {
        if (tidx < i)
        {
            buf[              tidx] += buf[              tidx + i];
            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
        }
        i >>= 1;
    }

    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        float e1 = buf[              tidx] + buf[              tidx + i];
        float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
        atomicAdd_g_f(e_lj, e1);
        atomicAdd_g_f(e_el, e2);
    }
}
gmx_opencl_inline
void reduce_energy(volatile __local float gmx_unused *buf,
                   float E_lj, float E_el,
                   volatile __global float *e_lj,
                   volatile __global float *e_el,
                   unsigned int tidx)
{
#if REDUCE_SHUFFLE
    reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
#else
    /* flush the energies to shmem and reduce them */
    buf[              tidx] = E_lj;
    buf[FBUF_STRIDE + tidx] = E_el;
    reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
#endif
}
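/*! Emulates sub_group_any() when the native subgroup built-in is not used: any work item with a
 *  true predicate raises a shared per-warp flag in local memory, which is then read back and
 *  cleared for the next call. */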
gmx_opencl_inline
bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred)
{
    if (pred)
    {
        warp_any[widx] = 1;
    }
    bool ret = warp_any[widx];
    warp_any[widx] = 0;
    return ret;
}

//! Returns true if the predicate is true for any work item in the warp
gmx_opencl_inline
bool gmx_sub_group_any(volatile __local uint gmx_unused *warp_any, int gmx_unused widx, bool pred)
{
#if USE_SUBGROUP_ANY
    return sub_group_any(pred);
#else
    return gmx_sub_group_any_localmem(warp_any, widx, pred);
#endif
}

#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */