 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief
 *  Utility constants and function declarations for the CUDA non-bonded kernels.
 *  This header should be included once at the top level, just before the
 *  kernels are included (it has to be preceded by nbnxn_cuda_types.h).
 *
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \ingroup module_nbnxm
 */
/* Note that floating-point constants in CUDA code should be suffixed
 * with f (e.g. 0.5f), to stop the compiler producing intermediate
 * code that is in double precision.
 */
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"

#include "nbnxm_cuda_types.h"
#ifndef NBNXM_CUDA_KERNEL_UTILS_CUH
#    define NBNXM_CUDA_KERNEL_UTILS_CUH
/*! \brief Log of the i and j cluster size.
 *  Change this together with c_clSize! */
static const int __device__ c_clSizeLog2 = 3;
/*! \brief Square of cluster size. */
static const int __device__ c_clSizeSq = c_clSize * c_clSize;
/*! \brief j-cluster size after split (4 in the current implementation). */
static const int __device__ c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit;
/*! \brief Stride in the force accumulation buffer */
static const int __device__ c_fbufStride = c_clSizeSq;
/*! \brief i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster (= 8) bits set */
static const unsigned __device__ superClInteractionMask =
        ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);

static const float __device__ c_oneSixth    = 0.16666667f;
static const float __device__ c_oneTwelveth = 0.08333333f;
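
/* Worked example of the cluster-size arithmetic above (values as implied by
 * c_clSizeLog2 = 3 and the "4 in the current implementation" note, i.e. assuming
 * c_clSize = 8 and c_nbnxnGpuClusterpairSplit = 2 from nbnxm_cuda_types.h):
 *
 *   c_clSize      = 1 << c_clSizeLog2 = 8
 *   c_clSizeSq    = 8 * 8             = 64
 *   c_splitClSize = 8 / 2             = 4
 *   c_fbufStride  = 64   (one float per thread of an 8x8 cluster pair, per component)
 *
 * so each warp of 32 threads covers one half (one split) of an 8x8 i-j cluster pair.
 */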
/*! Convert LJ sigma,epsilon parameters to C6,C12. */
static __forceinline__ __device__ void
convert_sigma_epsilon_to_c6_c12(const float sigma, const float epsilon, float* c6, float* c12)
{
    float sigma2, sigma6;

    sigma2 = sigma * sigma;
    sigma6 = sigma2 * sigma2 * sigma2;
    *c6    = epsilon * sigma6;
    *c12   = *c6 * sigma6;
}
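
/* For reference, the textbook Lennard-Jones relations are C6 = 4*eps*sigma^6 and
 * C12 = 4*eps*sigma^12 = C6*sigma^6. The conversion above computes C6 = eps*sigma^6,
 * i.e. it assumes that any prefactor such as the factor 4 is already folded into the
 * epsilon value passed in by the caller.
 */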
/*! Apply force switch, force-only version. */
static __forceinline__ __device__ void
calculate_force_switch_F(const cu_nbparam_t nbparam, float c6, float c12, float inv_r, float r2, float* F_invr)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam.dispersion_shift.c2;
    float disp_shift_V3 = nbparam.dispersion_shift.c3;
    float repu_shift_V2 = nbparam.repulsion_shift.c2;
    float repu_shift_V3 = nbparam.repulsion_shift.c3;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
}
/*! Apply force switch, force + energy version. */
static __forceinline__ __device__ void calculate_force_switch_F_E(const cu_nbparam_t nbparam,
                                                                  float              c6,
                                                                  float              c12,
                                                                  float              inv_r,
                                                                  float              r2,
                                                                  float*             F_invr,
                                                                  float*             E_lj)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam.dispersion_shift.c2;
    float disp_shift_V3 = nbparam.dispersion_shift.c3;
    float repu_shift_V2 = nbparam.repulsion_shift.c2;
    float repu_shift_V3 = nbparam.repulsion_shift.c3;

    float disp_shift_F2 = nbparam.dispersion_shift.c2 / 3;
    float disp_shift_F3 = nbparam.dispersion_shift.c3 / 4;
    float repu_shift_F2 = nbparam.repulsion_shift.c2 / 3;
    float repu_shift_F3 = nbparam.repulsion_shift.c3 / 4;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
    *E_lj += c6 * (disp_shift_F2 + disp_shift_F3 * r_switch) * r_switch * r_switch * r_switch
             - c12 * (repu_shift_F2 + repu_shift_F3 * r_switch) * r_switch * r_switch * r_switch;
}
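
/* Sketch of the force-switch math implemented by the two functions above. With
 * s = r - rvdw_switch clamped to s >= 0, and A = shift.c2, B = shift.c3 of either the
 * dispersion (c6) or repulsion (c12) term, each term contributes
 *
 *   to the force:   -/+ c * (A + B*s) * s^2        (times inv_r, since F/r is stored)
 *   to the energy:  +/- c * (A/3 * s^3 + B/4 * s^4)
 *
 * The 1/3 and 1/4 factors in the F+E variant make the energy term the integral of the
 * force term, so F = -dE/dr is preserved by the switch.
 */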
/*! Apply potential switch, force-only version. */
static __forceinline__ __device__ void
calculate_potential_switch_F(const cu_nbparam_t nbparam, float inv_r, float r2, float* F_invr, float* E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam.vdw_switch.c3;
    float switch_V4 = nbparam.vdw_switch.c4;
    float switch_V5 = nbparam.vdw_switch.c5;
    float switch_F2 = 3 * nbparam.vdw_switch.c3;
    float switch_F3 = 4 * nbparam.vdw_switch.c4;
    float switch_F4 = 5 * nbparam.vdw_switch.c5;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
    if (r_switch > 0.0f)
    {
        sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
        dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

        *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
    }
}
/*! Apply potential switch, force + energy version. */
static __forceinline__ __device__ void
calculate_potential_switch_F_E(const cu_nbparam_t nbparam, float inv_r, float r2, float* F_invr, float* E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam.vdw_switch.c3;
    float switch_V4 = nbparam.vdw_switch.c4;
    float switch_V5 = nbparam.vdw_switch.c5;
    float switch_F2 = 3 * nbparam.vdw_switch.c3;
    float switch_F3 = 4 * nbparam.vdw_switch.c4;
    float switch_F4 = 5 * nbparam.vdw_switch.c5;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    /* Unlike in the F-only kernel, masking is faster here */
    sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
    dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

    *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
    *E_lj *= sw;
}
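
/* Sketch of the potential-switch math above, with x = r - rvdw_switch >= 0:
 *
 *   sw(x)  = 1 + c3*x^3 + c4*x^4 + c5*x^5
 *   dsw(x) = 3*c3*x^2 + 4*c4*x^3 + 5*c5*x^4 = d(sw)/dx
 *
 * The switched pair energy is E*sw and, by the product rule, the switched force is
 * F*sw - E*dsw. Since the kernels accumulate F/r rather than F, the E*dsw term is
 * multiplied by inv_r: F_invr = F_invr*sw - inv_r*E*dsw.
 */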
/*! \brief Fetch C6 grid contribution coefficients and return the product of these.
 *
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
static __forceinline__ __device__ float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam, int typei, int typej)
{
#    if DISABLE_CUDA_TEXTURES
    return LDG(&nbparam.nbfp_comb[2 * typei]) * LDG(&nbparam.nbfp_comb[2 * typej]);
#    else
    return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * typei)
           * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * typej);
#    endif /* DISABLE_CUDA_TEXTURES */
}
/*! Calculate LJ-PME grid force contribution with
 *  geometric combination rule.
 */
static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
                                                                      int                typei,
                                                                      int                typej,
                                                                      float              r2,
                                                                      float              inv_r2,
                                                                      float              lje_coeff2,
                                                                      float              lje_coeff6_6,
                                                                      float*             F_invr)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;

    c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
}
/*! Calculate LJ-PME grid force + energy contribution with
 *  geometric combination rule.
 */
static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
                                                                        int                typei,
                                                                        int                typej,
                                                                        float              r2,
                                                                        float              inv_r2,
                                                                        float              lje_coeff2,
                                                                        float              lje_coeff6_6,
                                                                        float              int_bit,
                                                                        float*             F_invr,
                                                                        float*             E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;

    c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    /* Shift should be applied only to real LJ pairs */
    sh_mask = nbparam.sh_lj_ewald * int_bit;
    *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
}
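
/* The two LJ-PME (geometric rule) helpers above remove the long-range grid part of the
 * dispersion interaction from the real-space term. With b the LJ-Ewald coefficient and
 * poly = 1 + b^2 r^2 + (b^2 r^2)^2 / 2 (cr2/expmcr2/poly in the code), the subtracted
 * force term per pair is proportional to
 *
 *   c6grid * ( 1/r^6 - exp(-b^2 r^2) * (poly/r^6 + b^6/6) ) / r^2
 *
 * and the energy term to c6grid/6 * ( 1/r^6 * (1 - exp(-b^2 r^2)*poly) + shift ),
 * where lje_coeff6_6 is assumed to hold b^6/6 and the shift is masked by int_bit so
 * that it only applies to non-excluded pairs.
 */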
/*! Fetch per-type LJ parameters.
 *
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
static __forceinline__ __device__ float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam, int type)
{
    float2 c6c12;
#    if DISABLE_CUDA_TEXTURES
    /* Force an 8-byte fetch to save a memory instruction. */
    float2* nbfp_comb = (float2*)nbparam.nbfp_comb;
    c6c12             = LDG(&nbfp_comb[type]);
#    else
    /* NOTE: as we always do 8-byte aligned loads, we could
       fetch float2 here too just as above. */
    c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * type);
    c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * type + 1);
#    endif /* DISABLE_CUDA_TEXTURES */

    return c6c12;
}
/*! Calculate LJ-PME grid force + energy contribution (if E_lj != nullptr) with
 *  Lorentz-Berthelot combination rule.
 *  We use a single F+E kernel with conditional because the performance impact
 *  of this is pretty small and LB on the CPU is anyway very slow.
 */
static __forceinline__ __device__ void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
                                                                      int                typei,
                                                                      int                typej,
                                                                      float              r2,
                                                                      float              inv_r2,
                                                                      float              lje_coeff2,
                                                                      float              lje_coeff6_6,
                                                                      float              int_bit,
                                                                      float*             F_invr,
                                                                      float*             E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
    float sigma, sigma2, epsilon;

    /* sigma and epsilon are scaled to give 6*C6 */
    float2 c6c12_i = fetch_nbfp_comb_c6_c12(nbparam, typei);
    float2 c6c12_j = fetch_nbfp_comb_c6_c12(nbparam, typej);

    sigma   = c6c12_i.x + c6c12_j.x;
    epsilon = c6c12_i.y * c6c12_j.y;

    sigma2 = sigma * sigma;
    c6grid = epsilon * sigma2 * sigma2 * sigma2;

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    if (E_lj != nullptr)
    {
        float sh_mask;

        /* Shift should be applied only to real LJ pairs */
        sh_mask = nbparam.sh_lj_ewald * int_bit;
        *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
    }
}
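
/* Note on the combination rule above: textbook Lorentz-Berthelot combines parameters as
 * sigma_ij = (sigma_i + sigma_j)/2 and eps_ij = sqrt(eps_i * eps_j), whereas this helper
 * simply adds the stored x components and multiplies the y components. That is only
 * consistent if the per-type table already stores suitably pre-scaled values (absorbing
 * the 1/2 and the square root) so that eps*sigma^6 yields the intended 6*C6, as the
 * "scaled to give 6*C6" comment indicates; the exact scaling is defined by the table
 * setup in the host code, not by this kernel helper.
 */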
/*! Fetch two consecutive values from the Ewald correction F*r table.
 *
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
static __forceinline__ __device__ float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam, int index)
{
    float2 d;

#    if DISABLE_CUDA_TEXTURES
    /* Can't do 8-byte fetch because some of the addresses will be misaligned. */
    d.x = LDG(&nbparam.coulomb_tab[index]);
    d.y = LDG(&nbparam.coulomb_tab[index + 1]);
#    else
    d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
    d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
#    endif // DISABLE_CUDA_TEXTURES

    return d;
}
/*! Linear interpolation using exactly two FMA operations.
 *
 *  Implements numeric equivalent of: (1-t)*d0 + t*d1
 *  Note that CUDA does not have fnms, otherwise we'd use
 *  fma(t, d1, fnms(t, d0, d0)),
 *  but input modifiers are designed for this and are fast.
 */
template<typename T>
__forceinline__ __host__ __device__ T lerp(T d0, T d1, T t)
{
    return fma(t, d1, fma(-t, d0, d0));
}
/*! Interpolate Ewald coulomb force correction using the F*r table. */
static __forceinline__ __device__ float interpolate_coulomb_force_r(const cu_nbparam_t nbparam, float r)
{
    float normalized = nbparam.coulomb_tab_scale * r;
    int   index      = (int)normalized;
    float fraction   = normalized - index;

    float2 d01 = fetch_coulomb_force_r(nbparam, index);

    return lerp(d01.x, d01.y, fraction);
}
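
/* Worked example of the table lookup above (the numbers are illustrative only): with
 * coulomb_tab_scale = 100 1/nm and r = 0.2537 nm, normalized = 25.37, so index = 25 and
 * fraction = 0.37; the return value is then
 *
 *   lerp(tab[25], tab[26], 0.37) = fma(0.37, tab[26], fma(-0.37, tab[25], tab[25]))
 *                                = 0.63*tab[25] + 0.37*tab[26]
 *
 * i.e. plain linear interpolation between two consecutive F*r table entries, evaluated
 * with the two FMAs of lerp().
 */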
/*! Fetch C6 and C12 from the parameter table.
 *
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12, const cu_nbparam_t nbparam, int baseIndex)
{
#    if DISABLE_CUDA_TEXTURES
    /* Force an 8-byte fetch to save a memory instruction. */
    float2* nbfp = (float2*)nbparam.nbfp;
    float2  c6c12;

    c6c12 = LDG(&nbfp[baseIndex]);
    c6    = c6c12.x;
    c12   = c6c12.y;
#    else
    /* NOTE: as we always do 8-byte aligned loads, we could
       fetch float2 here too just as above. */
    c6  = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * baseIndex);
    c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * baseIndex + 1);
#    endif // DISABLE_CUDA_TEXTURES
}
/*! Calculate analytical Ewald correction term. */
static __forceinline__ __device__ float pmecorrF(float z2)
{
    const float FN6 = -1.7357322914161492954e-8f;
    const float FN5 = 1.4703624142580877519e-6f;
    const float FN4 = -0.000053401640219807709149f;
    const float FN3 = 0.0010054721316683106153f;
    const float FN2 = -0.019278317264888380590f;
    const float FN1 = 0.069670166153766424023f;
    const float FN0 = -0.75225204789749321333f;

    const float FD4 = 0.0011193462567257629232f;
    const float FD3 = 0.014866955030185295499f;
    const float FD2 = 0.11583842382862377919f;
    const float FD1 = 0.50736591960530292870f;
    const float FD0 = 1.0f;

    float z4;
    float polyFN0, polyFN1, polyFD0, polyFD1;

    z4 = z2 * z2;

    polyFD0 = FD4 * z4 + FD2;
    polyFD1 = FD3 * z4 + FD1;
    polyFD0 = polyFD0 * z4 + FD0;
    polyFD0 = polyFD1 * z2 + polyFD0;

    polyFD0 = 1.0f / polyFD0;

    polyFN0 = FN6 * z4 + FN4;
    polyFN1 = FN5 * z4 + FN3;
    polyFN0 = polyFN0 * z4 + FN2;
    polyFN1 = polyFN1 * z4 + FN1;
    polyFN0 = polyFN0 * z4 + FN0;
    polyFN0 = polyFN1 * z2 + polyFN0;

    return polyFN0 * polyFD0;
}
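
/* Evaluation scheme used by pmecorrF() above: both the numerator (FN6..FN0) and the
 * denominator (FD4..FD0) are polynomials in z2, but each is evaluated as two
 * interleaved Horner chains in z4 (even and odd coefficients) that are combined with a
 * final step in z2. This halves the length of each dependency chain and exposes
 * instruction-level parallelism; the returned value is the rational approximation
 * numerator(z2) / denominator(z2) used as the analytical Ewald force-correction factor.
 */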
/*! Final j-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static __forceinline__ __device__ void
reduce_force_j_generic(float* f_buf, float3* fout, int tidxi, int tidxj, int aidx)
{
    if (tidxi < 3)
    {
        float f = 0.0f;
        for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
        {
            f += f_buf[c_fbufStride * tidxi + j];
        }
        atomicAdd((&fout[aidx].x) + tidxi, f);
    }
}
/*! Final j-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__ void
reduce_force_j_warp_shfl(float3 f, float3* fout, int tidxi, int aidx, const unsigned int activemask)
{
    f.x += __shfl_down_sync(activemask, f.x, 1);
    f.y += __shfl_up_sync(activemask, f.y, 1);
    f.z += __shfl_down_sync(activemask, f.z, 1);
    if (tidxi & 1)
    {
        f.x = f.y;
    }
    f.x += __shfl_down_sync(activemask, f.x, 2);
    f.z += __shfl_up_sync(activemask, f.z, 2);
    if (tidxi & 2)
    {
        f.x = f.z;
    }
    f.x += __shfl_down_sync(activemask, f.x, 4);
    if (tidxi < 3)
    {
        atomicAdd((&fout[aidx].x) + tidxi, f.x);
    }
}
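
/* How the shuffle reduction above works: the three force components of a j atom are
 * reduced simultaneously over one row of c_clSize (assumed 8) lanes. After the stride-1
 * shuffles, even lanes hold pairwise x sums and odd lanes pairwise y sums, so odd lanes
 * move y into f.x; after the stride-2 shuffles, lanes with (tidxi & 2) pick up z; the
 * final stride-4 shuffle completes the 8-lane sum, leaving the x, y and z totals in the
 * first three lanes of the row (tidxi = 0, 1, 2), which are the only lanes that issue
 * an atomicAdd.
 */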
/*! Final i-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 *  TODO: add the tidxi < 3 trick
 */
static __forceinline__ __device__ void reduce_force_i_generic(
        float* f_buf, float3* fout, float* fshift_buf, bool bCalcFshift, int tidxi, int tidxj, int aidx)
{
    if (tidxj < 3)
    {
        float f = 0.0f;
        for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
        {
            f += f_buf[tidxj * c_fbufStride + j];
        }
        atomicAdd(&fout[aidx].x + tidxj, f);
        if (bCalcFshift)
        {
            *fshift_buf += f;
        }
    }
}
/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__ void reduce_force_i_pow2(
        volatile float* f_buf, float3* fout, float* fshift_buf, bool bCalcFshift, int tidxi, int tidxj, int aidx)
{
    int   i, j;
    float f;

    assert(c_clSize == 1 << c_clSizeLog2);

    /* Reduce the initial c_clSize values for each i atom to half
     * every step by using c_clSize * i threads.
     * Can't just use i as loop variable because then nvcc refuses to unroll.
     */
    i = c_clSize / 2;
    for (j = c_clSizeLog2 - 1; j > 0; j--)
    {
        if (tidxj < i)
        {
            f_buf[tidxj * c_clSize + tidxi] += f_buf[(tidxj + i) * c_clSize + tidxi];
            f_buf[c_fbufStride + tidxj * c_clSize + tidxi] +=
                    f_buf[c_fbufStride + (tidxj + i) * c_clSize + tidxi];
            f_buf[2 * c_fbufStride + tidxj * c_clSize + tidxi] +=
                    f_buf[2 * c_fbufStride + (tidxj + i) * c_clSize + tidxi];
        }
        i >>= 1;
    }

    /* i == 1, last reduction step, writing to global mem */
    if (tidxj < 3)
    {
        /* tidxj*c_fbufStride selects x, y or z */
        f = f_buf[tidxj * c_fbufStride + tidxi] + f_buf[tidxj * c_fbufStride + i * c_clSize + tidxi];
        atomicAdd(&(fout[aidx].x) + tidxj, f);
        if (bCalcFshift)
        {
            *fshift_buf += f;
        }
    }
}
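
/* Shape of the shared-memory tree reduction above, assuming c_clSize = 8: each of the
 * three components occupies c_fbufStride = 64 floats of f_buf, and the 8 partial forces
 * per i atom are halved every loop iteration (8 -> 4 -> 2); the remaining pair is summed
 * in the final step, where threads with tidxj = 0, 1, 2 write the x, y and z totals with
 * one atomicAdd each. The same halving pattern is reused for the energy reduction
 * further below.
 */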
/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 *  on whether the size of the array to be reduced is power of two or not.
 */
static __forceinline__ __device__ void
reduce_force_i(float* f_buf, float3* f, float* fshift_buf, bool bCalcFshift, int tidxi, int tidxj, int ai)
{
    if ((c_clSize & (c_clSize - 1)))
    {
        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
    else
    {
        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
}
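
/* The dispatch test above relies on the usual bit trick: (n & (n - 1)) is zero exactly
 * when n is a power of two (e.g. 8 & 7 = 0, while 6 & 5 = 4), so with the current
 * c_clSize = 8 the pow2 path is always taken and the generic path is only a fallback.
 */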
/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__ void reduce_force_i_warp_shfl(float3             fin,
                                                                float3*            fout,
                                                                float*             fshift_buf,
                                                                bool               bCalcFshift,
                                                                int                tidxj,
                                                                int                aidx,
                                                                const unsigned int activemask)
{
    fin.x += __shfl_down_sync(activemask, fin.x, c_clSize);
    fin.y += __shfl_up_sync(activemask, fin.y, c_clSize);
    fin.z += __shfl_down_sync(activemask, fin.z, c_clSize);
    if (tidxj & 1)
    {
        fin.x = fin.y;
    }
    fin.x += __shfl_down_sync(activemask, fin.x, 2 * c_clSize);
    fin.z += __shfl_up_sync(activemask, fin.z, 2 * c_clSize);
    if (tidxj & 2)
    {
        fin.x = fin.z;
    }
    /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
    if ((tidxj & 3) < 3)
    {
        atomicAdd(&fout[aidx].x + (tidxj & 3), fin.x);
        if (bCalcFshift)
        {
            *fshift_buf += fin.x;
        }
    }
}
/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__ void
reduce_energy_pow2(volatile float* buf, float* e_lj, float* e_el, unsigned int tidx)
{
    float e1, e2;

    unsigned int i = warp_size / 2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
    for (int j = warp_size_log2 - 1; j > 0; j--)
    {
        if (tidx < i)
        {
            buf[tidx] += buf[tidx + i];
            buf[c_fbufStride + tidx] += buf[c_fbufStride + tidx + i];
        }
        i >>= 1;
    }

    // TODO do this on two threads - requires e_lj and e_el to be stored on adjacent
    // memory locations to make sense
    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        e1 = buf[tidx] + buf[tidx + i];
        e2 = buf[c_fbufStride + tidx] + buf[c_fbufStride + tidx + i];

        atomicAdd(e_lj, e1);
        atomicAdd(e_el, e2);
    }
}
/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__ void
reduce_energy_warp_shfl(float E_lj, float E_el, float* e_lj, float* e_el, int tidx, const unsigned int activemask)
{
    int i, sh;

    sh = 1;
    for (i = 0; i < 5; i++)
    {
        E_lj += __shfl_down_sync(activemask, E_lj, sh);
        E_el += __shfl_down_sync(activemask, E_el, sh);
        sh += sh;
    }

    /* The first thread in the warp writes the reduced energies */
    if (tidx == 0 || tidx == warp_size)
    {
        atomicAdd(e_lj, E_lj);
        atomicAdd(e_el, E_el);
    }
}
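
/* In the warp-level energy reduction above, sh doubles through 1, 2, 4, 8, 16, so the
 * five __shfl_down_sync steps fold all 32 lanes of a warp into its first lane. With the
 * assumed 64-thread (two-warp) cluster-pair layout, those first lanes are tidx == 0 and
 * tidx == warp_size, which is why exactly those two threads perform the atomicAdd of
 * the LJ and electrostatic totals to global memory.
 */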
#endif /* NBNXM_CUDA_KERNEL_UTILS_CUH */