src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief
  38  *  Utility constant and function declaration for the CUDA non-bonded kernels.
  39  *  This header should be included once at the top level, just before the
  40  *  kernels are included (has to be preceded by nbnxn_cuda_types.h).
  41  *
  42  *  \author Szilárd Páll <pall.szilard@gmail.com>
  43  *  \ingroup module_mdlib
  44  */
  45 #include <assert.h>
  46
  47 /* Note that floating-point constants in CUDA code should be suffixed
  48  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  49  * code that is in double precision.
  50  */
  51
  52 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  53 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
  54 #include "gromacs/gpu_utils/vectype_ops.cuh"
  55
  56 #include "nbnxn_cuda_types.h"
  57
  58 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  59 #define NBNXN_CUDA_KERNEL_UTILS_CUH
  60
  61 /* Use texture objects if supported by the target hardware (and in host pass). */
  62 #if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
  63 /* Note: convenience macro, needs to be undef-ed at the end of the file. */
  64 #define USE_TEXOBJ
  65 #endif
  66
  67 /*! \brief Log of the i and j cluster size.
  68  *  change this together with c_clSize !*/
  69 static const int    c_clSizeLog2  = 3;
  70 /*! \brief Square of cluster size. */
  71 static const int    c_clSizeSq    = c_clSize*c_clSize;
  72 /*! \brief j-cluster size after split (4 in the current implementation). */
  73 static const int    c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
  74 /*! \brief Stride in the force accumualation buffer */
  75 static const int    c_fbufStride  = c_clSizeSq;
  76
  77 static const float  c_oneSixth    = 0.16666667f;
  78 static const float  c_oneTwelveth = 0.08333333f;
  79
  80
  81 /*! Convert LJ sigma,epsilon parameters to C6,C12. */
  82 static __forceinline__ __device__
  83 void convert_sigma_epsilon_to_c6_c12(const float  sigma,
  84                                      const float  epsilon,
  85                                      float       *c6,
  86                                      float       *c12)
  87 {
  88     float sigma2, sigma6;
  89
  90     sigma2 = sigma * sigma;
  91     sigma6 = sigma2 *sigma2 * sigma2;
  92     *c6    = epsilon * sigma6;
  93     *c12   = *c6 * sigma6;
  94 }
  95
  96 /*! Apply force switch,  force + energy version. */
  97 static __forceinline__ __device__
  98 void calculate_force_switch_F(const  cu_nbparam_t nbparam,
  99                               float               c6,
 100                               float               c12,
 101                               float               inv_r,
 102                               float               r2,
 103                               float              *F_invr)
 104 {
 105     float r, r_switch;
 106
 107     /* force switch constants */
 108     float disp_shift_V2 = nbparam.dispersion_shift.c2;
 109     float disp_shift_V3 = nbparam.dispersion_shift.c3;
 110     float repu_shift_V2 = nbparam.repulsion_shift.c2;
 111     float repu_shift_V3 = nbparam.repulsion_shift.c3;
 112
 113     r         = r2 * inv_r;
 114     r_switch  = r - nbparam.rvdw_switch;
 115     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 116
 117     *F_invr  +=
 118         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 119         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 120 }
 121
 122 /*! Apply force switch, force-only version. */
 123 static __forceinline__ __device__
 124 void calculate_force_switch_F_E(const  cu_nbparam_t nbparam,
 125                                 float               c6,
 126                                 float               c12,
 127                                 float               inv_r,
 128                                 float               r2,
 129                                 float              *F_invr,
 130                                 float              *E_lj)
 131 {
 132     float r, r_switch;
 133
 134     /* force switch constants */
 135     float disp_shift_V2 = nbparam.dispersion_shift.c2;
 136     float disp_shift_V3 = nbparam.dispersion_shift.c3;
 137     float repu_shift_V2 = nbparam.repulsion_shift.c2;
 138     float repu_shift_V3 = nbparam.repulsion_shift.c3;
 139
 140     float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
 141     float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
 142     float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
 143     float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
 144
 145     r         = r2 * inv_r;
 146     r_switch  = r - nbparam.rvdw_switch;
 147     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 148
 149     *F_invr  +=
 150         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 151         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 152     *E_lj    +=
 153         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 154         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 155 }
 156
 157 /*! Apply potential switch, force-only version. */
 158 static __forceinline__ __device__
 159 void calculate_potential_switch_F(const  cu_nbparam_t nbparam,
 160                                   float               inv_r,
 161                                   float               r2,
 162                                   float              *F_invr,
 163                                   float              *E_lj)
 164 {
 165     float r, r_switch;
 166     float sw, dsw;
 167
 168     /* potential switch constants */
 169     float switch_V3 = nbparam.vdw_switch.c3;
 170     float switch_V4 = nbparam.vdw_switch.c4;
 171     float switch_V5 = nbparam.vdw_switch.c5;
 172     float switch_F2 = 3*nbparam.vdw_switch.c3;
 173     float switch_F3 = 4*nbparam.vdw_switch.c4;
 174     float switch_F4 = 5*nbparam.vdw_switch.c5;
 175
 176     r        = r2 * inv_r;
 177     r_switch = r - nbparam.rvdw_switch;
 178
 179     /* Unlike in the F+E kernel, conditional is faster here */
 180     if (r_switch > 0.0f)
 181     {
 182         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 183         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 184
 185         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 186     }
 187 }
 188
 189 /*! Apply potential switch, force + energy version. */
 190 static __forceinline__ __device__
 191 void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
 192                                     float               inv_r,
 193                                     float               r2,
 194                                     float              *F_invr,
 195                                     float              *E_lj)
 196 {
 197     float r, r_switch;
 198     float sw, dsw;
 199
 200     /* potential switch constants */
 201     float switch_V3 = nbparam.vdw_switch.c3;
 202     float switch_V4 = nbparam.vdw_switch.c4;
 203     float switch_V5 = nbparam.vdw_switch.c5;
 204     float switch_F2 = 3*nbparam.vdw_switch.c3;
 205     float switch_F3 = 4*nbparam.vdw_switch.c4;
 206     float switch_F4 = 5*nbparam.vdw_switch.c5;
 207
 208     r        = r2 * inv_r;
 209     r_switch = r - nbparam.rvdw_switch;
 210     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 211
 212     /* Unlike in the F-only kernel, masking is faster here */
 213     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 214     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 215
 216     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 217     *E_lj   *= sw;
 218 }
 219
 220
 221 /*! \brief Fetch C6 grid contribution coefficients and return the product of these.
 222  *
 223  *  Depending on what is supported, it fetches parameters either
 224  *  using direct load, texture objects, or texrefs.
 225  */
 226 static __forceinline__ __device__
 227 float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam,
 228                                 int                typei,
 229                                 int                typej)
 230 {
 231 #if DISABLE_CUDA_TEXTURES
 232     return LDG(&nbparam.nbfp_comb[2*typei]) * LDG(&nbparam.nbfp_comb[2*typej]);
 233 #else
 234 #ifdef USE_TEXOBJ
 235     return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 236 #else
 237     return tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
 238 #endif /* USE_TEXOBJ */
 239 #endif /* DISABLE_CUDA_TEXTURES */
 240 }
 241
 242
 243 /*! Calculate LJ-PME grid force contribution with
 244  *  geometric combination rule.
 245  */
 246 static __forceinline__ __device__
 247 void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
 248                                     int                typei,
 249                                     int                typej,
 250                                     float              r2,
 251                                     float              inv_r2,
 252                                     float              lje_coeff2,
 253                                     float              lje_coeff6_6,
 254                                     float             *F_invr)
 255 {
 256     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 257
 258     c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
 259
 260     /* Recalculate inv_r6 without exclusion mask */
 261     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 262     cr2       = lje_coeff2*r2;
 263     expmcr2   = expf(-cr2);
 264     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 265
 266     /* Subtract the grid force from the total LJ force */
 267     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 268 }
 269
 270
 271 /*! Calculate LJ-PME grid force + energy contribution with
 272  *  geometric combination rule.
 273  */
 274 static __forceinline__ __device__
 275 void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
 276                                       int                typei,
 277                                       int                typej,
 278                                       float              r2,
 279                                       float              inv_r2,
 280                                       float              lje_coeff2,
 281                                       float              lje_coeff6_6,
 282                                       float              int_bit,
 283                                       float             *F_invr,
 284                                       float             *E_lj)
 285 {
 286     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 287
 288     c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
 289
 290     /* Recalculate inv_r6 without exclusion mask */
 291     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 292     cr2       = lje_coeff2*r2;
 293     expmcr2   = expf(-cr2);
 294     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 295
 296     /* Subtract the grid force from the total LJ force */
 297     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 298
 299     /* Shift should be applied only to real LJ pairs */
 300     sh_mask   = nbparam.sh_lj_ewald*int_bit;
 301     *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 302 }
 303
 304 /*! Fetch per-type LJ parameters.
 305  *
 306  *  Depending on what is supported, it fetches parameters either
 307  *  using direct load, texture objects, or texrefs.
 308  */
 309 static __forceinline__ __device__
 310 float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam,
 311                               int                type)
 312 {
 313     float2 c6c12;
 314 #if DISABLE_CUDA_TEXTURES
 315     /* Force an 8-byte fetch to save a memory instruction. */
 316     float2 *nbfp_comb = (float2 *)nbparam.nbfp_comb;
 317     c6c12 = LDG(&nbfp_comb[type]);
 318 #else
 319     /* NOTE: as we always do 8-byte aligned loads, we could
 320        fetch float2 here too just as above. */
 321 #ifdef USE_TEXOBJ
 322     c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type);
 323     c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type + 1);
 324 #else
 325     c6c12.x = tex1Dfetch(nbfp_comb_texref, 2*type);
 326     c6c12.y = tex1Dfetch(nbfp_comb_texref, 2*type + 1);
 327 #endif /* USE_TEXOBJ */
 328 #endif /* DISABLE_CUDA_TEXTURES */
 329
 330     return c6c12;
 331 }
 332
 333
 334 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 335  *  Lorentz-Berthelot combination rule.
 336  *  We use a single F+E kernel with conditional because the performance impact
 337  *  of this is pretty small and LB on the CPU is anyway very slow.
 338  */
 339 static __forceinline__ __device__
 340 void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
 341                                     int                typei,
 342                                     int                typej,
 343                                     float              r2,
 344                                     float              inv_r2,
 345                                     float              lje_coeff2,
 346                                     float              lje_coeff6_6,
 347                                     float              int_bit,
 348                                     float             *F_invr,
 349                                     float             *E_lj)
 350 {
 351     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 352     float sigma, sigma2, epsilon;
 353
 354     /* sigma and epsilon are scaled to give 6*C6 */
 355     float2 c6c12_i = fetch_nbfp_comb_c6_c12(nbparam, typei);
 356     float2 c6c12_j = fetch_nbfp_comb_c6_c12(nbparam, typej);
 357
 358     sigma   = c6c12_i.x + c6c12_j.x;
 359     epsilon = c6c12_i.y * c6c12_j.y;
 360
 361     sigma2  = sigma*sigma;
 362     c6grid  = epsilon*sigma2*sigma2*sigma2;
 363
 364     /* Recalculate inv_r6 without exclusion mask */
 365     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 366     cr2       = lje_coeff2*r2;
 367     expmcr2   = expf(-cr2);
 368     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 369
 370     /* Subtract the grid force from the total LJ force */
 371     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 372
 373     if (E_lj != NULL)
 374     {
 375         float sh_mask;
 376
 377         /* Shift should be applied only to real LJ pairs */
 378         sh_mask   = nbparam.sh_lj_ewald*int_bit;
 379         *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 380     }
 381 }
 382
 383
 384 /*! Fetch two consecutive values from the Ewald correction F*r table.
 385  *
 386  *  Depending on what is supported, it fetches parameters either
 387  *  using direct load, texture objects, or texrefs.
 388  */
 389 static __forceinline__ __device__
 390 float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam,
 391                              int                index)
 392 {
 393     float2 d;
 394
 395 #if DISABLE_CUDA_TEXTURES
 396     /* Can't do 8-byte fetch because some of the addresses will be misaligned. */
 397     d.x = LDG(&nbparam.coulomb_tab[index]);
 398     d.y = LDG(&nbparam.coulomb_tab[index + 1]);
 399 #else
 400 #ifdef USE_TEXOBJ
 401     d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
 402     d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
 403 #else
 404     d.x =  tex1Dfetch(coulomb_tab_texref, index);
 405     d.y =  tex1Dfetch(coulomb_tab_texref, index + 1);
 406 #endif // USE_TEXOBJ
 407 #endif // DISABLE_CUDA_TEXTURES
 408
 409     return d;
 410 }
 411
 412 /*! Linear interpolation using exactly two FMA operations.
 413  *
 414  *  Implements numeric equivalent of: (1-t)*d0 + t*d1
 415  *  Note that CUDA does not have fnms, otherwise we'd use
 416  *  fma(t, d1, fnms(t, d0, d0)
 417  *  but input modifiers are designed for this and are fast.
 418  */
 419 template <typename T>
 420 __forceinline__ __host__ __device__
 421 T lerp(T d0, T d1, T t)
 422 {
 423     return fma(t, d1, fma(-t, d0, d0));
 424 }
 425
 426 /*! Interpolate Ewald coulomb force correction using the F*r table.
 427  */
 428 static __forceinline__ __device__
 429 float interpolate_coulomb_force_r(const cu_nbparam_t nbparam,
 430                                   float              r)
 431 {
 432     float  normalized = nbparam.coulomb_tab_scale * r;
 433     int    index      = (int) normalized;
 434     float  fraction   = normalized - index;
 435
 436     float2 d01 = fetch_coulomb_force_r(nbparam, index);
 437
 438     return lerp(d01.x, d01.y, fraction);
 439 }
 440
 441 /*! Fetch C6 and C12 from the parameter table.
 442  *
 443  *  Depending on what is supported, it fetches parameters either
 444  *  using direct load, texture objects, or texrefs.
 445  */
 446 static __forceinline__ __device__
 447 void fetch_nbfp_c6_c12(float               &c6,
 448                        float               &c12,
 449                        const cu_nbparam_t   nbparam,
 450                        int                  baseIndex)
 451 {
 452 #if DISABLE_CUDA_TEXTURES
 453     /* Force an 8-byte fetch to save a memory instruction. */
 454     float2 *nbfp = (float2 *)nbparam.nbfp;
 455     float2  c6c12;
 456     c6c12 = LDG(&nbfp[baseIndex]);
 457     c6    = c6c12.x;
 458     c12   = c6c12.y;
 459 #else
 460     /* NOTE: as we always do 8-byte aligned loads, we could
 461        fetch float2 here too just as above. */
 462 #ifdef USE_TEXOBJ
 463     c6  = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex);
 464     c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex + 1);
 465 #else
 466     c6  = tex1Dfetch(nbfp_texref, 2*baseIndex);
 467     c12 = tex1Dfetch(nbfp_texref, 2*baseIndex + 1);
 468 #endif
 469 #endif // DISABLE_CUDA_TEXTURES
 470 }
 471
 472
 473 /*! Calculate analytical Ewald correction term. */
 474 static __forceinline__ __device__
 475 float pmecorrF(float z2)
 476 {
 477     const float FN6 = -1.7357322914161492954e-8f;
 478     const float FN5 = 1.4703624142580877519e-6f;
 479     const float FN4 = -0.000053401640219807709149f;
 480     const float FN3 = 0.0010054721316683106153f;
 481     const float FN2 = -0.019278317264888380590f;
 482     const float FN1 = 0.069670166153766424023f;
 483     const float FN0 = -0.75225204789749321333f;
 484
 485     const float FD4 = 0.0011193462567257629232f;
 486     const float FD3 = 0.014866955030185295499f;
 487     const float FD2 = 0.11583842382862377919f;
 488     const float FD1 = 0.50736591960530292870f;
 489     const float FD0 = 1.0f;
 490
 491     float       z4;
 492     float       polyFN0, polyFN1, polyFD0, polyFD1;
 493
 494     z4          = z2*z2;
 495
 496     polyFD0     = FD4*z4 + FD2;
 497     polyFD1     = FD3*z4 + FD1;
 498     polyFD0     = polyFD0*z4 + FD0;
 499     polyFD0     = polyFD1*z2 + polyFD0;
 500
 501     polyFD0     = 1.0f/polyFD0;
 502
 503     polyFN0     = FN6*z4 + FN4;
 504     polyFN1     = FN5*z4 + FN3;
 505     polyFN0     = polyFN0*z4 + FN2;
 506     polyFN1     = polyFN1*z4 + FN1;
 507     polyFN0     = polyFN0*z4 + FN0;
 508     polyFN0     = polyFN1*z2 + polyFN0;
 509
 510     return polyFN0*polyFD0;
 511 }
 512
 513 /*! Final j-force reduction; this generic implementation works with
 514  *  arbitrary array sizes.
 515  */
 516 static __forceinline__ __device__
 517 void reduce_force_j_generic(float *f_buf, float3 *fout,
 518                             int tidxi, int tidxj, int aidx)
 519 {
 520     if (tidxi < 3)
 521     {
 522         float f = 0.0f;
 523         for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
 524         {
 525             f += f_buf[c_fbufStride * tidxi + j];
 526         }
 527
 528         atomicAdd((&fout[aidx].x)+tidxi, f);
 529     }
 530 }
 531
 532 /*! Final j-force reduction; this implementation only with power of two
 533  *  array sizes and with sm >= 3.0
 534  */
 535 #if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 536 static __forceinline__ __device__
 537 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
 538                               int tidxi, int aidx)
 539 {
 540     f.x += __shfl_down(f.x, 1);
 541     f.y += __shfl_up  (f.y, 1);
 542     f.z += __shfl_down(f.z, 1);
 543
 544     if (tidxi & 1)
 545     {
 546         f.x = f.y;
 547     }
 548
 549     f.x += __shfl_down(f.x, 2);
 550     f.z += __shfl_up  (f.z, 2);
 551
 552     if (tidxi & 2)
 553     {
 554         f.x = f.z;
 555     }
 556
 557     f.x += __shfl_down(f.x, 4);
 558
 559     if (tidxi < 3)
 560     {
 561         atomicAdd((&fout[aidx].x) + tidxi, f.x);
 562     }
 563 }
 564 #endif
 565
 566 /*! Final i-force reduction; this generic implementation works with
 567  *  arbitrary array sizes.
 568  * TODO: add the tidxi < 3 trick
 569  */
 570 static __forceinline__ __device__
 571 void reduce_force_i_generic(float *f_buf, float3 *fout,
 572                             float *fshift_buf, bool bCalcFshift,
 573                             int tidxi, int tidxj, int aidx)
 574 {
 575     if (tidxj < 3)
 576     {
 577         float f = 0.0f;
 578         for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
 579         {
 580             f += f_buf[tidxj * c_fbufStride + j];
 581         }
 582
 583         atomicAdd(&fout[aidx].x + tidxj, f);
 584
 585         if (bCalcFshift)
 586         {
 587             *fshift_buf += f;
 588         }
 589     }
 590 }
 591
 592 /*! Final i-force reduction; this implementation works only with power of two
 593  *  array sizes.
 594  */
 595 static __forceinline__ __device__
 596 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 597                          float *fshift_buf, bool bCalcFshift,
 598                          int tidxi, int tidxj, int aidx)
 599 {
 600     int     i, j;
 601     float   f;
 602
 603     assert(c_clSize == 1 << c_clSizeLog2);
 604
 605     /* Reduce the initial c_clSize values for each i atom to half
 606      * every step by using c_clSize * i threads.
 607      * Can't just use i as loop variable because than nvcc refuses to unroll.
 608      */
 609     i = c_clSize/2;
 610 #pragma unroll 5
 611     for (j = c_clSizeLog2 - 1; j > 0; j--)
 612     {
 613         if (tidxj < i)
 614         {
 615
 616             f_buf[                   tidxj * c_clSize + tidxi] += f_buf[                   (tidxj + i) * c_clSize + tidxi];
 617             f_buf[    c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[    c_fbufStride + (tidxj + i) * c_clSize + tidxi];
 618             f_buf[2 * c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[2 * c_fbufStride + (tidxj + i) * c_clSize + tidxi];
 619         }
 620         i >>= 1;
 621     }
 622
 623     /* i == 1, last reduction step, writing to global mem */
 624     if (tidxj < 3)
 625     {
 626         /* tidxj*c_fbufStride selects x, y or z */
 627         f = f_buf[tidxj * c_fbufStride               + tidxi] +
 628             f_buf[tidxj * c_fbufStride + i * c_clSize + tidxi];
 629
 630         atomicAdd(&(fout[aidx].x) + tidxj, f);
 631
 632         if (bCalcFshift)
 633         {
 634             *fshift_buf += f;
 635         }
 636     }
 637
 638 }
 639
 640 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 641  *  on whether the size of the array to be reduced is power of two or not.
 642  */
 643 static __forceinline__ __device__
 644 void reduce_force_i(float *f_buf, float3 *f,
 645                     float *fshift_buf, bool bCalcFshift,
 646                     int tidxi, int tidxj, int ai)
 647 {
 648     if ((c_clSize & (c_clSize - 1)))
 649     {
 650         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 651     }
 652     else
 653     {
 654         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 655     }
 656 }
 657
 658 /*! Final i-force reduction; this implementation works only with power of two
 659  *  array sizes and with sm >= 3.0
 660  */
 661 #if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 662 static __forceinline__ __device__
 663 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 664                               float *fshift_buf, bool bCalcFshift,
 665                               int tidxj, int aidx)
 666 {
 667     fin.x += __shfl_down(fin.x, c_clSize);
 668     fin.y += __shfl_up  (fin.y, c_clSize);
 669     fin.z += __shfl_down(fin.z, c_clSize);
 670
 671     if (tidxj & 1)
 672     {
 673         fin.x = fin.y;
 674     }
 675
 676     fin.x += __shfl_down(fin.x, 2*c_clSize);
 677     fin.z += __shfl_up  (fin.z, 2*c_clSize);
 678
 679     if (tidxj & 2)
 680     {
 681         fin.x = fin.z;
 682     }
 683
 684     /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
 685     if ((tidxj & 3) < 3)
 686     {
 687         atomicAdd(&fout[aidx].x + (tidxj & 3), fin.x);
 688
 689         if (bCalcFshift)
 690         {
 691             *fshift_buf += fin.x;
 692         }
 693     }
 694 }
 695 #endif
 696
 697 /*! Energy reduction; this implementation works only with power of two
 698  *  array sizes.
 699  */
 700 static __forceinline__ __device__
 701 void reduce_energy_pow2(volatile float *buf,
 702                         float *e_lj, float *e_el,
 703                         unsigned int tidx)
 704 {
 705     int     i, j;
 706     float   e1, e2;
 707
 708     i = warp_size/2;
 709
 710     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 711 #pragma unroll 10
 712     for (j = warp_size_log2 - 1; j > 0; j--)
 713     {
 714         if (tidx < i)
 715         {
 716             buf[               tidx] += buf[               tidx + i];
 717             buf[c_fbufStride + tidx] += buf[c_fbufStride + tidx + i];
 718         }
 719         i >>= 1;
 720     }
 721
 722     // TODO do this on two threads - requires e_lj and e_el to be stored on adjascent
 723     // memory locations to make sense
 724     /* last reduction step, writing to global mem */
 725     if (tidx == 0)
 726     {
 727         e1 = buf[               tidx] + buf[               tidx + i];
 728         e2 = buf[c_fbufStride + tidx] + buf[c_fbufStride + tidx + i];
 729
 730         atomicAdd(e_lj, e1);
 731         atomicAdd(e_el, e2);
 732     }
 733 }
 734
 735 /*! Energy reduction; this implementation works only with power of two
 736  *  array sizes and with sm >= 3.0
 737  */
 738 #if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 739 static __forceinline__ __device__
 740 void reduce_energy_warp_shfl(float E_lj, float E_el,
 741                              float *e_lj, float *e_el,
 742                              int tidx)
 743 {
 744     int i, sh;
 745
 746     sh = 1;
 747 #pragma unroll 5
 748     for (i = 0; i < 5; i++)
 749     {
 750         E_lj += __shfl_down(E_lj, sh);
 751         E_el += __shfl_down(E_el, sh);
 752         sh   += sh;
 753     }
 754
 755     /* The first thread in the warp writes the reduced energies */
 756     if (tidx == 0 || tidx == warp_size)
 757     {
 758         atomicAdd(e_lj, E_lj);
 759         atomicAdd(e_el, E_el);
 760     }
 761 }
 762 #endif /* GMX_PTX_ARCH */
 763
 764 #undef USE_TEXOBJ
 765
 766 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */