libhoomd/cuda/PotentialTersoffGPU.cuh

   1 /*
   2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
   3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
   4 the University of Michigan All rights reserved.
   5
   6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
   7 copyright is held, by various Contributors who have granted The Regents of the
   8 University of Michigan the right to modify and/or distribute such Contributions.
   9
  10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
  11 and binary forms, provided you abide by the following conditions:
  12
  13 * Redistributions of source code must retain the above copyright notice, this
  14 list of conditions, and the following disclaimer both in the code and
  15 prominently in any materials provided with the distribution.
  16
  17 * Redistributions in binary form must reproduce the above copyright notice, this
  18 list of conditions, and the following disclaimer in the documentation and/or
  19 other materials provided with the distribution.
  20
  21 * All publications and presentations based on HOOMD-blue, including any reports
  22 or published results obtained, in whole or in part, with HOOMD-blue, will
  23 acknowledge its use according to the terms posted at the time of submission on:
  24 http://codeblue.umich.edu/hoomd-blue/citations.html
  25
  26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
  27 http://codeblue.umich.edu/hoomd-blue/
  28
  29 * Apart from the above required attributions, neither the name of the copyright
  30 holder nor the names of HOOMD-blue's contributors may be used to endorse or
  31 promote products derived from this software without specific prior written
  32 permission.
  33
  34 Disclaimer
  35
  36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
  37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
  39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
  40
  41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
  46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  48 */
  49
  50 #include "HOOMDMath.h"
  51 #include "TextureTools.h"
  52 #include "ParticleData.cuh"
  53 #include "Index1D.h"
  54
  55 #ifdef WIN32
  56 #include <cassert>
  57 #else
  58 #include <assert.h>
  59 #endif
  60
  61 /*! \file PotentialTersoffGPU.cuh
  62     \brief Defines templated GPU kernel code for calculating certain three-body forces
  63 */
  64
  65 #ifndef __POTENTIAL_TERSOFF_GPU_CUH__
  66 #define __POTENTIAL_TERSOFF_GPU_CUH__
  67
  68 //! Wraps arguments to gpu_cgpf
  69 struct tersoff_args_t
  70     {
  71     //! Construct a tersoff_args_t
  72     tersoff_args_t(Scalar4 *_d_force,
  73                    const unsigned int _N,
  74                    const Scalar4 *_d_pos,
  75                    const BoxDim& _box,
  76                    const unsigned int *_d_n_neigh,
  77                    const unsigned int *_d_nlist,
  78                    const Index2D& _nli,
  79                    const Scalar *_d_rcutsq,
  80                    const Scalar *_d_ronsq,
  81                    const unsigned int _ntypes,
  82                    const unsigned int _block_size)
  83                    : d_force(_d_force),
  84                      N(_N),
  85                      d_pos(_d_pos),
  86                      box(_box),
  87                      d_n_neigh(_d_n_neigh),
  88                      d_nlist(_d_nlist),
  89                      nli(_nli),
  90                      d_rcutsq(_d_rcutsq),
  91                      d_ronsq(_d_ronsq),
  92                      ntypes(_ntypes),
  93                      block_size(_block_size)
  94         {
  95         };
  96
  97     Scalar4 *d_force;                //!< Force to write out
  98     const unsigned int N;            //!< Number of particles
  99     const Scalar4 *d_pos;            //!< particle positions
 100     const BoxDim& box;                //!< Simulation box in GPU format
 101     const unsigned int *d_n_neigh;  //!< Device array listing the number of neighbors on each particle
 102     const unsigned int *d_nlist;    //!< Device array listing the neighbors of each particle
 103     const Index2D& nli;             //!< Indexer for accessing d_nlist
 104     const Scalar *d_rcutsq;          //!< Device array listing r_cut squared per particle type pair
 105     const Scalar *d_ronsq;           //!< Device array listing r_on squared per particle type pair
 106     const unsigned int ntypes;      //!< Number of particle types in the simulation
 107     const unsigned int block_size;  //!< Block size to execute
 108     };
 109
 110
 111 #ifdef NVCC
 112 //! Texture for reading particle positions
 113 scalar4_tex_t pdata_pos_tex;
 114
 115 #ifndef SINGLE_PRECISION
 116 //! atomicAdd function for double-precision floating point numbers
 117 /*! This function is only used when hoomd is compiled for double precision on the GPU.
 118
 119     \param address Address to write the double to
 120     \param val Value to add to address
 121 */
 122 static __device__ inline double atomicAdd(double* address, double val)
 123     {
 124     unsigned long long int* address_as_ull = (unsigned long long int*)address;
 125     unsigned long long int old = *address_as_ull, assumed;
 126
 127     do {
 128         assumed = old;
 129         old = atomicCAS(address_as_ull,
 130                         assumed,
 131                         __double_as_longlong(val + __longlong_as_double(assumed)));
 132     } while (assumed != old);
 133
 134     return __longlong_as_double(old);
 135     }
 136 #endif
 137
 138 //! Kernel for calculating the Tersoff forces
 139 /*! This kernel is called to calculate the forces on all N particles. Actual evaluation of the potentials and
 140     forces for each pair is handled via the template class \a evaluator.
 141
 142     \param d_force Device memory to write computed forces
 143     \param N Number of particles in the system
 144     \param d_pos Positions of all the particles
 145     \param box Box dimensions used to implement periodic boundary conditions
 146     \param d_n_neigh Device memory array listing the number of neighbors for each particle
 147     \param d_nlist Device memory array containing the neighbor list contents
 148     \param nli Indexer for indexing \a d_nlist
 149     \param d_params Parameters for the potential, stored per type pair
 150     \param d_rcutsq rcut squared, stored per type pair
 151     \param d_ronsq ron squared, stored per type pair
 152     \param ntypes Number of types in the simulation
 153
 154     \a d_params, \a d_rcutsq, and \a d_ronsq must be indexed with an Index2DUpperTriangler(typei, typej) to access the
 155     unique value for that type pair. These values are all cached into shared memory for quick access, so a dynamic
 156     amount of shared memory must be allocatd for this kernel launch. The amount is
 157     (2*sizeof(Scalar) + sizeof(typename evaluator::param_type)) * typpair_idx.getNumElements()
 158
 159     Certain options are controlled via template parameters to avoid the performance hit when they are not enabled.
 160     \tparam evaluator EvaluatorPair class to evualuate V(r) and -delta V(r)/r
 161
 162     <b>Implementation details</b>
 163     Each block will calculate the forces on a block of particles.
 164     Each thread will calculate the total force on one particle.
 165     The neighborlist is arranged in columns so that reads are fully coalesced when doing this.
 166 */
 167 template< class evaluator >
 168 __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force,
 169                                                   const unsigned int N,
 170                                                   const Scalar4 *d_pos,
 171                                                   const BoxDim box,
 172                                                   const unsigned int *d_n_neigh,
 173                                                   const unsigned int *d_nlist,
 174                                                   const Index2D nli,
 175                                                   const typename evaluator::param_type *d_params,
 176                                                   const Scalar *d_rcutsq,
 177                                                   const Scalar *d_ronsq,
 178                                                   const unsigned int ntypes)
 179     {
 180     Index2D typpair_idx(ntypes);
 181     const unsigned int num_typ_parameters = typpair_idx.getNumElements();
 182
 183     // shared arrays for per type pair parameters
 184     extern __shared__ char s_data[];
 185     typename evaluator::param_type *s_params =
 186         (typename evaluator::param_type *)(&s_data[0]);
 187     Scalar *s_rcutsq = (Scalar *)(&s_data[num_typ_parameters*sizeof(evaluator::param_type)]);
 188
 189     // load in the per type pair parameters
 190     for (unsigned int cur_offset = 0; cur_offset < num_typ_parameters; cur_offset += blockDim.x)
 191         {
 192         if (cur_offset + threadIdx.x < num_typ_parameters)
 193             {
 194             s_rcutsq[cur_offset + threadIdx.x] = d_rcutsq[cur_offset + threadIdx.x];
 195             s_params[cur_offset + threadIdx.x] = d_params[cur_offset + threadIdx.x];
 196             }
 197         }
 198     __syncthreads();
 199
 200     // start by identifying which particle we are to handle
 201     unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
 202
 203     if (idx >= N)
 204         return;
 205
 206     // load in the length of the neighbor list (MEM_TRANSFER: 4 bytes)
 207     unsigned int n_neigh = d_n_neigh[idx];
 208
 209     // read in the position of the particle
 210     Scalar4 postypei = texFetchScalar4(d_pos, pdata_pos_tex, idx);
 211     Scalar3 posi = make_scalar3(postypei.x, postypei.y, postypei.z);
 212
 213     // initialize the force to 0
 214     Scalar4 forcei = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
 215
 216     // prefetch neighbor index
 217     unsigned int cur_j = 0;
 218     unsigned int next_j = d_nlist[nli(idx, 0)];
 219
 220     // loop over neighbors
 221     // on pre Fermi hardware, there is a bug that causes rare and random ULFs when simply looping over n_neigh
 222     // the workaround (activated via the template paramter) is to loop over nlist.height and put an if (i < n_neigh)
 223     // inside the loop
 224     #if (__CUDA_ARCH__ < 200)
 225     for (int neigh_idx = 0; neigh_idx < nli.getH(); neigh_idx++)
 226     #else
 227     for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++)
 228     #endif
 229     {
 230         #if (__CUDA_ARCH__ < 200)
 231         if (neigh_idx < n_neigh)
 232         #endif
 233         {
 234             // read the current neighbor index (MEM TRANSFER: 4 bytes)
 235             // prefetch the next value and set the current one
 236             cur_j = next_j;
 237             next_j = d_nlist[nli(idx, neigh_idx + 1)];
 238
 239             // read the position of j (MEM TRANSFER: 16 bytes)
 240             Scalar4 postypej = texFetchScalar4(d_pos, pdata_pos_tex, cur_j);
 241             Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z);
 242
 243             // initialize the force on j
 244             Scalar4 forcej = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
 245
 246             // compute r_ij (FLOPS: 3)
 247             Scalar3 dxij = posi - posj;
 248
 249             // apply periodic boundary conditions (FLOPS: 12)
 250             dxij = box.minImage(dxij);
 251
 252             // compute rij_sq (FLOPS: 5)
 253             Scalar rij_sq = dot(dxij, dxij);
 254
 255             // access the per type-pair parameters
 256             unsigned int typpair = typpair_idx(__scalar_as_int(postypei.w), __scalar_as_int(postypej.w));
 257             Scalar rcutsq = s_rcutsq[typpair];
 258             typename evaluator::param_type param = s_params[typpair];
 259
 260             // compute the base repulsive and attractive terms of the potential
 261             Scalar fR = Scalar(0.0);
 262             Scalar fA = Scalar(0.0);
 263             evaluator eval(rij_sq, rcutsq, param);
 264             bool evaluatedij = eval.evalRepulsiveAndAttractive(fR, fA);
 265
 266             if (evaluatedij)
 267                 {
 268                 // compute chi
 269                 Scalar chi = Scalar(0.0);
 270                 unsigned int cur_k = 0;
 271                 unsigned int next_k = d_nlist[nli(idx, 0)];
 272                 #if (__CUDA_ARCH__ < 200)
 273                 for (int neigh_idy = 0; neigh_idy < nli.getH(); neigh_idy++)
 274                 #else
 275                 for (int neigh_idy = 0; neigh_idy < n_neigh; neigh_idy++)
 276                 #endif
 277                     {
 278                     #if (__CUDA_ARCH__ < 200)
 279                     if (neigh_idy < n_neigh)
 280                     #endif
 281                         {
 282                         // read the current index of k and prefetch the next one
 283                         cur_k = next_k;
 284                         next_k = d_nlist[nli(idx, neigh_idy+1)];
 285
 286                         // get the position of neighbor k
 287                         Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k);
 288                         Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z);
 289
 290                         // get the type pair parameters for i and k
 291                         typpair = typpair_idx(__scalar_as_int(postypei.w), __scalar_as_int(postypek.w));
 292                         Scalar temp_rcutsq = s_rcutsq[typpair];
 293                         typename evaluator::param_type temp_param = s_params[typpair];
 294
 295                         evaluator temp_eval(rij_sq, temp_rcutsq, temp_param);
 296                         bool temp_evaluated = temp_eval.areInteractive();
 297
 298                         if (cur_k != cur_j && temp_evaluated)
 299                             {
 300                             // compute rik
 301                             Scalar3 dxik = posi - posk;
 302
 303                             // apply the periodic boundary conditions
 304                             dxik = box.minImage(dxik);
 305
 306                             // compute rik_sq
 307                             Scalar rik_sq = dot(dxik, dxik);
 308
 309                             // compute the bond angle (if needed)
 310                             Scalar cos_th = Scalar(0.0);
 311                             if (evaluator::needsAngle())
 312                                 cos_th = dot(dxij, dxik) * fast::rsqrt(rij_sq * rik_sq);
 313                             else cos_th += Scalar(1.0); // shuts up the compiler warning
 314
 315                             // set up the evaluator
 316                             eval.setRik(rik_sq);
 317                             if (evaluator::needsAngle())
 318                                 eval.setAngle(cos_th);
 319
 320                             // compute the partial chi term
 321                             eval.evalChi(chi);
 322                             }
 323                         }
 324                     }
 325                 // evaluate the force and energy from the ij interaction
 326                 Scalar force_divr = Scalar(0.0);
 327                 Scalar potential_eng = Scalar(0.0);
 328                 Scalar bij = Scalar(0.0);
 329                 eval.evalForceij(fR, fA, chi, bij, force_divr, potential_eng);
 330
 331                 // add the forces and energies to their respective particles
 332                 Scalar2 v_coeffs = make_scalar2(Scalar(1.0 / 6.0) * rij_sq, Scalar(0.0));
 333                 #if (__CUDA_ARCH__ >= 200)
 334                 forcei.x += dxij.x * force_divr;
 335                 forcei.y += dxij.y * force_divr;
 336                 forcei.z += dxij.z * force_divr;
 337
 338                 forcej.x -= dxij.x * force_divr;
 339                 forcej.y -= dxij.y * force_divr;
 340                 forcej.z -= dxij.z * force_divr;
 341                 forcej.w += potential_eng;
 342                 #else
 343                 forcei.x += __fmul_rn(dxij.x, force_divr);
 344                 forcei.y += __fmul_rn(dxij.y, force_divr);
 345                 forcei.z += __fmul_rn(dxij.z, force_divr);
 346
 347                 forcej.x += Scalar(1.0); // shuts up the compiler warning
 348                 #endif
 349                 forcei.w += potential_eng;
 350
 351                 // now evaluate the force from the ik interactions
 352                 cur_k = 0;
 353                 next_k = d_nlist[nli(idx, 0)];
 354                 #if (__CUDA_ARCH__ < 200)
 355                 for (int neigh_idy = 0; neigh_idy < nli.getH(); neigh_idy++)
 356                 #else
 357                 for (int neigh_idy = 0; neigh_idy < n_neigh; neigh_idy++)
 358                 #endif
 359                     {
 360                     #if (__CUDA_ARCH__ < 200)
 361                     if (neigh_idy < n_neigh)
 362                     #endif
 363                         {
 364                         // read the current neighbor index and prefetch the next one
 365                         cur_k = next_k;
 366                         next_k = d_nlist[nli(idx, neigh_idy+1)];
 367
 368                         // get the position of neighbor k
 369                         Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k);
 370                         Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z);
 371
 372                         // get the type pair parameters for i and k
 373                         typpair = typpair_idx(__scalar_as_int(postypei.w), __scalar_as_int(postypek.w));
 374                         Scalar temp_rcutsq = s_rcutsq[typpair];
 375                         typename evaluator::param_type temp_param = s_params[typpair];
 376
 377                         evaluator temp_eval(rij_sq, temp_rcutsq, temp_param);
 378                         bool temp_evaluated = temp_eval.areInteractive();
 379
 380                         if (cur_k != cur_j && temp_evaluated)
 381                             {
 382                             Scalar4 forcek = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
 383
 384                             // compute rik
 385                             Scalar3 dxik = posi - posk;
 386
 387                             // apply the periodic boundary conditions
 388                             dxik = box.minImage(dxik);
 389
 390                             // compute rik_sq
 391                             Scalar rik_sq = dot(dxik, dxik);
 392
 393                             // compute the bond angle (if needed)
 394                             Scalar cos_th = Scalar(0.0);
 395                             if (evaluator::needsAngle())
 396                                 cos_th = dot(dxij, dxik) * fast::rsqrt(rij_sq * rik_sq);
 397                             else cos_th += Scalar(1.0); // shuts up the compiler warning
 398
 399                             // set up the evaluator
 400                             eval.setRik(rik_sq);
 401                             if (evaluator::needsAngle())
 402                                 eval.setAngle(cos_th);
 403
 404                             // compute the force
 405                             Scalar3 force_divr_ij = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
 406                             Scalar3 force_divr_ik = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
 407                             bool evaluatedjk = eval.evalForceik(fR, fA, chi, bij, force_divr_ij, force_divr_ik);
 408
 409                             if (evaluatedjk)
 410                                 {
 411                                 // add the forces to their respective particles
 412                                 v_coeffs.y = Scalar(1.0 / 6.0) * rik_sq;
 413                                 #if (__CUDA_ARCH__ >= 200)
 414                                 forcei.x += force_divr_ij.x * dxij.x + force_divr_ik.x * dxik.x;
 415                                 forcei.y += force_divr_ij.x * dxij.y + force_divr_ik.x * dxik.y;
 416                                 forcei.z += force_divr_ij.x * dxij.z + force_divr_ik.x * dxik.z;
 417
 418                                 forcej.x += force_divr_ij.y * dxij.x + force_divr_ik.y * dxik.x;
 419                                 forcej.y += force_divr_ij.y * dxij.y + force_divr_ik.y * dxik.y;
 420                                 forcej.z += force_divr_ij.y * dxij.z + force_divr_ik.y * dxik.z;
 421
 422                                 forcek.x += force_divr_ij.z * dxij.x + force_divr_ik.z * dxik.x;
 423                                 forcek.y += force_divr_ij.z * dxij.y + force_divr_ik.z * dxik.y;
 424                                 forcek.z += force_divr_ij.z * dxij.z + force_divr_ik.z * dxik.z;
 425
 426                                 atomicAdd(&d_force[cur_k].x, forcek.x);
 427                                 atomicAdd(&d_force[cur_k].y, forcek.y);
 428                                 atomicAdd(&d_force[cur_k].z, forcek.z);
 429                                 #else
 430                                 forcei.x += __fmul_rn(dxij.x, force_divr_ij.x) + __fmul_rn(dxik.x, force_divr_ik.x);
 431                                 forcei.y += __fmul_rn(dxij.y, force_divr_ij.x) + __fmul_rn(dxik.y, force_divr_ik.x);
 432                                 forcei.z += __fmul_rn(dxij.z, force_divr_ij.x) + __fmul_rn(dxik.z, force_divr_ik.x);
 433
 434                                 forcek.x += Scalar(1.0); // shuts up the compiler warning
 435                                 #endif
 436                                 }
 437                             }
 438                         }
 439                     }
 440                 // on Fermi hardware we can use atomicAdd to gain some speed
 441                 // otherwise we need to loop over neighbors of j
 442                 #if (__CUDA_ARCH__ >= 200)
 443                 // potential energy of j must be halved
 444                 forcej.w *= Scalar(0.5);
 445                 // write out the result for particle j
 446                 atomicAdd(&d_force[cur_j].x, forcej.x);
 447                 atomicAdd(&d_force[cur_j].y, forcej.y);
 448                 atomicAdd(&d_force[cur_j].z, forcej.z);
 449                 atomicAdd(&d_force[cur_j].w, forcej.w);
 450                 #else
 451                 // now we have to compute the force with i as a secondary/tertiary particle
 452                 // first consider i as the secondary particle and j as the primary particle
 453                 // rji is -rij
 454                 dxij.x *= -Scalar(1.0);
 455                 dxij.y *= -Scalar(1.0);
 456                 dxij.z *= -Scalar(1.0);
 457
 458                 // the fR and fA already computed are still valid, so there is no point in recomputing them
 459
 460                 // recompute chi by looping over neighbors of j
 461                 unsigned int n_neighj = d_n_neigh[cur_j];
 462                 chi = Scalar(0.0);
 463                 cur_k = 0;
 464                 next_k = d_nlist[nli(cur_j, 0)];
 465                 for (int neigh_idy = 0; neigh_idy < nli.getH(); neigh_idy++)
 466                     {
 467                     if (neigh_idy < n_neighj)
 468                         {
 469                         // read the index of k and prefetch the next one
 470                         cur_k = next_k;
 471                         next_k = d_nlist[nli(cur_j, neigh_idy+1)];
 472
 473                         if (cur_k != idx)
 474                             {
 475                             // get the position of neighbor k
 476                             Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k);
 477                             Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z);
 478
 479                             // compute rjk
 480                             Scalar3 dxjk = posj - posk;
 481
 482                             // apply the periodic boundary conditions
 483                             dxjk = box.minImage(dxjk);
 484
 485                             // compute rjk_sq
 486                             Scalar rjk_sq = dot(dxjk, dxjk);
 487
 488                             // compute the bond angle (if needed)
 489                             Scalar cos_th = Scalar(0.0);
 490                             if (evaluator::needsAngle())
 491                                 cos_th = dot(dxij, dxjk) * fast::rsqrt(rij_sq * rjk_sq);
 492                             else cos_th += Scalar(1.0); // shuts up the compiler warning
 493
 494                             // set up the evaluator
 495                             eval.setRik(rjk_sq);
 496                             if (evaluator::needsAngle())
 497                                 eval.setAngle(cos_th);
 498
 499                             // evaluate chi
 500                             eval.evalChi(chi);
 501                             }
 502                         }
 503                     }
 504                 // now compute the ji force
 505                 eval.evalForceij(fR, fA, chi, bij, force_divr, potential_eng);
 506
 507                 // add the force and energy to particle i
 508                 forcei.x -= __fmul_rn(dxij.x, force_divr);
 509                 forcei.y -= __fmul_rn(dxij.y, force_divr);
 510                 forcei.z -= __fmul_rn(dxij.z, force_divr);
 511                 forcei.w += potential_eng;
 512
 513                 // now compute the jk force
 514                 cur_k = 0;
 515                 next_k = d_nlist[nli(cur_j, 0)];
 516                 for (int neigh_idy = 0; neigh_idy < nli.getH(); neigh_idy++)
 517                     {
 518                     if (neigh_idy < n_neighj)
 519                         {
 520                         // get the index of k and prefecth the next one
 521                         cur_k = next_k;
 522                         next_k = d_nlist[nli(cur_j, neigh_idy+1)];
 523
 524                         if (cur_k != idx)
 525                             {
 526                             // get the position of k
 527                             Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k);
 528                             Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z);
 529
 530                             // compute rjk
 531                             Scalar3 dxjk = posj - posk;
 532
 533                             // apply periodic boundary conditions
 534                             dxjk = box.minImage(dxjk);
 535
 536                             // compute rjk_sq
 537                             Scalar rjk_sq = dot(dxjk, dxjk);
 538
 539                             // compute the bond angle (if needed)
 540                             Scalar cos_th = Scalar(0.0);
 541                             if (evaluator::needsAngle())
 542                                 cos_th = dot(dxij, dxjk) * fast::rsqrt(rij_sq * rjk_sq);
 543                             else cos_th += Scalar(1.0); // shuts up the compiler warning
 544
 545                             // set up the evaluator
 546                             eval.setRik(rjk_sq);
 547                             if (evaluator::needsAngle())
 548                                 eval.setAngle(cos_th);
 549
 550                             // evaluate the force
 551                             Scalar3 force_divr_ij = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
 552                             Scalar3 force_divr_jk = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
 553                             eval.evalForceik(fR, fA, chi, bij, force_divr_ij, force_divr_jk);
 554
 555                             // add the force to particle i
 556                             forcei.x += __fmul_rn(dxjk.x, force_divr_jk.y) + __fmul_rn(dxij.x, force_divr_ij.y);
 557                             forcei.y += __fmul_rn(dxjk.y, force_divr_jk.y) + __fmul_rn(dxij.y, force_divr_ij.y);
 558                             forcei.z += __fmul_rn(dxjk.z, force_divr_jk.y) + __fmul_rn(dxij.z, force_divr_ij.y);
 559                             }
 560                         }
 561                     }
 562                 #endif
 563                 }
 564             }
 565         }
 566     // potential energy per particle must be halved
 567     forcei.w *= Scalar(0.5);
 568     // now that the force calculation is complete, write out the result (MEM TRANSFER: 20 bytes)
 569     #if (__CUDA_ARCH__ >= 200)
 570     atomicAdd(&d_force[idx].x, forcei.x);
 571     atomicAdd(&d_force[idx].y, forcei.y);
 572     atomicAdd(&d_force[idx].z, forcei.z);
 573     atomicAdd(&d_force[idx].w, forcei.w);
 574     #else
 575     d_force[idx] = forcei;
 576     #endif
 577     }
 578
 579 //! Kernel for zeroing forces before computation with atomic additions.
 580 /*! \param d_force Device memory to write forces to
 581     \param N Number of particles in the system
 582
 583 */
 584 __global__ void gpu_zero_forces_kernel(Scalar4 *d_force,
 585                                        const unsigned int N)
 586     {
 587     // identify the particle we are supposed to handle
 588     unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
 589
 590     if (idx >= N)
 591         return;
 592
 593     // zero the force
 594     d_force[idx] = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
 595     }
 596
 597 //! Kernel driver that computes the three-body forces
 598 /*! \param pair_args Other arugments to pass onto the kernel
 599     \param d_params Parameters for the potential, stored per type pair
 600
 601     This is just a driver function for gpu_compute_triplet_forces_kernel(), see it for details.
 602 */
 603 template< class evaluator >
 604 cudaError_t gpu_compute_triplet_forces(const tersoff_args_t& pair_args,
 605                                        const typename evaluator::param_type *d_params)
 606     {
 607     assert(d_params);
 608     assert(pair_args.d_rcutsq);
 609     assert(pair_args.d_ronsq);
 610     assert(pair_args.ntypes > 0);
 611
 612     static unsigned int max_block_size = UINT_MAX;
 613     if (max_block_size == UINT_MAX)
 614         {
 615         cudaFuncAttributes attr;
 616         cudaFuncGetAttributes(&attr, gpu_compute_triplet_forces_kernel<evaluator>);
 617         max_block_size = attr.maxThreadsPerBlock;
 618         }
 619
 620     unsigned int run_block_size = min(pair_args.block_size, max_block_size);
 621
 622     // setup the grid to run the kernel
 623     dim3 grid( pair_args.N / run_block_size + 1, 1, 1);
 624     dim3 threads(run_block_size, 1, 1);
 625
 626     // bind the position texture
 627     pdata_pos_tex.normalized = false;
 628     pdata_pos_tex.filterMode = cudaFilterModePoint;
 629     cudaError_t error = cudaBindTexture(0, pdata_pos_tex, pair_args.d_pos, sizeof(Scalar4) * pair_args.N);
 630     if (error != cudaSuccess)
 631         return error;
 632
 633     Index2D typpair_idx(pair_args.ntypes);
 634     unsigned int shared_bytes = (2*sizeof(Scalar) + sizeof(typename evaluator::param_type))
 635                                 * typpair_idx.getNumElements();
 636
 637     // zero the forces
 638     gpu_zero_forces_kernel<<<grid, threads, shared_bytes>>>(pair_args.d_force,
 639                                                             pair_args.N);
 640
 641     // compute the new forces
 642     gpu_compute_triplet_forces_kernel<evaluator>
 643       <<<grid, threads, shared_bytes>>>(pair_args.d_force,
 644                                         pair_args.N,
 645                                         pair_args.d_pos,
 646                                         pair_args.box,
 647                                         pair_args.d_n_neigh,
 648                                         pair_args.d_nlist,
 649                                         pair_args.nli,
 650                                         d_params,
 651                                         pair_args.d_rcutsq,
 652                                         pair_args.d_ronsq,
 653                                         pair_args.ntypes);
 654     return cudaSuccess;
 655     }
 656 #endif
 657
 658 #endif // __POTENTIAL_TERSOFF_GPU_CUH__