Fix potential problem in Messenger related to MPI window
[hoomd-blue.git] / libhoomd / cuda / PotentialTersoffGPU.cuh
blob0d6937b55ea6d9842ba2abb8f3b066dca205347e
1 /*
2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
4 the University of Michigan All rights reserved.
6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
7 copyright is held, by various Contributors who have granted The Regents of the
8 University of Michigan the right to modify and/or distribute such Contributions.
10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
11 and binary forms, provided you abide by the following conditions:
13 * Redistributions of source code must retain the above copyright notice, this
14 list of conditions, and the following disclaimer both in the code and
15 prominently in any materials provided with the distribution.
17 * Redistributions in binary form must reproduce the above copyright notice, this
18 list of conditions, and the following disclaimer in the documentation and/or
19 other materials provided with the distribution.
21 * All publications and presentations based on HOOMD-blue, including any reports
22 or published results obtained, in whole or in part, with HOOMD-blue, will
23 acknowledge its use according to the terms posted at the time of submission on:
24 http://codeblue.umich.edu/hoomd-blue/citations.html
26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
27 http://codeblue.umich.edu/hoomd-blue/
29 * Apart from the above required attributions, neither the name of the copyright
30 holder nor the names of HOOMD-blue's contributors may be used to endorse or
31 promote products derived from this software without specific prior written
32 permission.
34 Disclaimer
36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 #include "HOOMDMath.h"
51 #include "TextureTools.h"
52 #include "ParticleData.cuh"
53 #include "Index1D.h"
55 #ifdef WIN32
56 #include <cassert>
57 #else
58 #include <assert.h>
59 #endif
61 /*! \file PotentialTersoffGPU.cuh
62     \brief Defines templated GPU kernel code for calculating certain three-body forces
65 #ifndef __POTENTIAL_TERSOFF_GPU_CUH__
66 #define __POTENTIAL_TERSOFF_GPU_CUH__
68 //! Wraps arguments to gpu_cgpf
69 struct tersoff_args_t
70     {
71     //! Construct a tersoff_args_t
72     tersoff_args_t(Scalar4 *_d_force,
73                    const unsigned int _N,
74                    const Scalar4 *_d_pos,
75                    const BoxDim& _box,
76                    const unsigned int *_d_n_neigh,
77                    const unsigned int *_d_nlist,
78                    const Index2D& _nli,
79                    const Scalar *_d_rcutsq,
80                    const Scalar *_d_ronsq,
81                    const unsigned int _ntypes,
82                    const unsigned int _block_size)
83                    : d_force(_d_force),
84                      N(_N),
85                      d_pos(_d_pos),
86                      box(_box),
87                      d_n_neigh(_d_n_neigh),
88                      d_nlist(_d_nlist),
89                      nli(_nli),
90                      d_rcutsq(_d_rcutsq),
91                      d_ronsq(_d_ronsq),
92                      ntypes(_ntypes),
93                      block_size(_block_size)
94         {
95         };
97     Scalar4 *d_force;                //!< Force to write out
98     const unsigned int N;            //!< Number of particles
99     const Scalar4 *d_pos;            //!< particle positions
100     const BoxDim& box;                //!< Simulation box in GPU format
101     const unsigned int *d_n_neigh;  //!< Device array listing the number of neighbors on each particle
102     const unsigned int *d_nlist;    //!< Device array listing the neighbors of each particle
103     const Index2D& nli;             //!< Indexer for accessing d_nlist
104     const Scalar *d_rcutsq;          //!< Device array listing r_cut squared per particle type pair
105     const Scalar *d_ronsq;           //!< Device array listing r_on squared per particle type pair
106     const unsigned int ntypes;      //!< Number of particle types in the simulation
107     const unsigned int block_size;  //!< Block size to execute
108     };
111 #ifdef NVCC
112 //! Texture for reading particle positions
113 scalar4_tex_t pdata_pos_tex;
115 #ifndef SINGLE_PRECISION
116 //! atomicAdd function for double-precision floating point numbers
117 /*! This function is only used when hoomd is compiled for double precision on the GPU.
119     \param address Address to write the double to
120     \param val Value to add to address
122 static __device__ inline double atomicAdd(double* address, double val)
123     {
124     unsigned long long int* address_as_ull = (unsigned long long int*)address;
125     unsigned long long int old = *address_as_ull, assumed;
127     do {
128         assumed = old;
129         old = atomicCAS(address_as_ull,
130                         assumed,
131                         __double_as_longlong(val + __longlong_as_double(assumed)));
132     } while (assumed != old);
134     return __longlong_as_double(old);
135     }
136 #endif
138 //! Kernel for calculating the Tersoff forces
139 /*! This kernel is called to calculate the forces on all N particles. Actual evaluation of the potentials and
140     forces for each pair is handled via the template class \a evaluator.
142     \param d_force Device memory to write computed forces
143     \param N Number of particles in the system
144     \param d_pos Positions of all the particles
145     \param box Box dimensions used to implement periodic boundary conditions
146     \param d_n_neigh Device memory array listing the number of neighbors for each particle
147     \param d_nlist Device memory array containing the neighbor list contents
148     \param nli Indexer for indexing \a d_nlist
149     \param d_params Parameters for the potential, stored per type pair
150     \param d_rcutsq rcut squared, stored per type pair
151     \param d_ronsq ron squared, stored per type pair
152     \param ntypes Number of types in the simulation
154     \a d_params, \a d_rcutsq, and \a d_ronsq must be indexed with an Index2DUpperTriangler(typei, typej) to access the
155     unique value for that type pair. These values are all cached into shared memory for quick access, so a dynamic
156     amount of shared memory must be allocatd for this kernel launch. The amount is
157     (2*sizeof(Scalar) + sizeof(typename evaluator::param_type)) * typpair_idx.getNumElements()
159     Certain options are controlled via template parameters to avoid the performance hit when they are not enabled.
160     \tparam evaluator EvaluatorPair class to evualuate V(r) and -delta V(r)/r
162     <b>Implementation details</b>
163     Each block will calculate the forces on a block of particles.
164     Each thread will calculate the total force on one particle.
165     The neighborlist is arranged in columns so that reads are fully coalesced when doing this.
167 template< class evaluator >
168 __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force,
169                                                   const unsigned int N,
170                                                   const Scalar4 *d_pos,
171                                                   const BoxDim box,
172                                                   const unsigned int *d_n_neigh,
173                                                   const unsigned int *d_nlist,
174                                                   const Index2D nli,
175                                                   const typename evaluator::param_type *d_params,
176                                                   const Scalar *d_rcutsq,
177                                                   const Scalar *d_ronsq,
178                                                   const unsigned int ntypes)
179     {
180     Index2D typpair_idx(ntypes);
181     const unsigned int num_typ_parameters = typpair_idx.getNumElements();
183     // shared arrays for per type pair parameters
184     extern __shared__ char s_data[];
185     typename evaluator::param_type *s_params =
186         (typename evaluator::param_type *)(&s_data[0]);
187     Scalar *s_rcutsq = (Scalar *)(&s_data[num_typ_parameters*sizeof(evaluator::param_type)]);
189     // load in the per type pair parameters
190     for (unsigned int cur_offset = 0; cur_offset < num_typ_parameters; cur_offset += blockDim.x)
191         {
192         if (cur_offset + threadIdx.x < num_typ_parameters)
193             {
194             s_rcutsq[cur_offset + threadIdx.x] = d_rcutsq[cur_offset + threadIdx.x];
195             s_params[cur_offset + threadIdx.x] = d_params[cur_offset + threadIdx.x];
196             }
197         }
198     __syncthreads();
200     // start by identifying which particle we are to handle
201     unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
203     if (idx >= N)
204         return;
206     // load in the length of the neighbor list (MEM_TRANSFER: 4 bytes)
207     unsigned int n_neigh = d_n_neigh[idx];
209     // read in the position of the particle
210     Scalar4 postypei = texFetchScalar4(d_pos, pdata_pos_tex, idx);
211     Scalar3 posi = make_scalar3(postypei.x, postypei.y, postypei.z);
213     // initialize the force to 0
214     Scalar4 forcei = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
216     // prefetch neighbor index
217     unsigned int cur_j = 0;
218     unsigned int next_j = d_nlist[nli(idx, 0)];
220     // loop over neighbors
221     // on pre Fermi hardware, there is a bug that causes rare and random ULFs when simply looping over n_neigh
222     // the workaround (activated via the template paramter) is to loop over nlist.height and put an if (i < n_neigh)
223     // inside the loop
224     #if (__CUDA_ARCH__ < 200)
225     for (int neigh_idx = 0; neigh_idx < nli.getH(); neigh_idx++)
226     #else
227     for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++)
228     #endif
229     {
230         #if (__CUDA_ARCH__ < 200)
231         if (neigh_idx < n_neigh)
232         #endif
233         {
234             // read the current neighbor index (MEM TRANSFER: 4 bytes)
235             // prefetch the next value and set the current one
236             cur_j = next_j;
237             next_j = d_nlist[nli(idx, neigh_idx + 1)];
239             // read the position of j (MEM TRANSFER: 16 bytes)
240             Scalar4 postypej = texFetchScalar4(d_pos, pdata_pos_tex, cur_j);
241             Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z);
243             // initialize the force on j
244             Scalar4 forcej = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
246             // compute r_ij (FLOPS: 3)
247             Scalar3 dxij = posi - posj;
249             // apply periodic boundary conditions (FLOPS: 12)
250             dxij = box.minImage(dxij);
252             // compute rij_sq (FLOPS: 5)
253             Scalar rij_sq = dot(dxij, dxij);
255             // access the per type-pair parameters
256             unsigned int typpair = typpair_idx(__scalar_as_int(postypei.w), __scalar_as_int(postypej.w));
257             Scalar rcutsq = s_rcutsq[typpair];
258             typename evaluator::param_type param = s_params[typpair];
260             // compute the base repulsive and attractive terms of the potential
261             Scalar fR = Scalar(0.0);
262             Scalar fA = Scalar(0.0);
263             evaluator eval(rij_sq, rcutsq, param);
264             bool evaluatedij = eval.evalRepulsiveAndAttractive(fR, fA);
266             if (evaluatedij)
267                 {
268                 // compute chi
269                 Scalar chi = Scalar(0.0);
270                 unsigned int cur_k = 0;
271                 unsigned int next_k = d_nlist[nli(idx, 0)];
272                 #if (__CUDA_ARCH__ < 200)
273                 for (int neigh_idy = 0; neigh_idy < nli.getH(); neigh_idy++)
274                 #else
275                 for (int neigh_idy = 0; neigh_idy < n_neigh; neigh_idy++)
276                 #endif
277                     {
278                     #if (__CUDA_ARCH__ < 200)
279                     if (neigh_idy < n_neigh)
280                     #endif
281                         {
282                         // read the current index of k and prefetch the next one
283                         cur_k = next_k;
284                         next_k = d_nlist[nli(idx, neigh_idy+1)];
286                         // get the position of neighbor k
287                         Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k);
288                         Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z);
290                         // get the type pair parameters for i and k
291                         typpair = typpair_idx(__scalar_as_int(postypei.w), __scalar_as_int(postypek.w));
292                         Scalar temp_rcutsq = s_rcutsq[typpair];
293                         typename evaluator::param_type temp_param = s_params[typpair];
295                         evaluator temp_eval(rij_sq, temp_rcutsq, temp_param);
296                         bool temp_evaluated = temp_eval.areInteractive();
298                         if (cur_k != cur_j && temp_evaluated)
299                             {
300                             // compute rik
301                             Scalar3 dxik = posi - posk;
303                             // apply the periodic boundary conditions
304                             dxik = box.minImage(dxik);
306                             // compute rik_sq
307                             Scalar rik_sq = dot(dxik, dxik);
309                             // compute the bond angle (if needed)
310                             Scalar cos_th = Scalar(0.0);
311                             if (evaluator::needsAngle())
312                                 cos_th = dot(dxij, dxik) * fast::rsqrt(rij_sq * rik_sq);
313                             else cos_th += Scalar(1.0); // shuts up the compiler warning
315                             // set up the evaluator
316                             eval.setRik(rik_sq);
317                             if (evaluator::needsAngle())
318                                 eval.setAngle(cos_th);
320                             // compute the partial chi term
321                             eval.evalChi(chi);
322                             }
323                         }
324                     }
325                 // evaluate the force and energy from the ij interaction
326                 Scalar force_divr = Scalar(0.0);
327                 Scalar potential_eng = Scalar(0.0);
328                 Scalar bij = Scalar(0.0);
329                 eval.evalForceij(fR, fA, chi, bij, force_divr, potential_eng);
331                 // add the forces and energies to their respective particles
332                 Scalar2 v_coeffs = make_scalar2(Scalar(1.0 / 6.0) * rij_sq, Scalar(0.0));
333                 #if (__CUDA_ARCH__ >= 200)
334                 forcei.x += dxij.x * force_divr;
335                 forcei.y += dxij.y * force_divr;
336                 forcei.z += dxij.z * force_divr;
338                 forcej.x -= dxij.x * force_divr;
339                 forcej.y -= dxij.y * force_divr;
340                 forcej.z -= dxij.z * force_divr;
341                 forcej.w += potential_eng;
342                 #else
343                 forcei.x += __fmul_rn(dxij.x, force_divr);
344                 forcei.y += __fmul_rn(dxij.y, force_divr);
345                 forcei.z += __fmul_rn(dxij.z, force_divr);
347                 forcej.x += Scalar(1.0); // shuts up the compiler warning
348                 #endif
349                 forcei.w += potential_eng;
351                 // now evaluate the force from the ik interactions
352                 cur_k = 0;
353                 next_k = d_nlist[nli(idx, 0)];
354                 #if (__CUDA_ARCH__ < 200)
355                 for (int neigh_idy = 0; neigh_idy < nli.getH(); neigh_idy++)
356                 #else
357                 for (int neigh_idy = 0; neigh_idy < n_neigh; neigh_idy++)
358                 #endif
359                     {
360                     #if (__CUDA_ARCH__ < 200)
361                     if (neigh_idy < n_neigh)
362                     #endif
363                         {
364                         // read the current neighbor index and prefetch the next one
365                         cur_k = next_k;
366                         next_k = d_nlist[nli(idx, neigh_idy+1)];
368                         // get the position of neighbor k
369                         Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k);
370                         Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z);
372                         // get the type pair parameters for i and k
373                         typpair = typpair_idx(__scalar_as_int(postypei.w), __scalar_as_int(postypek.w));
374                         Scalar temp_rcutsq = s_rcutsq[typpair];
375                         typename evaluator::param_type temp_param = s_params[typpair];
377                         evaluator temp_eval(rij_sq, temp_rcutsq, temp_param);
378                         bool temp_evaluated = temp_eval.areInteractive();
380                         if (cur_k != cur_j && temp_evaluated)
381                             {
382                             Scalar4 forcek = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
384                             // compute rik
385                             Scalar3 dxik = posi - posk;
387                             // apply the periodic boundary conditions
388                             dxik = box.minImage(dxik);
390                             // compute rik_sq
391                             Scalar rik_sq = dot(dxik, dxik);
393                             // compute the bond angle (if needed)
394                             Scalar cos_th = Scalar(0.0);
395                             if (evaluator::needsAngle())
396                                 cos_th = dot(dxij, dxik) * fast::rsqrt(rij_sq * rik_sq);
397                             else cos_th += Scalar(1.0); // shuts up the compiler warning
399                             // set up the evaluator
400                             eval.setRik(rik_sq);
401                             if (evaluator::needsAngle())
402                                 eval.setAngle(cos_th);
404                             // compute the force
405                             Scalar3 force_divr_ij = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
406                             Scalar3 force_divr_ik = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
407                             bool evaluatedjk = eval.evalForceik(fR, fA, chi, bij, force_divr_ij, force_divr_ik);
409                             if (evaluatedjk)
410                                 {
411                                 // add the forces to their respective particles
412                                 v_coeffs.y = Scalar(1.0 / 6.0) * rik_sq;
413                                 #if (__CUDA_ARCH__ >= 200)
414                                 forcei.x += force_divr_ij.x * dxij.x + force_divr_ik.x * dxik.x;
415                                 forcei.y += force_divr_ij.x * dxij.y + force_divr_ik.x * dxik.y;
416                                 forcei.z += force_divr_ij.x * dxij.z + force_divr_ik.x * dxik.z;
418                                 forcej.x += force_divr_ij.y * dxij.x + force_divr_ik.y * dxik.x;
419                                 forcej.y += force_divr_ij.y * dxij.y + force_divr_ik.y * dxik.y;
420                                 forcej.z += force_divr_ij.y * dxij.z + force_divr_ik.y * dxik.z;
422                                 forcek.x += force_divr_ij.z * dxij.x + force_divr_ik.z * dxik.x;
423                                 forcek.y += force_divr_ij.z * dxij.y + force_divr_ik.z * dxik.y;
424                                 forcek.z += force_divr_ij.z * dxij.z + force_divr_ik.z * dxik.z;
426                                 atomicAdd(&d_force[cur_k].x, forcek.x);
427                                 atomicAdd(&d_force[cur_k].y, forcek.y);
428                                 atomicAdd(&d_force[cur_k].z, forcek.z);
429                                 #else
430                                 forcei.x += __fmul_rn(dxij.x, force_divr_ij.x) + __fmul_rn(dxik.x, force_divr_ik.x);
431                                 forcei.y += __fmul_rn(dxij.y, force_divr_ij.x) + __fmul_rn(dxik.y, force_divr_ik.x);
432                                 forcei.z += __fmul_rn(dxij.z, force_divr_ij.x) + __fmul_rn(dxik.z, force_divr_ik.x);
434                                 forcek.x += Scalar(1.0); // shuts up the compiler warning
435                                 #endif
436                                 }
437                             }
438                         }
439                     }
440                 // on Fermi hardware we can use atomicAdd to gain some speed
441                 // otherwise we need to loop over neighbors of j
442                 #if (__CUDA_ARCH__ >= 200)
443                 // potential energy of j must be halved
444                 forcej.w *= Scalar(0.5);
445                 // write out the result for particle j
446                 atomicAdd(&d_force[cur_j].x, forcej.x);
447                 atomicAdd(&d_force[cur_j].y, forcej.y);
448                 atomicAdd(&d_force[cur_j].z, forcej.z);
449                 atomicAdd(&d_force[cur_j].w, forcej.w);
450                 #else
451                 // now we have to compute the force with i as a secondary/tertiary particle
452                 // first consider i as the secondary particle and j as the primary particle
453                 // rji is -rij
454                 dxij.x *= -Scalar(1.0);
455                 dxij.y *= -Scalar(1.0);
456                 dxij.z *= -Scalar(1.0);
458                 // the fR and fA already computed are still valid, so there is no point in recomputing them
460                 // recompute chi by looping over neighbors of j
461                 unsigned int n_neighj = d_n_neigh[cur_j];
462                 chi = Scalar(0.0);
463                 cur_k = 0;
464                 next_k = d_nlist[nli(cur_j, 0)];
465                 for (int neigh_idy = 0; neigh_idy < nli.getH(); neigh_idy++)
466                     {
467                     if (neigh_idy < n_neighj)
468                         {
469                         // read the index of k and prefetch the next one
470                         cur_k = next_k;
471                         next_k = d_nlist[nli(cur_j, neigh_idy+1)];
473                         if (cur_k != idx)
474                             {
475                             // get the position of neighbor k
476                             Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k);
477                             Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z);
479                             // compute rjk
480                             Scalar3 dxjk = posj - posk;
482                             // apply the periodic boundary conditions
483                             dxjk = box.minImage(dxjk);
485                             // compute rjk_sq
486                             Scalar rjk_sq = dot(dxjk, dxjk);
488                             // compute the bond angle (if needed)
489                             Scalar cos_th = Scalar(0.0);
490                             if (evaluator::needsAngle())
491                                 cos_th = dot(dxij, dxjk) * fast::rsqrt(rij_sq * rjk_sq);
492                             else cos_th += Scalar(1.0); // shuts up the compiler warning
494                             // set up the evaluator
495                             eval.setRik(rjk_sq);
496                             if (evaluator::needsAngle())
497                                 eval.setAngle(cos_th);
499                             // evaluate chi
500                             eval.evalChi(chi);
501                             }
502                         }
503                     }
504                 // now compute the ji force
505                 eval.evalForceij(fR, fA, chi, bij, force_divr, potential_eng);
507                 // add the force and energy to particle i
508                 forcei.x -= __fmul_rn(dxij.x, force_divr);
509                 forcei.y -= __fmul_rn(dxij.y, force_divr);
510                 forcei.z -= __fmul_rn(dxij.z, force_divr);
511                 forcei.w += potential_eng;
513                 // now compute the jk force
514                 cur_k = 0;
515                 next_k = d_nlist[nli(cur_j, 0)];
516                 for (int neigh_idy = 0; neigh_idy < nli.getH(); neigh_idy++)
517                     {
518                     if (neigh_idy < n_neighj)
519                         {
520                         // get the index of k and prefecth the next one
521                         cur_k = next_k;
522                         next_k = d_nlist[nli(cur_j, neigh_idy+1)];
524                         if (cur_k != idx)
525                             {
526                             // get the position of k
527                             Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k);
528                             Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z);
530                             // compute rjk
531                             Scalar3 dxjk = posj - posk;
533                             // apply periodic boundary conditions
534                             dxjk = box.minImage(dxjk);
536                             // compute rjk_sq
537                             Scalar rjk_sq = dot(dxjk, dxjk);
539                             // compute the bond angle (if needed)
540                             Scalar cos_th = Scalar(0.0);
541                             if (evaluator::needsAngle())
542                                 cos_th = dot(dxij, dxjk) * fast::rsqrt(rij_sq * rjk_sq);
543                             else cos_th += Scalar(1.0); // shuts up the compiler warning
545                             // set up the evaluator
546                             eval.setRik(rjk_sq);
547                             if (evaluator::needsAngle())
548                                 eval.setAngle(cos_th);
550                             // evaluate the force
551                             Scalar3 force_divr_ij = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
552                             Scalar3 force_divr_jk = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
553                             eval.evalForceik(fR, fA, chi, bij, force_divr_ij, force_divr_jk);
555                             // add the force to particle i
556                             forcei.x += __fmul_rn(dxjk.x, force_divr_jk.y) + __fmul_rn(dxij.x, force_divr_ij.y);
557                             forcei.y += __fmul_rn(dxjk.y, force_divr_jk.y) + __fmul_rn(dxij.y, force_divr_ij.y);
558                             forcei.z += __fmul_rn(dxjk.z, force_divr_jk.y) + __fmul_rn(dxij.z, force_divr_ij.y);
559                             }
560                         }
561                     }
562                 #endif
563                 }
564             }
565         }
566     // potential energy per particle must be halved
567     forcei.w *= Scalar(0.5);
568     // now that the force calculation is complete, write out the result (MEM TRANSFER: 20 bytes)
569     #if (__CUDA_ARCH__ >= 200)
570     atomicAdd(&d_force[idx].x, forcei.x);
571     atomicAdd(&d_force[idx].y, forcei.y);
572     atomicAdd(&d_force[idx].z, forcei.z);
573     atomicAdd(&d_force[idx].w, forcei.w);
574     #else
575     d_force[idx] = forcei;
576     #endif
577     }
579 //! Kernel for zeroing forces before computation with atomic additions.
580 /*! \param d_force Device memory to write forces to
581     \param N Number of particles in the system
584 __global__ void gpu_zero_forces_kernel(Scalar4 *d_force,
585                                        const unsigned int N)
586     {
587     // identify the particle we are supposed to handle
588     unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
590     if (idx >= N)
591         return;
593     // zero the force
594     d_force[idx] = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
595     }
597 //! Kernel driver that computes the three-body forces
598 /*! \param pair_args Other arugments to pass onto the kernel
599     \param d_params Parameters for the potential, stored per type pair
601     This is just a driver function for gpu_compute_triplet_forces_kernel(), see it for details.
603 template< class evaluator >
604 cudaError_t gpu_compute_triplet_forces(const tersoff_args_t& pair_args,
605                                        const typename evaluator::param_type *d_params)
606     {
607     assert(d_params);
608     assert(pair_args.d_rcutsq);
609     assert(pair_args.d_ronsq);
610     assert(pair_args.ntypes > 0);
612     static unsigned int max_block_size = UINT_MAX;
613     if (max_block_size == UINT_MAX)
614         {
615         cudaFuncAttributes attr;
616         cudaFuncGetAttributes(&attr, gpu_compute_triplet_forces_kernel<evaluator>);
617         max_block_size = attr.maxThreadsPerBlock;
618         }
620     unsigned int run_block_size = min(pair_args.block_size, max_block_size);
622     // setup the grid to run the kernel
623     dim3 grid( pair_args.N / run_block_size + 1, 1, 1);
624     dim3 threads(run_block_size, 1, 1);
626     // bind the position texture
627     pdata_pos_tex.normalized = false;
628     pdata_pos_tex.filterMode = cudaFilterModePoint;
629     cudaError_t error = cudaBindTexture(0, pdata_pos_tex, pair_args.d_pos, sizeof(Scalar4) * pair_args.N);
630     if (error != cudaSuccess)
631         return error;
633     Index2D typpair_idx(pair_args.ntypes);
634     unsigned int shared_bytes = (2*sizeof(Scalar) + sizeof(typename evaluator::param_type))
635                                 * typpair_idx.getNumElements();
637     // zero the forces
638     gpu_zero_forces_kernel<<<grid, threads, shared_bytes>>>(pair_args.d_force,
639                                                             pair_args.N);
641     // compute the new forces
642     gpu_compute_triplet_forces_kernel<evaluator>
643       <<<grid, threads, shared_bytes>>>(pair_args.d_force,
644                                         pair_args.N,
645                                         pair_args.d_pos,
646                                         pair_args.box,
647                                         pair_args.d_n_neigh,
648                                         pair_args.d_nlist,
649                                         pair_args.nli,
650                                         d_params,
651                                         pair_args.d_rcutsq,
652                                         pair_args.d_ronsq,
653                                         pair_args.ntypes);
654     return cudaSuccess;
655     }
656 #endif
658 #endif // __POTENTIAL_TERSOFF_GPU_CUH__