2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
4 the University of Michigan All rights reserved.
6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
7 copyright is held, by various Contributors who have granted The Regents of the
8 University of Michigan the right to modify and/or distribute such Contributions.
10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
11 and binary forms, provided you abide by the following conditions:
13 * Redistributions of source code must retain the above copyright notice, this
14 list of conditions, and the following disclaimer both in the code and
15 prominently in any materials provided with the distribution.
17 * Redistributions in binary form must reproduce the above copyright notice, this
18 list of conditions, and the following disclaimer in the documentation and/or
19 other materials provided with the distribution.
21 * All publications and presentations based on HOOMD-blue, including any reports
22 or published results obtained, in whole or in part, with HOOMD-blue, will
23 acknowledge its use according to the terms posted at the time of submission on:
24 http://codeblue.umich.edu/hoomd-blue/citations.html
26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
27 http://codeblue.umich.edu/hoomd-blue/
29 * Apart from the above required attributions, neither the name of the copyright
30 holder nor the names of HOOMD-blue's contributors may be used to endorse or
31 promote products derived from this software without specific prior written
36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 #include "HOOMDMath.h"
51 #include "TextureTools.h"
52 #include "ParticleData.cuh"
61 /*! \file PotentialTersoffGPU.cuh
62 \brief Defines templated GPU kernel code for calculating certain three-body forces
65 #ifndef __POTENTIAL_TERSOFF_GPU_CUH__
66 #define __POTENTIAL_TERSOFF_GPU_CUH__
68 //! Wraps arguments to gpu_cgpf
71 //! Construct a tersoff_args_t
72 tersoff_args_t(Scalar4 *_d_force,
73 const unsigned int _N,
74 const Scalar4 *_d_pos,
76 const unsigned int *_d_n_neigh,
77 const unsigned int *_d_nlist,
79 const Scalar *_d_rcutsq,
80 const Scalar *_d_ronsq,
81 const unsigned int _ntypes,
82 const unsigned int _block_size)
87 d_n_neigh(_d_n_neigh),
93 block_size(_block_size)
97 Scalar4 *d_force; //!< Force to write out
98 const unsigned int N; //!< Number of particles
99 const Scalar4 *d_pos; //!< particle positions
100 const BoxDim& box; //!< Simulation box in GPU format
101 const unsigned int *d_n_neigh; //!< Device array listing the number of neighbors on each particle
102 const unsigned int *d_nlist; //!< Device array listing the neighbors of each particle
103 const Index2D& nli; //!< Indexer for accessing d_nlist
104 const Scalar *d_rcutsq; //!< Device array listing r_cut squared per particle type pair
105 const Scalar *d_ronsq; //!< Device array listing r_on squared per particle type pair
106 const unsigned int ntypes; //!< Number of particle types in the simulation
107 const unsigned int block_size; //!< Block size to execute
112 //! Texture for reading particle positions
113 scalar4_tex_t pdata_pos_tex;
115 #ifndef SINGLE_PRECISION
116 //! atomicAdd function for double-precision floating point numbers
117 /*! This function is only used when hoomd is compiled for double precision on the GPU.
119 \param address Address to write the double to
120 \param val Value to add to address
122 static __device__ inline double atomicAdd(double* address, double val)
124 unsigned long long int* address_as_ull = (unsigned long long int*)address;
125 unsigned long long int old = *address_as_ull, assumed;
129 old = atomicCAS(address_as_ull,
131 __double_as_longlong(val + __longlong_as_double(assumed)));
132 } while (assumed != old);
134 return __longlong_as_double(old);
138 //! Kernel for calculating the Tersoff forces
139 /*! This kernel is called to calculate the forces on all N particles. Actual evaluation of the potentials and
140 forces for each pair is handled via the template class \a evaluator.
142 \param d_force Device memory to write computed forces
143 \param N Number of particles in the system
144 \param d_pos Positions of all the particles
145 \param box Box dimensions used to implement periodic boundary conditions
146 \param d_n_neigh Device memory array listing the number of neighbors for each particle
147 \param d_nlist Device memory array containing the neighbor list contents
148 \param nli Indexer for indexing \a d_nlist
149 \param d_params Parameters for the potential, stored per type pair
150 \param d_rcutsq rcut squared, stored per type pair
151 \param d_ronsq ron squared, stored per type pair
152 \param ntypes Number of types in the simulation
154 \a d_params, \a d_rcutsq, and \a d_ronsq must be indexed with an Index2DUpperTriangler(typei, typej) to access the
155 unique value for that type pair. These values are all cached into shared memory for quick access, so a dynamic
156 amount of shared memory must be allocatd for this kernel launch. The amount is
157 (2*sizeof(Scalar) + sizeof(typename evaluator::param_type)) * typpair_idx.getNumElements()
159 Certain options are controlled via template parameters to avoid the performance hit when they are not enabled.
160 \tparam evaluator EvaluatorPair class to evualuate V(r) and -delta V(r)/r
162 <b>Implementation details</b>
163 Each block will calculate the forces on a block of particles.
164 Each thread will calculate the total force on one particle.
165 The neighborlist is arranged in columns so that reads are fully coalesced when doing this.
167 template< class evaluator >
168 __global__ void gpu_compute_triplet_forces_kernel(Scalar4 *d_force,
169 const unsigned int N,
170 const Scalar4 *d_pos,
172 const unsigned int *d_n_neigh,
173 const unsigned int *d_nlist,
175 const typename evaluator::param_type *d_params,
176 const Scalar *d_rcutsq,
177 const Scalar *d_ronsq,
178 const unsigned int ntypes)
180 Index2D typpair_idx(ntypes);
181 const unsigned int num_typ_parameters = typpair_idx.getNumElements();
183 // shared arrays for per type pair parameters
184 extern __shared__ char s_data[];
185 typename evaluator::param_type *s_params =
186 (typename evaluator::param_type *)(&s_data[0]);
187 Scalar *s_rcutsq = (Scalar *)(&s_data[num_typ_parameters*sizeof(evaluator::param_type)]);
189 // load in the per type pair parameters
190 for (unsigned int cur_offset = 0; cur_offset < num_typ_parameters; cur_offset += blockDim.x)
192 if (cur_offset + threadIdx.x < num_typ_parameters)
194 s_rcutsq[cur_offset + threadIdx.x] = d_rcutsq[cur_offset + threadIdx.x];
195 s_params[cur_offset + threadIdx.x] = d_params[cur_offset + threadIdx.x];
200 // start by identifying which particle we are to handle
201 unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
206 // load in the length of the neighbor list (MEM_TRANSFER: 4 bytes)
207 unsigned int n_neigh = d_n_neigh[idx];
209 // read in the position of the particle
210 Scalar4 postypei = texFetchScalar4(d_pos, pdata_pos_tex, idx);
211 Scalar3 posi = make_scalar3(postypei.x, postypei.y, postypei.z);
213 // initialize the force to 0
214 Scalar4 forcei = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
216 // prefetch neighbor index
217 unsigned int cur_j = 0;
218 unsigned int next_j = d_nlist[nli(idx, 0)];
220 // loop over neighbors
221 // on pre Fermi hardware, there is a bug that causes rare and random ULFs when simply looping over n_neigh
222 // the workaround (activated via the template paramter) is to loop over nlist.height and put an if (i < n_neigh)
224 #if (__CUDA_ARCH__ < 200)
225 for (int neigh_idx = 0; neigh_idx < nli.getH(); neigh_idx++)
227 for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++)
230 #if (__CUDA_ARCH__ < 200)
231 if (neigh_idx < n_neigh)
234 // read the current neighbor index (MEM TRANSFER: 4 bytes)
235 // prefetch the next value and set the current one
237 next_j = d_nlist[nli(idx, neigh_idx + 1)];
239 // read the position of j (MEM TRANSFER: 16 bytes)
240 Scalar4 postypej = texFetchScalar4(d_pos, pdata_pos_tex, cur_j);
241 Scalar3 posj = make_scalar3(postypej.x, postypej.y, postypej.z);
243 // initialize the force on j
244 Scalar4 forcej = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
246 // compute r_ij (FLOPS: 3)
247 Scalar3 dxij = posi - posj;
249 // apply periodic boundary conditions (FLOPS: 12)
250 dxij = box.minImage(dxij);
252 // compute rij_sq (FLOPS: 5)
253 Scalar rij_sq = dot(dxij, dxij);
255 // access the per type-pair parameters
256 unsigned int typpair = typpair_idx(__scalar_as_int(postypei.w), __scalar_as_int(postypej.w));
257 Scalar rcutsq = s_rcutsq[typpair];
258 typename evaluator::param_type param = s_params[typpair];
260 // compute the base repulsive and attractive terms of the potential
261 Scalar fR = Scalar(0.0);
262 Scalar fA = Scalar(0.0);
263 evaluator eval(rij_sq, rcutsq, param);
264 bool evaluatedij = eval.evalRepulsiveAndAttractive(fR, fA);
269 Scalar chi = Scalar(0.0);
270 unsigned int cur_k = 0;
271 unsigned int next_k = d_nlist[nli(idx, 0)];
272 #if (__CUDA_ARCH__ < 200)
273 for (int neigh_idy = 0; neigh_idy < nli.getH(); neigh_idy++)
275 for (int neigh_idy = 0; neigh_idy < n_neigh; neigh_idy++)
278 #if (__CUDA_ARCH__ < 200)
279 if (neigh_idy < n_neigh)
282 // read the current index of k and prefetch the next one
284 next_k = d_nlist[nli(idx, neigh_idy+1)];
286 // get the position of neighbor k
287 Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k);
288 Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z);
290 // get the type pair parameters for i and k
291 typpair = typpair_idx(__scalar_as_int(postypei.w), __scalar_as_int(postypek.w));
292 Scalar temp_rcutsq = s_rcutsq[typpair];
293 typename evaluator::param_type temp_param = s_params[typpair];
295 evaluator temp_eval(rij_sq, temp_rcutsq, temp_param);
296 bool temp_evaluated = temp_eval.areInteractive();
298 if (cur_k != cur_j && temp_evaluated)
301 Scalar3 dxik = posi - posk;
303 // apply the periodic boundary conditions
304 dxik = box.minImage(dxik);
307 Scalar rik_sq = dot(dxik, dxik);
309 // compute the bond angle (if needed)
310 Scalar cos_th = Scalar(0.0);
311 if (evaluator::needsAngle())
312 cos_th = dot(dxij, dxik) * fast::rsqrt(rij_sq * rik_sq);
313 else cos_th += Scalar(1.0); // shuts up the compiler warning
315 // set up the evaluator
317 if (evaluator::needsAngle())
318 eval.setAngle(cos_th);
320 // compute the partial chi term
325 // evaluate the force and energy from the ij interaction
326 Scalar force_divr = Scalar(0.0);
327 Scalar potential_eng = Scalar(0.0);
328 Scalar bij = Scalar(0.0);
329 eval.evalForceij(fR, fA, chi, bij, force_divr, potential_eng);
331 // add the forces and energies to their respective particles
332 Scalar2 v_coeffs = make_scalar2(Scalar(1.0 / 6.0) * rij_sq, Scalar(0.0));
333 #if (__CUDA_ARCH__ >= 200)
334 forcei.x += dxij.x * force_divr;
335 forcei.y += dxij.y * force_divr;
336 forcei.z += dxij.z * force_divr;
338 forcej.x -= dxij.x * force_divr;
339 forcej.y -= dxij.y * force_divr;
340 forcej.z -= dxij.z * force_divr;
341 forcej.w += potential_eng;
343 forcei.x += __fmul_rn(dxij.x, force_divr);
344 forcei.y += __fmul_rn(dxij.y, force_divr);
345 forcei.z += __fmul_rn(dxij.z, force_divr);
347 forcej.x += Scalar(1.0); // shuts up the compiler warning
349 forcei.w += potential_eng;
351 // now evaluate the force from the ik interactions
353 next_k = d_nlist[nli(idx, 0)];
354 #if (__CUDA_ARCH__ < 200)
355 for (int neigh_idy = 0; neigh_idy < nli.getH(); neigh_idy++)
357 for (int neigh_idy = 0; neigh_idy < n_neigh; neigh_idy++)
360 #if (__CUDA_ARCH__ < 200)
361 if (neigh_idy < n_neigh)
364 // read the current neighbor index and prefetch the next one
366 next_k = d_nlist[nli(idx, neigh_idy+1)];
368 // get the position of neighbor k
369 Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k);
370 Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z);
372 // get the type pair parameters for i and k
373 typpair = typpair_idx(__scalar_as_int(postypei.w), __scalar_as_int(postypek.w));
374 Scalar temp_rcutsq = s_rcutsq[typpair];
375 typename evaluator::param_type temp_param = s_params[typpair];
377 evaluator temp_eval(rij_sq, temp_rcutsq, temp_param);
378 bool temp_evaluated = temp_eval.areInteractive();
380 if (cur_k != cur_j && temp_evaluated)
382 Scalar4 forcek = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
385 Scalar3 dxik = posi - posk;
387 // apply the periodic boundary conditions
388 dxik = box.minImage(dxik);
391 Scalar rik_sq = dot(dxik, dxik);
393 // compute the bond angle (if needed)
394 Scalar cos_th = Scalar(0.0);
395 if (evaluator::needsAngle())
396 cos_th = dot(dxij, dxik) * fast::rsqrt(rij_sq * rik_sq);
397 else cos_th += Scalar(1.0); // shuts up the compiler warning
399 // set up the evaluator
401 if (evaluator::needsAngle())
402 eval.setAngle(cos_th);
405 Scalar3 force_divr_ij = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
406 Scalar3 force_divr_ik = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
407 bool evaluatedjk = eval.evalForceik(fR, fA, chi, bij, force_divr_ij, force_divr_ik);
411 // add the forces to their respective particles
412 v_coeffs.y = Scalar(1.0 / 6.0) * rik_sq;
413 #if (__CUDA_ARCH__ >= 200)
414 forcei.x += force_divr_ij.x * dxij.x + force_divr_ik.x * dxik.x;
415 forcei.y += force_divr_ij.x * dxij.y + force_divr_ik.x * dxik.y;
416 forcei.z += force_divr_ij.x * dxij.z + force_divr_ik.x * dxik.z;
418 forcej.x += force_divr_ij.y * dxij.x + force_divr_ik.y * dxik.x;
419 forcej.y += force_divr_ij.y * dxij.y + force_divr_ik.y * dxik.y;
420 forcej.z += force_divr_ij.y * dxij.z + force_divr_ik.y * dxik.z;
422 forcek.x += force_divr_ij.z * dxij.x + force_divr_ik.z * dxik.x;
423 forcek.y += force_divr_ij.z * dxij.y + force_divr_ik.z * dxik.y;
424 forcek.z += force_divr_ij.z * dxij.z + force_divr_ik.z * dxik.z;
426 atomicAdd(&d_force[cur_k].x, forcek.x);
427 atomicAdd(&d_force[cur_k].y, forcek.y);
428 atomicAdd(&d_force[cur_k].z, forcek.z);
430 forcei.x += __fmul_rn(dxij.x, force_divr_ij.x) + __fmul_rn(dxik.x, force_divr_ik.x);
431 forcei.y += __fmul_rn(dxij.y, force_divr_ij.x) + __fmul_rn(dxik.y, force_divr_ik.x);
432 forcei.z += __fmul_rn(dxij.z, force_divr_ij.x) + __fmul_rn(dxik.z, force_divr_ik.x);
434 forcek.x += Scalar(1.0); // shuts up the compiler warning
440 // on Fermi hardware we can use atomicAdd to gain some speed
441 // otherwise we need to loop over neighbors of j
442 #if (__CUDA_ARCH__ >= 200)
443 // potential energy of j must be halved
444 forcej.w *= Scalar(0.5);
445 // write out the result for particle j
446 atomicAdd(&d_force[cur_j].x, forcej.x);
447 atomicAdd(&d_force[cur_j].y, forcej.y);
448 atomicAdd(&d_force[cur_j].z, forcej.z);
449 atomicAdd(&d_force[cur_j].w, forcej.w);
451 // now we have to compute the force with i as a secondary/tertiary particle
452 // first consider i as the secondary particle and j as the primary particle
454 dxij.x *= -Scalar(1.0);
455 dxij.y *= -Scalar(1.0);
456 dxij.z *= -Scalar(1.0);
458 // the fR and fA already computed are still valid, so there is no point in recomputing them
460 // recompute chi by looping over neighbors of j
461 unsigned int n_neighj = d_n_neigh[cur_j];
464 next_k = d_nlist[nli(cur_j, 0)];
465 for (int neigh_idy = 0; neigh_idy < nli.getH(); neigh_idy++)
467 if (neigh_idy < n_neighj)
469 // read the index of k and prefetch the next one
471 next_k = d_nlist[nli(cur_j, neigh_idy+1)];
475 // get the position of neighbor k
476 Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k);
477 Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z);
480 Scalar3 dxjk = posj - posk;
482 // apply the periodic boundary conditions
483 dxjk = box.minImage(dxjk);
486 Scalar rjk_sq = dot(dxjk, dxjk);
488 // compute the bond angle (if needed)
489 Scalar cos_th = Scalar(0.0);
490 if (evaluator::needsAngle())
491 cos_th = dot(dxij, dxjk) * fast::rsqrt(rij_sq * rjk_sq);
492 else cos_th += Scalar(1.0); // shuts up the compiler warning
494 // set up the evaluator
496 if (evaluator::needsAngle())
497 eval.setAngle(cos_th);
504 // now compute the ji force
505 eval.evalForceij(fR, fA, chi, bij, force_divr, potential_eng);
507 // add the force and energy to particle i
508 forcei.x -= __fmul_rn(dxij.x, force_divr);
509 forcei.y -= __fmul_rn(dxij.y, force_divr);
510 forcei.z -= __fmul_rn(dxij.z, force_divr);
511 forcei.w += potential_eng;
513 // now compute the jk force
515 next_k = d_nlist[nli(cur_j, 0)];
516 for (int neigh_idy = 0; neigh_idy < nli.getH(); neigh_idy++)
518 if (neigh_idy < n_neighj)
520 // get the index of k and prefecth the next one
522 next_k = d_nlist[nli(cur_j, neigh_idy+1)];
526 // get the position of k
527 Scalar4 postypek = texFetchScalar4(d_pos, pdata_pos_tex, cur_k);
528 Scalar3 posk = make_scalar3(postypek.x, postypek.y, postypek.z);
531 Scalar3 dxjk = posj - posk;
533 // apply periodic boundary conditions
534 dxjk = box.minImage(dxjk);
537 Scalar rjk_sq = dot(dxjk, dxjk);
539 // compute the bond angle (if needed)
540 Scalar cos_th = Scalar(0.0);
541 if (evaluator::needsAngle())
542 cos_th = dot(dxij, dxjk) * fast::rsqrt(rij_sq * rjk_sq);
543 else cos_th += Scalar(1.0); // shuts up the compiler warning
545 // set up the evaluator
547 if (evaluator::needsAngle())
548 eval.setAngle(cos_th);
550 // evaluate the force
551 Scalar3 force_divr_ij = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
552 Scalar3 force_divr_jk = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
553 eval.evalForceik(fR, fA, chi, bij, force_divr_ij, force_divr_jk);
555 // add the force to particle i
556 forcei.x += __fmul_rn(dxjk.x, force_divr_jk.y) + __fmul_rn(dxij.x, force_divr_ij.y);
557 forcei.y += __fmul_rn(dxjk.y, force_divr_jk.y) + __fmul_rn(dxij.y, force_divr_ij.y);
558 forcei.z += __fmul_rn(dxjk.z, force_divr_jk.y) + __fmul_rn(dxij.z, force_divr_ij.y);
566 // potential energy per particle must be halved
567 forcei.w *= Scalar(0.5);
568 // now that the force calculation is complete, write out the result (MEM TRANSFER: 20 bytes)
569 #if (__CUDA_ARCH__ >= 200)
570 atomicAdd(&d_force[idx].x, forcei.x);
571 atomicAdd(&d_force[idx].y, forcei.y);
572 atomicAdd(&d_force[idx].z, forcei.z);
573 atomicAdd(&d_force[idx].w, forcei.w);
575 d_force[idx] = forcei;
579 //! Kernel for zeroing forces before computation with atomic additions.
580 /*! \param d_force Device memory to write forces to
581 \param N Number of particles in the system
584 __global__ void gpu_zero_forces_kernel(Scalar4 *d_force,
585 const unsigned int N)
587 // identify the particle we are supposed to handle
588 unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
594 d_force[idx] = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
597 //! Kernel driver that computes the three-body forces
598 /*! \param pair_args Other arugments to pass onto the kernel
599 \param d_params Parameters for the potential, stored per type pair
601 This is just a driver function for gpu_compute_triplet_forces_kernel(), see it for details.
603 template< class evaluator >
604 cudaError_t gpu_compute_triplet_forces(const tersoff_args_t& pair_args,
605 const typename evaluator::param_type *d_params)
608 assert(pair_args.d_rcutsq);
609 assert(pair_args.d_ronsq);
610 assert(pair_args.ntypes > 0);
612 static unsigned int max_block_size = UINT_MAX;
613 if (max_block_size == UINT_MAX)
615 cudaFuncAttributes attr;
616 cudaFuncGetAttributes(&attr, gpu_compute_triplet_forces_kernel<evaluator>);
617 max_block_size = attr.maxThreadsPerBlock;
620 unsigned int run_block_size = min(pair_args.block_size, max_block_size);
622 // setup the grid to run the kernel
623 dim3 grid( pair_args.N / run_block_size + 1, 1, 1);
624 dim3 threads(run_block_size, 1, 1);
626 // bind the position texture
627 pdata_pos_tex.normalized = false;
628 pdata_pos_tex.filterMode = cudaFilterModePoint;
629 cudaError_t error = cudaBindTexture(0, pdata_pos_tex, pair_args.d_pos, sizeof(Scalar4) * pair_args.N);
630 if (error != cudaSuccess)
633 Index2D typpair_idx(pair_args.ntypes);
634 unsigned int shared_bytes = (2*sizeof(Scalar) + sizeof(typename evaluator::param_type))
635 * typpair_idx.getNumElements();
638 gpu_zero_forces_kernel<<<grid, threads, shared_bytes>>>(pair_args.d_force,
641 // compute the new forces
642 gpu_compute_triplet_forces_kernel<evaluator>
643 <<<grid, threads, shared_bytes>>>(pair_args.d_force,
658 #endif // __POTENTIAL_TERSOFF_GPU_CUH__