/*
Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
(HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
the University of Michigan All rights reserved.

HOOMD-blue may contain modifications ("Contributions") provided, and to which
copyright is held, by various Contributors who have granted The Regents of the
University of Michigan the right to modify and/or distribute such Contributions.

You may redistribute, use, and create derivate works of HOOMD-blue, in source
and binary forms, provided you abide by the following conditions:

* Redistributions of source code must retain the above copyright notice, this
list of conditions, and the following disclaimer both in the code and
prominently in any materials provided with the distribution.

* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions, and the following disclaimer in the documentation and/or
other materials provided with the distribution.

* All publications and presentations based on HOOMD-blue, including any reports
or published results obtained, in whole or in part, with HOOMD-blue, will
acknowledge its use according to the terms posted at the time of submission on:
http://codeblue.umich.edu/hoomd-blue/citations.html

* Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
http://codeblue.umich.edu/hoomd-blue/

* Apart from the above required attributions, neither the name of the copyright
holder nor the names of HOOMD-blue's contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.

Disclaimer

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.

IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// Maintainer: askeys

#include "FIREEnergyMinimizerGPU.cuh"
#include "TextureTools.h"

#ifdef WIN32
#include <cassert>
#else
#include <assert.h>
#endif

#include <stdio.h>

/*! \file FIREEnergyMinimizerGPU.cu
    \brief Defines GPU kernel code for performing one FIRE energy
    minimization iteration on the GPU. Used by FIREEnergyMinimizerGPU.
*/
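
// Overview (annotation, not part of the original source): FIRE, the Fast
// Inertial Relaxation Engine, minimizes the energy by running MD while
// steering the velocities toward the downhill direction. The kernels in this
// file provide the GPU building blocks used by FIREEnergyMinimizerGPU:
// reductions of the potential energy, of P = sum_i a_i . v_i, and of |v|^2 and
// |a|^2 over the integrated group, plus the per-particle velocity mixing step
// v <- (1-alpha)*v + alpha*|v|*fhat (see gpu_fire_update_v_kernel()). The
// adaptive update of alpha and of the timestep happens on the host and is not
// part of this file.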

//! Shared memory used in reducing sums
extern __shared__ Scalar fire_sdata[];
//! Shared memory used in simultaneously reducing three sums (currently unused)
extern __shared__ Scalar fire_sdata1[];
//! Shared memory used in simultaneously reducing three sums (currently unused)
extern __shared__ Scalar fire_sdata2[];
//! Shared memory used in simultaneously reducing three sums (currently unused)
extern __shared__ Scalar fire_sdata3[];
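
// NOTE (annotation): in CUDA, all `extern __shared__` declarations in a kernel
// refer to the start of the same dynamically allocated shared memory, so
// fire_sdata1, fire_sdata2 and fire_sdata3 above alias one another. The
// three-sum kernels that use them are currently unused; making them work would
// require carving three block_size-sized regions out of a single extern array
// and launching with 3*block_size*sizeof(Scalar) bytes of dynamic shared memory.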

//! Kernel function that zeros velocities, called by gpu_fire_zero_v()
/*! \param d_vel device array of particle velocities
    \param d_group_members Device array listing the indices of the members of the group to zero
    \param group_size Number of members in the group
*/
extern "C" __global__
void gpu_fire_zero_v_kernel(Scalar4 *d_vel,
                            unsigned int *d_group_members,
                            unsigned int group_size)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        // read the particle's velocity (MEM TRANSFER: 32 bytes)
        Scalar4 vel = d_vel[idx];

        // zero the velocity (FLOPS: ?)
        vel.x = Scalar(0.0);
        vel.y = Scalar(0.0);
        vel.z = Scalar(0.0);

        // write out the results (MEM_TRANSFER: 32 bytes)
        d_vel[idx] = vel;
        }
    }

/*! \param d_vel device array of particle velocities
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group

    This function is just the driver for gpu_fire_zero_v_kernel(), see that function
    for details.
*/
cudaError_t gpu_fire_zero_v(Scalar4 *d_vel,
                            unsigned int *d_group_members,
                            unsigned int group_size)
    {
    // setup the grid to run the kernel
    int block_size = 256;
    dim3 grid( (group_size/block_size) + 1, 1, 1);
    dim3 threads(block_size, 1, 1);

    // run the kernel
    gpu_fire_zero_v_kernel<<< grid, threads >>>(d_vel,
                                                d_group_members,
                                                group_size);

    return cudaSuccess;
    }
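
// NOTE (annotation): the self-sizing drivers in this file launch
// (group_size/block_size) + 1 blocks, which always covers the group (with one
// extra, mostly idle block whenever group_size is an exact multiple of
// block_size); surplus threads fail the group_idx < group_size test in the
// kernels and simply do nothing.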

//! Kernel function for reducing the potential energy to a partial sum
/*! \param d_group_members Device array listing the indices of the members of the group to sum
    \param group_size Number of members in the group
    \param d_net_force Pointer to the force array for all particles
    \param d_partial_sum_pe Placeholder for the partial sum
*/
extern "C" __global__
    void gpu_fire_reduce_pe_partial_kernel(unsigned int *d_group_members,
                                           unsigned int group_size,
                                           Scalar4* d_net_force,
                                           Scalar* d_partial_sum_pe)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar pe = 0;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        // read the particle's force and extract the pe from the w component (MEM TRANSFER: 32 bytes)
        Scalar4 force = d_net_force[idx];
        pe = force.w;

        // uncoalesced memory read replaced by texture read above; Scalar4* d_net_force is still being
        // passed to support this defunct structure
        //pe = d_net_force[idx].w;
        }

    fire_sdata[threadIdx.x] = pe;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        {
        d_partial_sum_pe[blockIdx.x] = fire_sdata[0];
        }
    }
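
// NOTE (annotation): the tree reduction above (and the identical ones in the
// kernels below) starts at offs = blockDim.x >> 1 and halves it each pass, so
// it only produces a correct sum when the block size is a power of two; the
// block sizes used by the drivers in this file are assumed to satisfy this.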

//! Kernel function for reducing a partial sum to a full sum (one value)
/*! \param d_sum Placeholder for the sum
    \param d_partial_sum Array containing the partial sum
    \param num_blocks Number of blocks to execute
*/
extern "C" __global__
    void gpu_fire_reduce_partial_sum_kernel(Scalar *d_sum,
                                            Scalar* d_partial_sum,
                                            unsigned int num_blocks)
    {
    Scalar sum = Scalar(0.0);

    // sum up the values in the partial sum via a sliding window
    for (int start = 0; start < num_blocks; start += blockDim.x)
        {
        __syncthreads();
        if (start + threadIdx.x < num_blocks)
            fire_sdata[threadIdx.x] = d_partial_sum[start + threadIdx.x];
        else
            fire_sdata[threadIdx.x] = Scalar(0.0);
        __syncthreads();

        // reduce the sum in parallel
        int offs = blockDim.x >> 1;
        while (offs > 0)
            {
            if (threadIdx.x < offs)
                fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
            offs >>= 1;
            __syncthreads();
            }

        // every thread accumulates the total of this window
        sum += fire_sdata[0];
        }

    if (threadIdx.x == 0)
        *d_sum = sum;
    }
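
// NOTE (annotation): this kernel is written to reduce the num_blocks partial
// sums with a single thread block, sweeping over the array in blockDim.x-sized
// windows; after the loop every thread holds the full total and thread 0
// writes it out. When a driver launches it with more than one block (as
// gpu_fire_compute_sum_pe() below does), each block redundantly computes and
// writes the same value, which is wasteful but harmless.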

/*! \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param d_net_force Array containing the net forces
    \param d_sum_pe Placeholder for the sum of the PE
    \param d_partial_sum_pe Array containing the partial sum of the PE
    \param block_size The size of one block
    \param num_blocks Number of blocks to execute

    This is a driver for gpu_fire_reduce_pe_partial_kernel() and
    gpu_fire_reduce_partial_sum_kernel(), see them for details.
*/
cudaError_t gpu_fire_compute_sum_pe(unsigned int *d_group_members,
                                    unsigned int group_size,
                                    Scalar4* d_net_force,
                                    Scalar* d_sum_pe,
                                    Scalar* d_partial_sum_pe,
                                    unsigned int block_size,
                                    unsigned int num_blocks)
    {
    // setup the grid to run the kernel
    dim3 grid(num_blocks, 1, 1);
    dim3 threads(block_size, 1, 1);

    // run the kernels
    gpu_fire_reduce_pe_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_group_members,
                                                                                      group_size,
                                                                                      d_net_force,
                                                                                      d_partial_sum_pe);

    gpu_fire_reduce_partial_sum_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_sum_pe,
                                                                                       d_partial_sum_pe,
                                                                                       num_blocks);

    return cudaSuccess;
    }
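
// Usage sketch (annotation; the variable names below are illustrative, not
// part of the original source). The caller provides one partial-sum slot per
// block and chooses num_blocks so that num_blocks*block_size covers the group:
//
//     unsigned int block_size = 256;                          // power of two, see the reduction note above
//     unsigned int num_blocks = group_size/block_size + 1;    // one partial sum per block
//     // d_partial_sum_pe: device buffer of num_blocks Scalars
//     // d_sum_pe:         device buffer of one Scalar
//     gpu_fire_compute_sum_pe(d_group_members, group_size, d_net_force,
//                             d_sum_pe, d_partial_sum_pe, block_size, num_blocks);
//     Scalar pe_total;
//     cudaMemcpy(&pe_total, d_sum_pe, sizeof(Scalar), cudaMemcpyDeviceToHost);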

//! Kernel function to compute the partial sum over the P term in the FIRE algorithm
/*! \param d_vel particle velocities and masses on the device
    \param d_accel particle accelerations on the device
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param d_partial_sum_P Array to hold the partial sum
*/
extern "C" __global__
    void gpu_fire_reduce_P_partial_kernel(const Scalar4 *d_vel,
                                          const Scalar3 *d_accel,
                                          unsigned int *d_group_members,
                                          unsigned int group_size,
                                          Scalar* d_partial_sum_P)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar P = 0;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        Scalar3 a = d_accel[idx];
        Scalar4 v = d_vel[idx];
        P = a.x*v.x + a.y*v.y + a.z*v.z;
        }

    fire_sdata[threadIdx.x] = P;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        d_partial_sum_P[blockIdx.x] = fire_sdata[0];
    }
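
// NOTE (annotation): the quantity accumulated here is sum_i a_i . v_i, i.e.
// acceleration rather than force dotted with velocity. In the FIRE papers the
// steering power is written P = F . v; since a = F/m the two differ only by
// per-particle mass factors, which for positive masses do not change the sign
// of P that the FIRE criterion tests against zero.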

//! Kernel function to compute the partial sum over the vsq term in the FIRE algorithm
/*! \param d_vel device array of particle velocities
    \param d_group_members Array listing members of the group
    \param group_size Number of members in the group
    \param d_partial_sum_vsq Array to hold the partial sum
*/
extern "C" __global__
    void gpu_fire_reduce_vsq_partial_kernel(const Scalar4 *d_vel,
                                            unsigned int *d_group_members,
                                            unsigned int group_size,
                                            Scalar* d_partial_sum_vsq)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar vsq = 0;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        Scalar4 v = d_vel[idx];
        vsq = v.x*v.x + v.y*v.y + v.z*v.z;
        }

    fire_sdata[threadIdx.x] = vsq;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        d_partial_sum_vsq[blockIdx.x] = fire_sdata[0];
    }

//! Kernel function to compute the partial sum over the asq term in the FIRE algorithm
/*! \param d_accel device array of particle accelerations
    \param d_group_members Array listing members of the group
    \param group_size Number of members in the group
    \param d_partial_sum_asq Array to hold the partial sum
*/
extern "C" __global__
    void gpu_fire_reduce_asq_partial_kernel(const Scalar3 *d_accel,
                                            unsigned int *d_group_members,
                                            unsigned int group_size,
                                            Scalar* d_partial_sum_asq)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar asq = 0;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        Scalar3 a = d_accel[idx];
        asq = a.x*a.x + a.y*a.y + a.z*a.z;
        }

    fire_sdata[threadIdx.x] = asq;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        d_partial_sum_asq[blockIdx.x] = fire_sdata[0];
    }

//! Kernel function to simultaneously compute the partial sum over P, vsq and asq for the FIRE algorithm
/*! \param d_vel device array of particle velocities
    \param d_accel device array of particle accelerations
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param d_partial_sum_P Array to hold the partial sum over P (v*a)
    \param d_partial_sum_vsq Array to hold the partial sum over vsq (v*v)
    \param d_partial_sum_asq Array to hold the partial sum over asq (a*a)
    \note this function is never used, but could be implemented to improve performance
*/
extern "C" __global__
    void gpu_fire_reduce_all_partial_kernel(const Scalar4 *d_vel,
                                            const Scalar3 *d_accel,
                                            unsigned int *d_group_members,
                                            unsigned int group_size,
                                            Scalar* d_partial_sum_P,
                                            Scalar* d_partial_sum_vsq,
                                            Scalar* d_partial_sum_asq)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar P, vsq, asq;
    P = 0;
    vsq = 0;
    asq = 0;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        Scalar3 a = d_accel[idx];
        Scalar4 v = d_vel[idx];
        P = a.x*v.x + a.y*v.y + a.z*v.z;
        vsq = v.x*v.x + v.y*v.y + v.z*v.z;
        asq = a.x*a.x + a.y*a.y + a.z*a.z;
        }

    fire_sdata1[threadIdx.x] = P;
    fire_sdata2[threadIdx.x] = vsq;
    fire_sdata3[threadIdx.x] = asq;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            {
            fire_sdata1[threadIdx.x] += fire_sdata1[threadIdx.x + offs];
            fire_sdata2[threadIdx.x] += fire_sdata2[threadIdx.x + offs];
            fire_sdata3[threadIdx.x] += fire_sdata3[threadIdx.x + offs];
            }
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        {
        d_partial_sum_P[blockIdx.x] = fire_sdata1[0];
        d_partial_sum_vsq[blockIdx.x] = fire_sdata2[0];
        d_partial_sum_asq[blockIdx.x] = fire_sdata3[0];
        }
    }

//! Kernel function to simultaneously reduce three partial sums at the same time
/*! \param d_sum Array to hold the sums
    \param d_partial_sum1 Array containing a precomputed partial sum
    \param d_partial_sum2 Array containing a precomputed partial sum
    \param d_partial_sum3 Array containing a precomputed partial sum
    \param num_blocks The number of blocks to execute
    \note this function is never used, but could be implemented to improve performance
*/
extern "C" __global__
    void gpu_fire_reduce_partial_sum_3_kernel(Scalar *d_sum,
                                              Scalar* d_partial_sum1,
                                              Scalar* d_partial_sum2,
                                              Scalar* d_partial_sum3,
                                              unsigned int num_blocks)
    {
    Scalar sum1 = Scalar(0.0);
    Scalar sum2 = Scalar(0.0);
    Scalar sum3 = Scalar(0.0);

    // sum up the values in the partial sum via a sliding window
    for (int start = 0; start < num_blocks; start += blockDim.x)
        {
        __syncthreads();
        if (start + threadIdx.x < num_blocks)
            {
            fire_sdata1[threadIdx.x] = d_partial_sum1[start + threadIdx.x];
            fire_sdata2[threadIdx.x] = d_partial_sum2[start + threadIdx.x];
            fire_sdata3[threadIdx.x] = d_partial_sum3[start + threadIdx.x];
            }
        else
            {
            fire_sdata1[threadIdx.x] = Scalar(0.0);
            fire_sdata2[threadIdx.x] = Scalar(0.0);
            fire_sdata3[threadIdx.x] = Scalar(0.0);
            }
        __syncthreads();

        // reduce the sum in parallel
        int offs = blockDim.x >> 1;
        while (offs > 0)
            {
            if (threadIdx.x < offs)
                {
                fire_sdata1[threadIdx.x] += fire_sdata1[threadIdx.x + offs];
                fire_sdata2[threadIdx.x] += fire_sdata2[threadIdx.x + offs];
                fire_sdata3[threadIdx.x] += fire_sdata3[threadIdx.x + offs];
                }
            offs >>= 1;
            __syncthreads();
            }

        sum1 += fire_sdata1[0];
        sum2 += fire_sdata2[0];
        sum3 += fire_sdata3[0];
        }

    if (threadIdx.x == 0)
        {
        d_sum[0] = sum1;
        d_sum[1] = sum2;
        d_sum[2] = sum3;
        }
    }
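
// NOTE (annotation): the two kernels above are currently dead code kept as a
// starting point for fusing the three reductions into a single pass. Besides
// wiring them into gpu_fire_compute_sum_all() below, enabling them would
// require resolving the shared memory aliasing of fire_sdata1/2/3 noted near
// the top of this file and reserving 3*block_size*sizeof(Scalar) bytes of
// dynamic shared memory per block.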

/*! \param N number of particles in system
    \param d_vel array of particle velocities
    \param d_accel array of particle accelerations
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param d_sum_all Array to hold the sum over P, vsq, and asq
    \param d_partial_sum_P Array to hold the partial sum over P (a*v)
    \param d_partial_sum_vsq Array to hold the partial sum over vsq (v*v)
    \param d_partial_sum_asq Array to hold the partial sum over asq (a*a)
    \param block_size is the size of one block
    \param num_blocks is the number of blocks to execute
    \note Currently the sums are performed consecutively. The efficiency of this
        function could be improved by computing all three sums simultaneously.

    This is a driver for gpu_fire_reduce_{X}_partial_kernel() (where X = P, vsq, asq)
    and gpu_fire_reduce_partial_sum_kernel(), see them for details.
*/
cudaError_t gpu_fire_compute_sum_all(
                                    const unsigned int N,
                                    const Scalar4 *d_vel,
                                    const Scalar3 *d_accel,
                                    unsigned int *d_group_members,
                                    unsigned int group_size,
                                    Scalar* d_sum_all,
                                    Scalar* d_partial_sum_P,
                                    Scalar* d_partial_sum_vsq,
                                    Scalar* d_partial_sum_asq,
                                    unsigned int block_size,
                                    unsigned int num_blocks)
    {
    // setup the grid to run the kernel
    dim3 grid(num_blocks, 1, 1);
    dim3 grid1(1, 1, 1);
    dim3 threads(block_size, 1, 1);
    dim3 threads1(256, 1, 1);

    // run the kernels
    gpu_fire_reduce_P_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_vel,
                                                                                     d_accel,
                                                                                     d_group_members,
                                                                                     group_size,
                                                                                     d_partial_sum_P);

    gpu_fire_reduce_partial_sum_kernel<<< grid1, threads1, block_size*sizeof(Scalar) >>>(&d_sum_all[0],
                                                                                         d_partial_sum_P,
                                                                                         num_blocks);

    gpu_fire_reduce_vsq_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_vel,
                                                                                       d_group_members,
                                                                                       group_size,
                                                                                       d_partial_sum_vsq);

    gpu_fire_reduce_partial_sum_kernel<<< grid1, threads1, block_size*sizeof(Scalar) >>>(&d_sum_all[1],
                                                                                         d_partial_sum_vsq,
                                                                                         num_blocks);

    gpu_fire_reduce_asq_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_accel,
                                                                                       d_group_members,
                                                                                       group_size,
                                                                                       d_partial_sum_asq);

    gpu_fire_reduce_partial_sum_kernel<<< grid1, threads1, block_size*sizeof(Scalar) >>>(&d_sum_all[2],
                                                                                         d_partial_sum_asq,
                                                                                         num_blocks);

    /*
    // do all three sums at once:
    gpu_fire_reduce_all_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_vel, d_accel, d_partial_sum_P, d_partial_sum_vsq, d_partial_sum_asq);
    gpu_fire_reduce_partial_sum_3_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_sum_all, d_partial_sum_P, d_partial_sum_vsq, d_partial_sum_asq, num_blocks);
    */

    return cudaSuccess;
    }
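
// Usage sketch (annotation; names are illustrative, not part of the original
// source). d_sum_all receives three Scalars: [0] = sum a.v, [1] = sum v.v and
// [2] = sum a.a over the group. A host caller would typically copy them back
// and derive the inputs that gpu_fire_update_v() expects, roughly:
//
//     Scalar h_sum[3];
//     cudaMemcpy(h_sum, d_sum_all, 3*sizeof(Scalar), cudaMemcpyDeviceToHost);
//     Scalar P        = h_sum[0];
//     Scalar vnorm    = sqrt(h_sum[1]);                  // |v| over the group
//     Scalar invfnorm = Scalar(1.0)/sqrt(h_sum[2]);      // 1/|a|, used in place of 1/|F|
//     if (P > Scalar(0.0))
//         gpu_fire_update_v(d_vel, d_accel, d_group_members, group_size,
//                           alpha, vnorm, invfnorm);     // alpha maintained by the host minimizer
//     else
//         gpu_fire_zero_v(d_vel, d_group_members, group_size);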

//! Kernel function to update the velocities used by the FIRE algorithm
/*! \param d_vel Array of velocities to update
    \param d_accel Array of accelerations
    \param d_group_members Device array listing the indices of the members of the group to update
    \param group_size Number of members in the group
    \param alpha Alpha coupling parameter used by the FIRE algorithm
    \param vnorm Magnitude of the (3*N) dimensional velocity vector
    \param invfnorm 1 over the magnitude of the (3*N) dimensional force vector
*/
extern "C" __global__
    void gpu_fire_update_v_kernel(Scalar4 *d_vel,
                                  const Scalar3 *d_accel,
                                  unsigned int *d_group_members,
                                  unsigned int group_size,
                                  Scalar alpha,
                                  Scalar vnorm,
                                  Scalar invfnorm)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        // read the particle's velocity and acceleration (MEM TRANSFER: 32 bytes)
        Scalar4 v = d_vel[idx];
        Scalar3 a = d_accel[idx];

        v.x = v.x*(Scalar(1.0)-alpha) + alpha*a.x*invfnorm*vnorm;
        v.y = v.y*(Scalar(1.0)-alpha) + alpha*a.y*invfnorm*vnorm;
        v.z = v.z*(Scalar(1.0)-alpha) + alpha*a.z*invfnorm*vnorm;

        // write out the results (MEM_TRANSFER: 32 bytes)
        d_vel[idx] = v;
        }
    }
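
// NOTE (annotation): per component the update above implements the FIRE
// velocity mixing v <- (1-alpha)*v + alpha*(|v|/|F|)*F, with the acceleration
// standing in for the force direction; vnorm = |v| and invfnorm = 1/|F| are
// norms over the whole (3*N)-dimensional group vectors, not per-particle
// quantities.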

/*! \param d_vel array of particle velocities to update
    \param d_accel array of particle accelerations
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param alpha Alpha coupling parameter used by the FIRE algorithm
    \param vnorm Magnitude of the (3*N) dimensional velocity vector
    \param invfnorm 1 over the magnitude of the (3*N) dimensional force vector

    This function is a driver for gpu_fire_update_v_kernel(), see it for details.
*/
cudaError_t gpu_fire_update_v(Scalar4 *d_vel,
                              const Scalar3 *d_accel,
                              unsigned int *d_group_members,
                              unsigned int group_size,
                              Scalar alpha,
                              Scalar vnorm,
                              Scalar invfnorm)
    {
    // setup the grid to run the kernel
    int block_size = 256;
    dim3 grid( (group_size/block_size) + 1, 1, 1);
    dim3 threads(block_size, 1, 1);

    // run the kernel
    gpu_fire_update_v_kernel<<< grid, threads >>>(d_vel,
                                                  d_accel,
                                                  d_group_members,
                                                  group_size,
                                                  alpha,
                                                  vnorm,
                                                  invfnorm);

    return cudaSuccess;
    }