/*
Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
(HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
the University of Michigan All rights reserved.

HOOMD-blue may contain modifications ("Contributions") provided, and to which
copyright is held, by various Contributors who have granted The Regents of the
University of Michigan the right to modify and/or distribute such Contributions.

You may redistribute, use, and create derivate works of HOOMD-blue, in source
and binary forms, provided you abide by the following conditions:

* Redistributions of source code must retain the above copyright notice, this
list of conditions, and the following disclaimer both in the code and
prominently in any materials provided with the distribution.

* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions, and the following disclaimer in the documentation and/or
other materials provided with the distribution.

* All publications and presentations based on HOOMD-blue, including any reports
or published results obtained, in whole or in part, with HOOMD-blue, will
acknowledge its use according to the terms posted at the time of submission on:
http://codeblue.umich.edu/hoomd-blue/citations.html

* Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
http://codeblue.umich.edu/hoomd-blue/

* Apart from the above required attributions, neither the name of the copyright
holder nor the names of HOOMD-blue's contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.

Disclaimer

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.

IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// Maintainer: askeys

#include "FIREEnergyMinimizerGPU.cuh"
#include "TextureTools.h"

#ifdef WIN32
#include <cassert>
#else
#include <assert.h>
#endif

#include <stdio.h>

/*! \file FIREEnergyMinimizerGPU.cu
    \brief Defines GPU kernel code for performing one FIRE energy
    minimization iteration on the GPU. Used by FIREEnergyMinimizerGPU.
*/
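
// Overview (annotation, not part of the original source): FIRE, the Fast
// Inertial Relaxation Engine, minimizes the energy by running MD while
// steering the velocities toward the downhill direction. The kernels in this
// file provide the GPU building blocks used by FIREEnergyMinimizerGPU:
// reductions of the potential energy, of P = sum_i a_i . v_i, and of |v|^2 and
// |a|^2 over the integrated group, plus the per-particle velocity mixing step
// v <- (1-alpha)*v + alpha*|v|*fhat (see gpu_fire_update_v_kernel()). The
// adaptive update of alpha and of the timestep happens on the host and is not
// part of this file.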

//! Shared memory used in reducing sums
extern __shared__ Scalar fire_sdata[];
//! Shared memory used in simultaneously reducing three sums (currently unused)
extern __shared__ Scalar fire_sdata1[];
//! Shared memory used in simultaneously reducing three sums (currently unused)
extern __shared__ Scalar fire_sdata2[];
//! Shared memory used in simultaneously reducing three sums (currently unused)
extern __shared__ Scalar fire_sdata3[];
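
// NOTE (annotation): in CUDA, all `extern __shared__` declarations in a kernel
// refer to the start of the same dynamically allocated shared memory, so
// fire_sdata1, fire_sdata2 and fire_sdata3 above alias one another. The
// three-sum kernels that use them are currently unused; making them work would
// require carving three block_size-sized regions out of a single extern array
// and launching with 3*block_size*sizeof(Scalar) bytes of dynamic shared memory.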

//! Kernel function that zeros velocities, called by gpu_fire_zero_v()
/*! \param d_vel device array of particle velocities
    \param d_group_members Device array listing the indices of the members of the group to zero
    \param group_size Number of members in the group
*/
extern "C" __global__
void gpu_fire_zero_v_kernel(Scalar4 *d_vel,
                            unsigned int *d_group_members,
                            unsigned int group_size)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        // read the particle's velocity (MEM TRANSFER: 32 bytes)
        Scalar4 vel = d_vel[idx];

        // zero the velocity (FLOPS: ?)
        vel.x = Scalar(0.0);
        vel.y = Scalar(0.0);
        vel.z = Scalar(0.0);

        // write out the results (MEM_TRANSFER: 32 bytes)
        d_vel[idx] = vel;
        }
    }

/*! \param d_vel device array of particle velocities
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group

    This function is just the driver for gpu_fire_zero_v_kernel(), see that function
    for details.
*/
cudaError_t gpu_fire_zero_v(Scalar4 *d_vel,
                            unsigned int *d_group_members,
                            unsigned int group_size)
    {
    // setup the grid to run the kernel
    int block_size = 256;
    dim3 grid( (group_size/block_size) + 1, 1, 1);
    dim3 threads(block_size, 1, 1);

    // run the kernel
    gpu_fire_zero_v_kernel<<< grid, threads >>>(d_vel,
                                                d_group_members,
                                                group_size);

    return cudaSuccess;
    }
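
// NOTE (annotation): the self-sizing drivers in this file launch
// (group_size/block_size) + 1 blocks, which always covers the group (with one
// extra, mostly idle block whenever group_size is an exact multiple of
// block_size); surplus threads fail the group_idx < group_size test in the
// kernels and simply do nothing.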

//! Kernel function for reducing the potential energy to a partial sum
/*! \param d_group_members Device array listing the indices of the members of the group to sum
    \param group_size Number of members in the group
    \param d_net_force Pointer to the force array for all particles
    \param d_partial_sum_pe Placeholder for the partial sum
*/
extern "C" __global__
    void gpu_fire_reduce_pe_partial_kernel(unsigned int *d_group_members,
                                           unsigned int group_size,
                                           Scalar4* d_net_force,
                                           Scalar* d_partial_sum_pe)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar pe = 0;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        // read the particle's force and extract the pe from the w component (MEM TRANSFER: 32 bytes)
        Scalar4 force = d_net_force[idx];
        pe = force.w;

        // uncoalesced memory read replaced by texture read above; Scalar4* d_net_force is still being
        // passed to support this defunct structure
        //pe = d_net_force[idx].w;
        }

    fire_sdata[threadIdx.x] = pe;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        {
        d_partial_sum_pe[blockIdx.x] = fire_sdata[0];
        }
    }
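
// NOTE (annotation): the tree reduction above (and the identical ones in the
// kernels below) starts at offs = blockDim.x >> 1 and halves it each pass, so
// it only produces a correct sum when the block size is a power of two; the
// block sizes used by the drivers in this file are assumed to satisfy this.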

//! Kernel function for reducing a partial sum to a full sum (one value)
/*! \param d_sum Placeholder for the sum
    \param d_partial_sum Array containing the partial sum
    \param num_blocks Number of blocks to execute
*/
extern "C" __global__
    void gpu_fire_reduce_partial_sum_kernel(Scalar *d_sum,
                                            Scalar* d_partial_sum,
                                            unsigned int num_blocks)
    {
    Scalar sum = Scalar(0.0);

    // sum up the values in the partial sum via a sliding window
    for (int start = 0; start < num_blocks; start += blockDim.x)
        {
        __syncthreads();
        if (start + threadIdx.x < num_blocks)
            fire_sdata[threadIdx.x] = d_partial_sum[start + threadIdx.x];
        else
            fire_sdata[threadIdx.x] = Scalar(0.0);
        __syncthreads();

        // reduce the sum in parallel
        int offs = blockDim.x >> 1;
        while (offs > 0)
            {
            if (threadIdx.x < offs)
                fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
            offs >>= 1;
            __syncthreads();
            }

        // every thread accumulates the total of this window
        sum += fire_sdata[0];
        }

    if (threadIdx.x == 0)
        *d_sum = sum;
    }
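
// NOTE (annotation): this kernel is written to reduce the num_blocks partial
// sums with a single thread block, sweeping over the array in blockDim.x-sized
// windows; after the loop every thread holds the full total and thread 0
// writes it out. When a driver launches it with more than one block (as
// gpu_fire_compute_sum_pe() below does), each block redundantly computes and
// writes the same value, which is wasteful but harmless.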

/*! \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param d_net_force Array containing the net forces
    \param d_sum_pe Placeholder for the sum of the PE
    \param d_partial_sum_pe Array containing the partial sum of the PE
    \param block_size The size of one block
    \param num_blocks Number of blocks to execute

    This is a driver for gpu_fire_reduce_pe_partial_kernel() and
    gpu_fire_reduce_partial_sum_kernel(), see them for details.
*/
cudaError_t gpu_fire_compute_sum_pe(unsigned int *d_group_members,
                                    unsigned int group_size,
                                    Scalar4* d_net_force,
                                    Scalar* d_sum_pe,
                                    Scalar* d_partial_sum_pe,
                                    unsigned int block_size,
                                    unsigned int num_blocks)
    {
    // setup the grid to run the kernel
    dim3 grid(num_blocks, 1, 1);
    dim3 threads(block_size, 1, 1);

    // run the kernels
    gpu_fire_reduce_pe_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_group_members,
                                                                                      group_size,
                                                                                      d_net_force,
                                                                                      d_partial_sum_pe);

    gpu_fire_reduce_partial_sum_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_sum_pe,
                                                                                       d_partial_sum_pe,
                                                                                       num_blocks);

    return cudaSuccess;
    }
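
// Usage sketch (annotation; the variable names below are illustrative, not
// part of the original source). The caller provides one partial-sum slot per
// block and chooses num_blocks so that num_blocks*block_size covers the group:
//
//     unsigned int block_size = 256;                          // power of two, see the reduction note above
//     unsigned int num_blocks = group_size/block_size + 1;    // one partial sum per block
//     // d_partial_sum_pe: device buffer of num_blocks Scalars
//     // d_sum_pe:         device buffer of one Scalar
//     gpu_fire_compute_sum_pe(d_group_members, group_size, d_net_force,
//                             d_sum_pe, d_partial_sum_pe, block_size, num_blocks);
//     Scalar pe_total;
//     cudaMemcpy(&pe_total, d_sum_pe, sizeof(Scalar), cudaMemcpyDeviceToHost);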

//! Kernel function to compute the partial sum over the P term in the FIRE algorithm
/*! \param d_vel particle velocities and masses on the device
    \param d_accel particle accelerations on the device
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param d_partial_sum_P Array to hold the partial sum
*/
extern "C" __global__
    void gpu_fire_reduce_P_partial_kernel(const Scalar4 *d_vel,
                                          const Scalar3 *d_accel,
                                          unsigned int *d_group_members,
                                          unsigned int group_size,
                                          Scalar* d_partial_sum_P)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar P = 0;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        Scalar3 a = d_accel[idx];
        Scalar4 v = d_vel[idx];
        P = a.x*v.x + a.y*v.y + a.z*v.z;
        }

    fire_sdata[threadIdx.x] = P;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        d_partial_sum_P[blockIdx.x] = fire_sdata[0];
    }
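
// NOTE (annotation): the quantity accumulated here is sum_i a_i . v_i, i.e.
// acceleration rather than force dotted with velocity. In the FIRE papers the
// steering power is written P = F . v; since a = F/m the two differ only by
// per-particle mass factors, which for positive masses do not change the sign
// of P that the FIRE criterion tests against zero.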

//! Kernel function to compute the partial sum over the vsq term in the FIRE algorithm
/*! \param d_vel device array of particle velocities
    \param d_group_members Array listing members of the group
    \param group_size Number of members in the group
    \param d_partial_sum_vsq Array to hold the partial sum
*/
extern "C" __global__
    void gpu_fire_reduce_vsq_partial_kernel(const Scalar4 *d_vel,
                                            unsigned int *d_group_members,
                                            unsigned int group_size,
                                            Scalar* d_partial_sum_vsq)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar vsq = 0;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        Scalar4 v = d_vel[idx];
        vsq = v.x*v.x + v.y*v.y + v.z*v.z;
        }

    fire_sdata[threadIdx.x] = vsq;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        d_partial_sum_vsq[blockIdx.x] = fire_sdata[0];
    }

//! Kernel function to compute the partial sum over the asq term in the FIRE algorithm
/*! \param d_accel device array of particle accelerations
    \param d_group_members Array listing members of the group
    \param group_size Number of members in the group
    \param d_partial_sum_asq Array to hold the partial sum
*/
extern "C" __global__
    void gpu_fire_reduce_asq_partial_kernel(const Scalar3 *d_accel,
                                            unsigned int *d_group_members,
                                            unsigned int group_size,
                                            Scalar* d_partial_sum_asq)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar asq = 0;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        Scalar3 a = d_accel[idx];
        asq = a.x*a.x + a.y*a.y + a.z*a.z;
        }

    fire_sdata[threadIdx.x] = asq;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        d_partial_sum_asq[blockIdx.x] = fire_sdata[0];
    }

//! Kernel function to simultaneously compute the partial sum over P, vsq and asq for the FIRE algorithm
/*! \param d_vel device array of particle velocities
    \param d_accel device array of particle accelerations
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param d_partial_sum_P Array to hold the partial sum over P (v*a)
    \param d_partial_sum_vsq Array to hold the partial sum over vsq (v*v)
    \param d_partial_sum_asq Array to hold the partial sum over asq (a*a)
    \note this function is never used, but could be implemented to improve performance
*/
extern "C" __global__
    void gpu_fire_reduce_all_partial_kernel(const Scalar4 *d_vel,
                                            const Scalar3 *d_accel,
                                            unsigned int *d_group_members,
                                            unsigned int group_size,
                                            Scalar* d_partial_sum_P,
                                            Scalar* d_partial_sum_vsq,
                                            Scalar* d_partial_sum_asq)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar P, vsq, asq;
    P = 0;
    vsq = 0;
    asq = 0;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        Scalar3 a = d_accel[idx];
        Scalar4 v = d_vel[idx];
        P = a.x*v.x + a.y*v.y + a.z*v.z;
        vsq = v.x*v.x + v.y*v.y + v.z*v.z;
        asq = a.x*a.x + a.y*a.y + a.z*a.z;
        }

    fire_sdata1[threadIdx.x] = P;
    fire_sdata2[threadIdx.x] = vsq;
    fire_sdata3[threadIdx.x] = asq;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            {
            fire_sdata1[threadIdx.x] += fire_sdata1[threadIdx.x + offs];
            fire_sdata2[threadIdx.x] += fire_sdata2[threadIdx.x + offs];
            fire_sdata3[threadIdx.x] += fire_sdata3[threadIdx.x + offs];
            }
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        {
        d_partial_sum_P[blockIdx.x] = fire_sdata1[0];
        d_partial_sum_vsq[blockIdx.x] = fire_sdata2[0];
        d_partial_sum_asq[blockIdx.x] = fire_sdata3[0];
        }
    }

//! Kernel function to simultaneously reduce three partial sums at the same time
/*! \param d_sum Array to hold the sums
    \param d_partial_sum1 Array containing a precomputed partial sum
    \param d_partial_sum2 Array containing a precomputed partial sum
    \param d_partial_sum3 Array containing a precomputed partial sum
    \param num_blocks The number of blocks to execute
    \note this function is never used, but could be implemented to improve performance
*/
extern "C" __global__
    void gpu_fire_reduce_partial_sum_3_kernel(Scalar *d_sum,
                                              Scalar* d_partial_sum1,
                                              Scalar* d_partial_sum2,
                                              Scalar* d_partial_sum3,
                                              unsigned int num_blocks)
    {
    Scalar sum1 = Scalar(0.0);
    Scalar sum2 = Scalar(0.0);
    Scalar sum3 = Scalar(0.0);

    // sum up the values in the partial sum via a sliding window
    for (int start = 0; start < num_blocks; start += blockDim.x)
        {
        __syncthreads();
        if (start + threadIdx.x < num_blocks)
            {
            fire_sdata1[threadIdx.x] = d_partial_sum1[start + threadIdx.x];
            fire_sdata2[threadIdx.x] = d_partial_sum2[start + threadIdx.x];
            fire_sdata3[threadIdx.x] = d_partial_sum3[start + threadIdx.x];
            }
        else
            {
            fire_sdata1[threadIdx.x] = Scalar(0.0);
            fire_sdata2[threadIdx.x] = Scalar(0.0);
            fire_sdata3[threadIdx.x] = Scalar(0.0);
            }
        __syncthreads();

        // reduce the sum in parallel
        int offs = blockDim.x >> 1;
        while (offs > 0)
            {
            if (threadIdx.x < offs)
                {
                fire_sdata1[threadIdx.x] += fire_sdata1[threadIdx.x + offs];
                fire_sdata2[threadIdx.x] += fire_sdata2[threadIdx.x + offs];
                fire_sdata3[threadIdx.x] += fire_sdata3[threadIdx.x + offs];
                }
            offs >>= 1;
            __syncthreads();
            }

        sum1 += fire_sdata1[0];
        sum2 += fire_sdata2[0];
        sum3 += fire_sdata3[0];
        }

    if (threadIdx.x == 0)
        {
        d_sum[0] = sum1;
        d_sum[1] = sum2;
        d_sum[2] = sum3;
        }
    }
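
// NOTE (annotation): the two kernels above are currently dead code kept as a
// starting point for fusing the three reductions into a single pass. Besides
// wiring them into gpu_fire_compute_sum_all() below, enabling them would
// require resolving the shared memory aliasing of fire_sdata1/2/3 noted near
// the top of this file and reserving 3*block_size*sizeof(Scalar) bytes of
// dynamic shared memory per block.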

/*! \param N number of particles in system
    \param d_vel array of particle velocities
    \param d_accel array of particle accelerations
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param d_sum_all Array to hold the sum over P, vsq, and asq
    \param d_partial_sum_P Array to hold the partial sum over P (a*v)
    \param d_partial_sum_vsq Array to hold the partial sum over vsq (v*v)
    \param d_partial_sum_asq Array to hold the partial sum over asq (a*a)
    \param block_size is the size of one block
    \param num_blocks is the number of blocks to execute
    \note Currently the sums are performed consecutively. The efficiency of this
        function could be improved by computing all three sums simultaneously.

    This is a driver for gpu_fire_reduce_{X}_partial_kernel() (where X = P, vsq, asq)
    and gpu_fire_reduce_partial_sum_kernel(), see them for details.
*/
cudaError_t gpu_fire_compute_sum_all(
                                    const unsigned int N,
                                    const Scalar4 *d_vel,
                                    const Scalar3 *d_accel,
                                    unsigned int *d_group_members,
                                    unsigned int group_size,
                                    Scalar* d_sum_all,
                                    Scalar* d_partial_sum_P,
                                    Scalar* d_partial_sum_vsq,
                                    Scalar* d_partial_sum_asq,
                                    unsigned int block_size,
                                    unsigned int num_blocks)
    {
    // setup the grid to run the kernel
    dim3 grid(num_blocks, 1, 1);
    dim3 grid1(1, 1, 1);
    dim3 threads(block_size, 1, 1);
    dim3 threads1(256, 1, 1);

    // run the kernels
    gpu_fire_reduce_P_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_vel,
                                                                                     d_accel,
                                                                                     d_group_members,
                                                                                     group_size,
                                                                                     d_partial_sum_P);

    gpu_fire_reduce_partial_sum_kernel<<< grid1, threads1, block_size*sizeof(Scalar) >>>(&d_sum_all[0],
                                                                                         d_partial_sum_P,
                                                                                         num_blocks);

    gpu_fire_reduce_vsq_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_vel,
                                                                                       d_group_members,
                                                                                       group_size,
                                                                                       d_partial_sum_vsq);

    gpu_fire_reduce_partial_sum_kernel<<< grid1, threads1, block_size*sizeof(Scalar) >>>(&d_sum_all[1],
                                                                                         d_partial_sum_vsq,
                                                                                         num_blocks);

    gpu_fire_reduce_asq_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_accel,
                                                                                       d_group_members,
                                                                                       group_size,
                                                                                       d_partial_sum_asq);

    gpu_fire_reduce_partial_sum_kernel<<< grid1, threads1, block_size*sizeof(Scalar) >>>(&d_sum_all[2],
                                                                                         d_partial_sum_asq,
                                                                                         num_blocks);

    /*
    // do all three sums at once:
    gpu_fire_reduce_all_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_vel, d_accel, d_partial_sum_P, d_partial_sum_vsq, d_partial_sum_asq);
    gpu_fire_reduce_partial_sum_3_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_sum_all, d_partial_sum_P, d_partial_sum_vsq, d_partial_sum_asq, num_blocks);
    */

    return cudaSuccess;
    }
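
// Usage sketch (annotation; names are illustrative, not part of the original
// source). d_sum_all receives three Scalars: [0] = sum a.v, [1] = sum v.v and
// [2] = sum a.a over the group. A host caller would typically copy them back
// and derive the inputs that gpu_fire_update_v() expects, roughly:
//
//     Scalar h_sum[3];
//     cudaMemcpy(h_sum, d_sum_all, 3*sizeof(Scalar), cudaMemcpyDeviceToHost);
//     Scalar P        = h_sum[0];
//     Scalar vnorm    = sqrt(h_sum[1]);                  // |v| over the group
//     Scalar invfnorm = Scalar(1.0)/sqrt(h_sum[2]);      // 1/|a|, used in place of 1/|F|
//     if (P > Scalar(0.0))
//         gpu_fire_update_v(d_vel, d_accel, d_group_members, group_size,
//                           alpha, vnorm, invfnorm);     // alpha maintained by the host minimizer
//     else
//         gpu_fire_zero_v(d_vel, d_group_members, group_size);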

//! Kernel function to update the velocities used by the FIRE algorithm
/*! \param d_vel Array of velocities to update
    \param d_accel Array of accelerations
    \param d_group_members Device array listing the indices of the members of the group to update
    \param group_size Number of members in the group
    \param alpha Alpha coupling parameter used by the FIRE algorithm
    \param vnorm Magnitude of the (3*N) dimensional velocity vector
    \param invfnorm 1 over the magnitude of the (3*N) dimensional force vector
*/
extern "C" __global__
    void gpu_fire_update_v_kernel(Scalar4 *d_vel,
                                  const Scalar3 *d_accel,
                                  unsigned int *d_group_members,
                                  unsigned int group_size,
                                  Scalar alpha,
                                  Scalar vnorm,
                                  Scalar invfnorm)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        // read the particle's velocity and acceleration (MEM TRANSFER: 32 bytes)
        Scalar4 v = d_vel[idx];
        Scalar3 a = d_accel[idx];

        v.x = v.x*(Scalar(1.0)-alpha) + alpha*a.x*invfnorm*vnorm;
        v.y = v.y*(Scalar(1.0)-alpha) + alpha*a.y*invfnorm*vnorm;
        v.z = v.z*(Scalar(1.0)-alpha) + alpha*a.z*invfnorm*vnorm;

        // write out the results (MEM_TRANSFER: 32 bytes)
        d_vel[idx] = v;
        }
    }
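
// NOTE (annotation): per component the update above implements the FIRE
// velocity mixing v <- (1-alpha)*v + alpha*(|v|/|F|)*F, with the acceleration
// standing in for the force direction; vnorm = |v| and invfnorm = 1/|F| are
// norms over the whole (3*N)-dimensional group vectors, not per-particle
// quantities.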

/*! \param d_vel array of particle velocities to update
    \param d_accel array of particle accelerations
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param alpha Alpha coupling parameter used by the FIRE algorithm
    \param vnorm Magnitude of the (3*N) dimensional velocity vector
    \param invfnorm 1 over the magnitude of the (3*N) dimensional force vector

    This function is a driver for gpu_fire_update_v_kernel(), see it for details.
*/
cudaError_t gpu_fire_update_v(Scalar4 *d_vel,
                              const Scalar3 *d_accel,
                              unsigned int *d_group_members,
                              unsigned int group_size,
                              Scalar alpha,
                              Scalar vnorm,
                              Scalar invfnorm)
    {
    // setup the grid to run the kernel
    int block_size = 256;
    dim3 grid( (group_size/block_size) + 1, 1, 1);
    dim3 threads(block_size, 1, 1);

    // run the kernel
    gpu_fire_update_v_kernel<<< grid, threads >>>(d_vel,
                                                  d_accel,
                                                  d_group_members,
                                                  group_size,
                                                  alpha,
                                                  vnorm,
                                                  invfnorm);

    return cudaSuccess;
    }