/*
Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
(HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
the University of Michigan All rights reserved.

HOOMD-blue may contain modifications ("Contributions") provided, and to which
copyright is held, by various Contributors who have granted The Regents of the
University of Michigan the right to modify and/or distribute such Contributions.

You may redistribute, use, and create derivative works of HOOMD-blue, in source
and binary forms, provided you abide by the following conditions:
* Redistributions of source code must retain the above copyright notice, this
list of conditions, and the following disclaimer both in the code and
prominently in any materials provided with the distribution.

* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions, and the following disclaimer in the documentation and/or
other materials provided with the distribution.

* All publications and presentations based on HOOMD-blue, including any reports
or published results obtained, in whole or in part, with HOOMD-blue, will
acknowledge its use according to the terms posted at the time of submission on:
http://codeblue.umich.edu/hoomd-blue/citations.html

* Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
http://codeblue.umich.edu/hoomd-blue/
* Apart from the above required attributions, neither the name of the copyright
holder nor the names of HOOMD-blue's contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.

Disclaimer
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.

IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "FIREEnergyMinimizerGPU.cuh"
#include "TextureTools.h"
/*! \file FIREEnergyMinimizerGPU.cu
    \brief Defines GPU kernel code for performing one FIRE energy
    minimization iteration on the GPU. Used by FIREEnergyMinimizerGPU.
*/
//! Shared memory used in reducing sums
extern __shared__ Scalar fire_sdata[];
//! Shared memory used in simultaneously reducing three sums (currently unused)
extern __shared__ Scalar fire_sdata1[];
//! Shared memory used in simultaneously reducing three sums (currently unused)
extern __shared__ Scalar fire_sdata2[];
//! Shared memory used in simultaneously reducing three sums (currently unused)
extern __shared__ Scalar fire_sdata3[];
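
// NOTE: in CUDA, every `extern __shared__` array in a translation unit aliases the
// same dynamically allocated shared-memory buffer, so fire_sdata1, fire_sdata2, and
// fire_sdata3 all share one base address. If the currently unused three-sum kernels
// below were ever enabled, the three arrays would need explicit offsets into a single
// buffer (and 3*block_size*sizeof(Scalar) of dynamic shared memory per block).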
//! Kernel function that zeros velocities, called by gpu_fire_zero_v()
/*! \param d_vel device array of particle velocities
    \param d_group_members Device array listing the indices of the members of the group to zero
    \param group_size Number of members in the group
*/
extern "C" __global__
void gpu_fire_zero_v_kernel(Scalar4 *d_vel,
                            unsigned int *d_group_members,
                            unsigned int group_size)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        // read the particle's velocity (MEM TRANSFER: 32 bytes)
        Scalar4 vel = d_vel[idx];

        // zero the velocity, preserving the mass stored in vel.w (FLOPS: ?)
        vel.x = Scalar(0.0);
        vel.y = Scalar(0.0);
        vel.z = Scalar(0.0);

        // write out the results (MEM_TRANSFER: 32 bytes)
        d_vel[idx] = vel;
        }
    }
/*! \param d_vel device array of particle velocities
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group

    This function is just the driver for gpu_fire_zero_v_kernel(); see that function
    for details.
*/
cudaError_t gpu_fire_zero_v(Scalar4 *d_vel,
                            unsigned int *d_group_members,
                            unsigned int group_size)
    {
    // setup the grid to run the kernel
    int block_size = 256;
    dim3 grid( (group_size/block_size) + 1, 1, 1);
    dim3 threads(block_size, 1, 1);

    // run the kernel
    gpu_fire_zero_v_kernel<<< grid, threads >>>(d_vel, d_group_members, group_size);

    return cudaSuccess;
    }
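
// NOTE: the (group_size/block_size) + 1 grid sizing rounds up, at the cost of one
// extra empty block whenever group_size is an exact multiple of block_size; the
// group_idx < group_size guard inside the kernel makes that harmless.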
//! Kernel function for reducing the potential energy to a partial sum
/*! \param d_group_members Device array listing the indices of the members of the group to sum
    \param group_size Number of members in the group
    \param d_net_force Pointer to the force array for all particles
    \param d_partial_sum_pe Placeholder for the partial sum
*/
extern "C" __global__
void gpu_fire_reduce_pe_partial_kernel(unsigned int *d_group_members,
                                       unsigned int group_size,
                                       Scalar4* d_net_force,
                                       Scalar* d_partial_sum_pe)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar pe = Scalar(0.0);

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        // read the particle's force and extract the pe from the w component (MEM TRANSFER: 32 bytes)
        Scalar4 force = d_net_force[idx];
        pe = force.w;

        // An uncoalesced memory read was at one point replaced by a texture read;
        // Scalar4* d_net_force is still being passed to support that defunct structure.
        //pe = d_net_force[idx].w;
        }

    fire_sdata[threadIdx.x] = pe;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        d_partial_sum_pe[blockIdx.x] = fire_sdata[0];
    }
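
// NOTE: the tree reduction above halves offs from blockDim.x/2 down to zero and so
// assumes the block size is a power of two; the same pattern (and assumption) is
// shared by every reduction kernel in this file.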
//! Kernel function for reducing a partial sum to a full sum (one value)
/*! \param d_sum Placeholder for the sum
    \param d_partial_sum Array containing the partial sum
    \param num_blocks Number of blocks to execute
*/
extern "C" __global__
void gpu_fire_reduce_partial_sum_kernel(Scalar *d_sum,
                                        Scalar* d_partial_sum,
                                        unsigned int num_blocks)
    {
    Scalar sum = Scalar(0.0);

    // sum up the values in the partial sum via a sliding window
    for (int start = 0; start < num_blocks; start += blockDim.x)
        {
        __syncthreads();
        if (start + threadIdx.x < num_blocks)
            fire_sdata[threadIdx.x] = d_partial_sum[start + threadIdx.x];
        else
            fire_sdata[threadIdx.x] = Scalar(0.0);
        __syncthreads();

        // reduce the sum in parallel
        int offs = blockDim.x >> 1;
        while (offs > 0)
            {
            if (threadIdx.x < offs)
                fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
            offs >>= 1;
            __syncthreads();
            }

        // every thread accumulates the reduced total of this window
        sum += fire_sdata[0];
        }

    if (threadIdx.x == 0)
        *d_sum = sum;
    }
/*! \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param d_net_force Array containing the net forces
    \param d_sum_pe Placeholder for the sum of the PE
    \param d_partial_sum_pe Array containing the partial sum of the PE
    \param block_size The size of one block
    \param num_blocks Number of blocks to execute

    This is a driver for gpu_fire_reduce_pe_partial_kernel() and
    gpu_fire_reduce_partial_sum_kernel(); see them for details.
*/
cudaError_t gpu_fire_compute_sum_pe(unsigned int *d_group_members,
                                    unsigned int group_size,
                                    Scalar4* d_net_force,
                                    Scalar* d_sum_pe,
                                    Scalar* d_partial_sum_pe,
                                    unsigned int block_size,
                                    unsigned int num_blocks)
    {
    // setup the grid to run the kernel
    dim3 grid(num_blocks, 1, 1);
    dim3 threads(block_size, 1, 1);

    // run the kernels: one partial sum per block, then a final reduction
    gpu_fire_reduce_pe_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_group_members,
        group_size, d_net_force, d_partial_sum_pe);

    gpu_fire_reduce_partial_sum_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_sum_pe,
        d_partial_sum_pe, num_blocks);

    return cudaSuccess;
    }
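
/* Minimal host-side usage sketch for the two-pass reduction (illustrative only; the
   buffer names and the rounded-up num_blocks are assumptions, not part of this file):

       unsigned int block_size = 256;
       unsigned int num_blocks = group_size / block_size + 1;      // one partial sum per block
       Scalar *d_partial_sum_pe, *d_sum_pe;
       cudaMalloc(&d_partial_sum_pe, num_blocks * sizeof(Scalar)); // one slot per block
       cudaMalloc(&d_sum_pe, sizeof(Scalar));                      // final scalar result
       gpu_fire_compute_sum_pe(d_group_members, group_size, d_net_force,
                               d_sum_pe, d_partial_sum_pe, block_size, num_blocks);
*/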
//! Kernel function to compute the partial sum over the P term in the FIRE algorithm
/*! \param d_vel particle velocities and masses on the device
    \param d_accel particle accelerations on the device
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param d_partial_sum_P Array to hold the partial sum
*/
extern "C" __global__
void gpu_fire_reduce_P_partial_kernel(const Scalar4 *d_vel,
                                      const Scalar3 *d_accel,
                                      unsigned int *d_group_members,
                                      unsigned int group_size,
                                      Scalar* d_partial_sum_P)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar P = Scalar(0.0);

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        Scalar3 a = d_accel[idx];
        Scalar4 v = d_vel[idx];
        P = a.x*v.x + a.y*v.y + a.z*v.z;
        }

    fire_sdata[threadIdx.x] = P;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        d_partial_sum_P[blockIdx.x] = fire_sdata[0];
    }
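
// NOTE: FIRE's steering criterion is the power P = F . v summed over the group; this
// kernel accumulates a . v = (F/m) . v, a mass-weighted variant of that dot product,
// matching the a.a sum used in place of F.F elsewhere in this file.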
//! Kernel function to compute the partial sum over the vsq term in the FIRE algorithm
/*! \param d_vel device array of particle velocities
    \param d_group_members Array listing members of the group
    \param group_size Number of members in the group
    \param d_partial_sum_vsq Array to hold the partial sum
*/
extern "C" __global__
void gpu_fire_reduce_vsq_partial_kernel(const Scalar4 *d_vel,
                                        unsigned int *d_group_members,
                                        unsigned int group_size,
                                        Scalar* d_partial_sum_vsq)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar vsq = Scalar(0.0);

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        Scalar4 v = d_vel[idx];
        vsq = v.x*v.x + v.y*v.y + v.z*v.z;
        }

    fire_sdata[threadIdx.x] = vsq;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        d_partial_sum_vsq[blockIdx.x] = fire_sdata[0];
    }
//! Kernel function to compute the partial sum over the asq term in the FIRE algorithm
/*! \param d_accel device array of particle accelerations
    \param d_group_members Array listing members of the group
    \param group_size Number of members in the group
    \param d_partial_sum_asq Array to hold the partial sum
*/
extern "C" __global__
void gpu_fire_reduce_asq_partial_kernel(const Scalar3 *d_accel,
                                        unsigned int *d_group_members,
                                        unsigned int group_size,
                                        Scalar* d_partial_sum_asq)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar asq = Scalar(0.0);

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        Scalar3 a = d_accel[idx];
        asq = a.x*a.x + a.y*a.y + a.z*a.z;
        }

    fire_sdata[threadIdx.x] = asq;
    __syncthreads();

    // reduce the sum in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            fire_sdata[threadIdx.x] += fire_sdata[threadIdx.x + offs];
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sum
    if (threadIdx.x == 0)
        d_partial_sum_asq[blockIdx.x] = fire_sdata[0];
    }
//! Kernel function to simultaneously compute the partial sum over P, vsq and asq for the FIRE algorithm
/*! \param d_vel device array of particle velocities
    \param d_accel device array of particle accelerations
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param d_partial_sum_P Array to hold the partial sum over P (v*a)
    \param d_partial_sum_vsq Array to hold the partial sum over vsq (v*v)
    \param d_partial_sum_asq Array to hold the partial sum over asq (a*a)
    \note this function is never used, but could be implemented to improve performance
*/
extern "C" __global__
void gpu_fire_reduce_all_partial_kernel(const Scalar4 *d_vel,
                                        const Scalar3 *d_accel,
                                        unsigned int *d_group_members,
                                        unsigned int group_size,
                                        Scalar* d_partial_sum_P,
                                        Scalar* d_partial_sum_vsq,
                                        Scalar* d_partial_sum_asq)
    {
    // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar P = Scalar(0.0);
    Scalar vsq = Scalar(0.0);
    Scalar asq = Scalar(0.0);

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        Scalar3 a = d_accel[idx];
        Scalar4 v = d_vel[idx];
        P = a.x*v.x + a.y*v.y + a.z*v.z;
        vsq = v.x*v.x + v.y*v.y + v.z*v.z;
        asq = a.x*a.x + a.y*a.y + a.z*a.z;
        }

    fire_sdata1[threadIdx.x] = P;
    fire_sdata2[threadIdx.x] = vsq;
    fire_sdata3[threadIdx.x] = asq;
    __syncthreads();

    // reduce the sums in parallel
    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            {
            fire_sdata1[threadIdx.x] += fire_sdata1[threadIdx.x + offs];
            fire_sdata2[threadIdx.x] += fire_sdata2[threadIdx.x + offs];
            fire_sdata3[threadIdx.x] += fire_sdata3[threadIdx.x + offs];
            }
        offs >>= 1;
        __syncthreads();
        }

    // write out our partial sums
    if (threadIdx.x == 0)
        {
        d_partial_sum_P[blockIdx.x] = fire_sdata1[0];
        d_partial_sum_vsq[blockIdx.x] = fire_sdata2[0];
        d_partial_sum_asq[blockIdx.x] = fire_sdata3[0];
        }
    }
//! Kernel function to simultaneously reduce three partial sums at the same time
/*! \param d_sum Array to hold the sums
    \param d_partial_sum1 Array containing a precomputed partial sum
    \param d_partial_sum2 Array containing a precomputed partial sum
    \param d_partial_sum3 Array containing a precomputed partial sum
    \param num_blocks The number of blocks to execute
    \note this function is never used, but could be implemented to improve performance
*/
extern "C" __global__
void gpu_fire_reduce_partial_sum_3_kernel(Scalar *d_sum,
                                          Scalar* d_partial_sum1,
                                          Scalar* d_partial_sum2,
                                          Scalar* d_partial_sum3,
                                          unsigned int num_blocks)
    {
    Scalar sum1 = Scalar(0.0);
    Scalar sum2 = Scalar(0.0);
    Scalar sum3 = Scalar(0.0);

    // sum up the values in the partial sums via a sliding window
    for (int start = 0; start < num_blocks; start += blockDim.x)
        {
        __syncthreads();
        if (start + threadIdx.x < num_blocks)
            {
            fire_sdata1[threadIdx.x] = d_partial_sum1[start + threadIdx.x];
            fire_sdata2[threadIdx.x] = d_partial_sum2[start + threadIdx.x];
            fire_sdata3[threadIdx.x] = d_partial_sum3[start + threadIdx.x];
            }
        else
            {
            fire_sdata1[threadIdx.x] = Scalar(0.0);
            fire_sdata2[threadIdx.x] = Scalar(0.0);
            fire_sdata3[threadIdx.x] = Scalar(0.0);
            }
        __syncthreads();

        // reduce the sums in parallel
        int offs = blockDim.x >> 1;
        while (offs > 0)
            {
            if (threadIdx.x < offs)
                {
                fire_sdata1[threadIdx.x] += fire_sdata1[threadIdx.x + offs];
                fire_sdata2[threadIdx.x] += fire_sdata2[threadIdx.x + offs];
                fire_sdata3[threadIdx.x] += fire_sdata3[threadIdx.x + offs];
                }
            offs >>= 1;
            __syncthreads();
            }

        sum1 += fire_sdata1[0];
        sum2 += fire_sdata2[0];
        sum3 += fire_sdata3[0];
        }

    if (threadIdx.x == 0)
        {
        d_sum[0] = sum1;
        d_sum[1] = sum2;
        d_sum[2] = sum3;
        }
    }
/*! \param N number of particles in system
    \param d_vel array of particle velocities
    \param d_accel array of particle accelerations
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param d_sum_all Array to hold the sums over P, vsq, and asq
    \param d_partial_sum_P Array to hold the partial sum over P (a*v)
    \param d_partial_sum_vsq Array to hold the partial sum over vsq (v*v)
    \param d_partial_sum_asq Array to hold the partial sum over asq (a*a)
    \param block_size is the size of one block
    \param num_blocks is the number of blocks to execute
    \note Currently the sums are performed consecutively. The efficiency of this
    function could be improved by computing all three sums simultaneously.

    This is a driver for gpu_fire_reduce_{X}_partial_kernel() (where X = P, vsq, asq)
    and gpu_fire_reduce_partial_sum_kernel(); see them for details.
*/
cudaError_t gpu_fire_compute_sum_all(const unsigned int N,
                                     const Scalar4 *d_vel,
                                     const Scalar3 *d_accel,
                                     unsigned int *d_group_members,
                                     unsigned int group_size,
                                     Scalar* d_sum_all,
                                     Scalar* d_partial_sum_P,
                                     Scalar* d_partial_sum_vsq,
                                     Scalar* d_partial_sum_asq,
                                     unsigned int block_size,
                                     unsigned int num_blocks)
    {
    // setup the grid to run the kernels
    dim3 grid(num_blocks, 1, 1);
    dim3 grid1(1, 1, 1);
    dim3 threads(block_size, 1, 1);
    dim3 threads1(256, 1, 1);

    // run the kernels: one partial-sum pass and one final reduction per quantity
    gpu_fire_reduce_P_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_vel,
        d_accel, d_group_members, group_size, d_partial_sum_P);

    gpu_fire_reduce_partial_sum_kernel<<< grid1, threads1, block_size*sizeof(Scalar) >>>(&d_sum_all[0],
        d_partial_sum_P, num_blocks);

    gpu_fire_reduce_vsq_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_vel,
        d_group_members, group_size, d_partial_sum_vsq);

    gpu_fire_reduce_partial_sum_kernel<<< grid1, threads1, block_size*sizeof(Scalar) >>>(&d_sum_all[1],
        d_partial_sum_vsq, num_blocks);

    gpu_fire_reduce_asq_partial_kernel<<< grid, threads, block_size*sizeof(Scalar) >>>(d_accel,
        d_group_members, group_size, d_partial_sum_asq);

    gpu_fire_reduce_partial_sum_kernel<<< grid1, threads1, block_size*sizeof(Scalar) >>>(&d_sum_all[2],
        d_partial_sum_asq, num_blocks);

    /* do all three sums at once with the currently unused combined kernels (see the
       shared-memory aliasing note at the top of this file):
    gpu_fire_reduce_all_partial_kernel<<< grid, threads, 3*block_size*sizeof(Scalar) >>>(d_vel, d_accel, d_group_members, group_size, d_partial_sum_P, d_partial_sum_vsq, d_partial_sum_asq);
    gpu_fire_reduce_partial_sum_3_kernel<<< grid1, threads1, 3*block_size*sizeof(Scalar) >>>(d_sum_all, d_partial_sum_P, d_partial_sum_vsq, d_partial_sum_asq, num_blocks);
    */

    return cudaSuccess;
    }
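
/* Sketch of how the three sums are consumed (illustrative; the variable names are
   assumptions and the authoritative logic lives in the host-side FIREEnergyMinimizer
   classes):

       Scalar P   = h_sum_all[0];               // sum of a.v over the group
       Scalar vsq = h_sum_all[1];               // sum of v.v over the group
       Scalar asq = h_sum_all[2];               // sum of a.a over the group
       Scalar vnorm    = sqrt(vsq);             // |v| of the 3N-dimensional velocity vector
       Scalar invfnorm = Scalar(1.0)/sqrt(asq); // 1/|F| proxy passed to gpu_fire_update_v()
       // FIRE mixes v toward the force direction while P > 0, and zeroes the
       // velocities (gpu_fire_zero_v) when P <= 0.
*/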
//! Kernel function to update the velocities used by the FIRE algorithm
/*! \param d_vel Array of velocities to update
    \param d_accel Array of accelerations
    \param d_group_members Device array listing the indices of the members of the group to update
    \param group_size Number of members in the group
    \param alpha Alpha coupling parameter used by the FIRE algorithm
    \param vnorm Magnitude of the (3*N) dimensional velocity vector
    \param invfnorm 1 over the magnitude of the (3*N) dimensional force vector
*/
611 extern "C" __global__
612 void gpu_fire_update_v_kernel(Scalar4 *d_vel,
613 const Scalar3 *d_accel,
614 unsigned int *d_group_members,
615 unsigned int group_size,
620 // determine which particle this thread works on (MEM TRANSFER: 4 bytes)
621 int group_idx = blockIdx.x * blockDim.x + threadIdx.x;
623 if (group_idx < group_size)
625 unsigned int idx = d_group_members[group_idx];
626 // read the particle's velocity and acceleration (MEM TRANSFER: 32 bytes)
627 Scalar4 v = d_vel[idx];
628 Scalar3 a = d_accel[idx];
630 v.x = v.x*(Scalar(1.0)-alpha) + alpha*a.x*invfnorm*vnorm;
631 v.y = v.y*(Scalar(1.0)-alpha) + alpha*a.y*invfnorm*vnorm;
632 v.z = v.z*(Scalar(1.0)-alpha) + alpha*a.z*invfnorm*vnorm;
634 // write out the results (MEM_TRANSFER: 32 bytes)
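
// NOTE: the update above is the FIRE velocity mixing step,
//     v <- (1 - alpha)*v + alpha*(|v|/|F|)*a,
// which steers v toward the (mass-scaled) force direction a = F/m, with |v| and |F|
// taken over the full 3N-dimensional vectors as documented above.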
/*! \param d_vel array of particle velocities to update
    \param d_accel array of particle accelerations
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param alpha Alpha coupling parameter used by the FIRE algorithm
    \param vnorm Magnitude of the (3*N) dimensional velocity vector
    \param invfnorm 1 over the magnitude of the (3*N) dimensional force vector

    This function is a driver for gpu_fire_update_v_kernel(); see it for details.
*/
cudaError_t gpu_fire_update_v(Scalar4 *d_vel,
                              const Scalar3 *d_accel,
                              unsigned int *d_group_members,
                              unsigned int group_size,
                              Scalar alpha, Scalar vnorm, Scalar invfnorm)
    {
    // setup the grid to run the kernel
    int block_size = 256;
    dim3 grid( (group_size/block_size) + 1, 1, 1);
    dim3 threads(block_size, 1, 1);

    // run the kernel
    gpu_fire_update_v_kernel<<< grid, threads >>>(d_vel, d_accel, d_group_members,
                                                  group_size, alpha, vnorm, invfnorm);

    return cudaSuccess;
    }