Fix potential problem in Messenger related to MPI window
[hoomd-blue.git] / libhoomd / cuda / TwoStepNPHRigidGPU.cu
blob8a3839d069a088e14b9745279e29e90fa029974e
1 /*
2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
4 the University of Michigan All rights reserved.
6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
7 copyright is held, by various Contributors who have granted The Regents of the
8 University of Michigan the right to modify and/or distribute such Contributions.
10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
11 and binary forms, provided you abide by the following conditions:
13 * Redistributions of source code must retain the above copyright notice, this
14 list of conditions, and the following disclaimer both in the code and
15 prominently in any materials provided with the distribution.
17 * Redistributions in binary form must reproduce the above copyright notice, this
18 list of conditions, and the following disclaimer in the documentation and/or
19 other materials provided with the distribution.
21 * All publications and presentations based on HOOMD-blue, including any reports
22 or published results obtained, in whole or in part, with HOOMD-blue, will
23 acknowledge its use according to the terms posted at the time of submission on:
24 http://codeblue.umich.edu/hoomd-blue/citations.html
26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
27 http://codeblue.umich.edu/hoomd-blue/
29 * Apart from the above required attributions, neither the name of the copyright
30 holder nor the names of HOOMD-blue's contributors may be used to endorse or
31 promote products derived from this software without specific prior written
32 permission.
34 Disclaimer
36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 // Maintainer: ndtrung
52 #include "QuaternionMath.h"
53 #include "TwoStepNPTRigidGPU.cuh"
55 #ifdef WIN32
56 #include <cassert>
57 #else
58 #include <assert.h>
59 #endif
61 /*! \file TwoStepNPTRigidGPU.cu
62     \brief Defines GPU kernel code for NPT integration on the GPU. Used by TwoStepNPTRigidGPU.
65 // Flag for invalid particle index, identical to the sentinel value NO_INDEX in RigidData.h
66 #define INVALID_INDEX 0xffffffff
68 /*! Maclaurine expansion
69     \param x Point to take the expansion
72 __device__ Scalar nph_maclaurin_series(Scalar x)
73     {
74     Scalar x2, x4;
75     x2 = x * x;
76     x4 = x2 * x2;
77     return (Scalar(1.0) + Scalar(1.0/6.0) * x2 + Scalar(1.0/120.0) * x4 + Scalar(1.0/5040.0) * x2 * x4 + Scalar(1.0/362880.0) * x4 * x4);
78     }
80 /*! Kernel to zero virial contribution from particles from rigid bodies
81     \param d_virial_rigid Virial contribution from particles in rigid bodies
82     \param local_num Number of particles in this card
84 extern "C" __global__ void gpu_nph_rigid_zero_virial_rigid_kernel(Scalar *d_virial_rigid,
85                                                                  unsigned int local_num)
86     {
87     int idx = blockIdx.x * blockDim.x + threadIdx.x; // particle's index
89     if (idx < local_num)
90         {
91         d_virial_rigid[idx] = 0;
92         }
94     }
96 /*! Takes the first half-step forward for rigid bodies in the velocity-verlet NVT integration
97     \param rdata_com Body center of mass
98     \param d_rigid_group Body group
99     \param n_group_bodies Number of rigid bodies in my group
100     \param n_bodies Total umber of rigid bodies
101     \param box Box dimensions for periodic boundary condition handling
102     \param npt_rdata_dilation Volume scaling factor
103     \param npt_rdata_dimension System dimensionality
104     \param npt_rdata_new box New box sizes
107 extern "C" __global__ void gpu_nph_rigid_remap_kernel(Scalar4 *rdata_com,
108                                                       unsigned int *d_rigid_group,
109                                                       unsigned int n_group_bodies,
110                                                       unsigned int n_bodies,
111                                                       BoxDim box,
112                                                       Scalar nph_rdata_dilation,
113                                                       unsigned int nph_rdata_dimension,
114                                                       Scalar4 *nph_rdata_new_box)
115     {
116     unsigned int group_idx = blockIdx.x * blockDim.x + threadIdx.x;
117     if (group_idx >= n_group_bodies)
118         return;
120     unsigned int idx_body = d_rigid_group[group_idx];
122     Scalar3 curL = box.getL();
123     Scalar3 L;
125     // reset box to new size/shape
126     L.x = curL.x * nph_rdata_dilation;
127     L.y = curL.y * nph_rdata_dilation;
128     if (nph_rdata_dimension == 3)
129         L.z = curL.z * nph_rdata_dilation;
131     // copy and setL
132     BoxDim newBox = box;
133     newBox.setL(L);
135     Scalar4 com = rdata_com[idx_body];
136     Scalar3 f = box.makeFraction(make_scalar3(com.x, com.y, com.z));
137     Scalar3 pos = newBox.makeCoordinates(f);
139     // write out results
140     rdata_com[idx_body] = make_scalar4(pos.x, pos.y, pos.z, 0);
142     if (idx_body == 0)
143         {
144         *(nph_rdata_new_box) = make_scalar4(L.x, L.y, L.z, 0.0f);
145         }
146     }
149 #pragma mark RIGID_STEP_ONE_KERNEL
150 /*! Takes the first half-step forward for rigid bodies in the velocity-verlet NVT integration
151     \param rdata_com Body center of mass
152     \param rdata_vel Body velocity
153     \param rdata_angmom Angular momentum
154     \param rdata_angvel Angular velocity
155     \param rdata_orientation Quaternion
156     \param rdata_body_image Body image
157     \param rdata_conjqm Conjugate quaternion momentum
158     \param d_rigid_mass Body mass
159     \param d_rigid_mi Body inertia moments
160     \param n_group_bodies Number of rigid bodies in my group
161     \param d_rigid_force Body forces
162     \param d_rigid_torque Body torques
163     \param d_rigid_group Body indices
164     \param n_bodies Total umber of rigid bodies
165     \param npt_rdata_epsilon_dot Barostat velocity
166     \param npt_rdata_partial_Ksum_t Body translational kinetic energy
167     \param npt_rdata_partial_Ksum_r Body rotation kinetic energy
168     \param npt_rdata_nf_t Translational degrees of freedom
169     \param npt_rdata_nf_r Translational degrees of freedom
170     \param npt_rdata_dimension System dimesion
171     \param box Box dimensions for periodic boundary condition handling
172     \param deltaT Timestep
176 extern "C" __global__ void gpu_nph_rigid_step_one_body_kernel(Scalar4* rdata_com,
177                                                             Scalar4* rdata_vel,
178                                                             Scalar4* rdata_angmom,
179                                                             Scalar4* rdata_angvel,
180                                                             Scalar4* rdata_orientation,
181                                                             int3* rdata_body_image,
182                                                             Scalar4* rdata_conjqm,
183                                                             Scalar *d_rigid_mass,
184                                                             Scalar4 *d_rigid_mi,
185                                                             Scalar4 *d_rigid_force,
186                                                             Scalar4 *d_rigid_torque,
187                                                             unsigned int *d_rigid_group,
188                                                             unsigned int n_group_bodies,
189                                                             unsigned int n_bodies,
190                                                             Scalar npt_rdata_epsilon_dot,
191                                                             Scalar* npt_rdata_partial_Ksum_t,
192                                                             Scalar* npt_rdata_partial_Ksum_r,
193                                                             unsigned int npt_rdata_nf_t,
194                                                             unsigned int npt_rdata_nf_r,
195                                                             unsigned int npt_rdata_dimension,
196                                                             BoxDim box,
197                                                             Scalar deltaT)
198     {
199     unsigned int group_idx = blockIdx.x * blockDim.x + threadIdx.x;
201     if (group_idx >= n_group_bodies)
202         return;
204     // do velocity verlet update
205     // v(t+deltaT/2) = v(t) + (1/2)a*deltaT
206     // r(t+deltaT) = r(t) + v(t+deltaT/2)*deltaT
207     Scalar body_mass;
208     Scalar4 moment_inertia, com, vel, orientation, ex_space, ey_space, ez_space, force, torque, conjqm;
209     int3 body_image;
210     Scalar4 mbody, tbody, fquat;
212     Scalar dt_half = Scalar(0.5) * deltaT;
213     Scalar onednft, onednfr, tmp, scale_t, scale_r, scale_v, akin_t, akin_r;
215     onednft = Scalar(1.0) + (Scalar) (npt_rdata_dimension) / (Scalar) (npt_rdata_nf_t+npt_rdata_nf_r);
216     onednfr = (Scalar) (npt_rdata_dimension) / (Scalar) (npt_rdata_nf_t+npt_rdata_nf_r);
218     tmp = Scalar(-1.0) * dt_half * onednft * npt_rdata_epsilon_dot;
219     scale_t = fast::exp(tmp);
220     tmp = Scalar(-1.0) * dt_half * onednfr * npt_rdata_epsilon_dot;
221     scale_r = fast::exp(tmp);
222     tmp = dt_half * npt_rdata_epsilon_dot;
223     scale_v = deltaT * fast::exp(tmp) * nph_maclaurin_series(tmp);
225     unsigned int idx_body = d_rigid_group[group_idx];
226     body_mass = d_rigid_mass[idx_body];
227     moment_inertia = d_rigid_mi[idx_body];
228     com = rdata_com[idx_body];
229     vel = rdata_vel[idx_body];
230     orientation = rdata_orientation[idx_body];
231     body_image = rdata_body_image[idx_body];
232     force = d_rigid_force[idx_body];
233     torque = d_rigid_torque[idx_body];
234     conjqm = rdata_conjqm[idx_body];
236     exyzFromQuaternion(orientation, ex_space, ey_space, ez_space);
238     // update velocity
239     Scalar dtfm = dt_half / body_mass;
241     Scalar4 vel2;
242     vel2.x = vel.x + dtfm * force.x;
243     vel2.y = vel.y + dtfm * force.y;
244     vel2.z = vel.z + dtfm * force.z;
245     vel2.x *= scale_t;
246     vel2.y *= scale_t;
247     vel2.z *= scale_t;
248     vel2.w = vel.w;
250     tmp = vel2.x * vel2.x + vel2.y * vel2.y + vel2.z * vel2.z;
251     akin_t = body_mass * tmp;
253     // update position
254     Scalar3 pos2;
255     pos2.x = com.x + vel2.x * scale_v;
256     pos2.y = com.y + vel2.y * scale_v;
257     pos2.z = com.z + vel2.z * scale_v;
259     // time to fix the periodic boundary conditions
260     box.wrap(pos2, body_image);
262     matrix_dot(ex_space, ey_space, ez_space, torque, tbody);
263     quatvec(orientation, tbody, fquat);
265     Scalar4 conjqm2;
266     conjqm2.x = conjqm.x + deltaT * fquat.x;
267     conjqm2.y = conjqm.y + deltaT * fquat.y;
268     conjqm2.z = conjqm.z + deltaT * fquat.z;
269     conjqm2.w = conjqm.w + deltaT * fquat.w;
271     conjqm2.x *= scale_r;
272     conjqm2.y *= scale_r;
273     conjqm2.z *= scale_r;
274     conjqm2.w *= scale_r;
276     // use no_squish rotate to update p and q
277     no_squish_rotate(3, conjqm2, orientation, moment_inertia, dt_half);
278     no_squish_rotate(2, conjqm2, orientation, moment_inertia, dt_half);
279     no_squish_rotate(1, conjqm2, orientation, moment_inertia, deltaT);
280     no_squish_rotate(2, conjqm2, orientation, moment_inertia, dt_half);
281     no_squish_rotate(3, conjqm2, orientation, moment_inertia, dt_half);
283     // update the exyz_space
284     // transform p back to angmom
285     // update angular velocity
286     Scalar4 angmom2;
287     exyzFromQuaternion(orientation, ex_space, ey_space, ez_space);
288     invquatvec(orientation, conjqm2, mbody);
289     transpose_dot(ex_space, ey_space, ez_space, mbody, angmom2);
291     angmom2.x *= Scalar(0.5);
292     angmom2.y *= Scalar(0.5);
293     angmom2.z *= Scalar(0.5);
295     Scalar4 angvel2;
296     computeAngularVelocity(angmom2, moment_inertia, ex_space, ey_space, ez_space, angvel2);
298     akin_r = angmom2.x * angvel2.x + angmom2.y * angvel2.y + angmom2.z * angvel2.z;
300     // write out the results (MEM_TRANSFER: ? bytes)
301     rdata_com[idx_body] = make_scalar4(pos2.x, pos2.y, pos2.z, com.w);
302     rdata_vel[idx_body] = vel2;
303     rdata_angmom[idx_body] = angmom2;
304     rdata_angvel[idx_body] = angvel2;
305     rdata_orientation[idx_body] = orientation;
306     rdata_body_image[idx_body] = body_image;
307     rdata_conjqm[idx_body] = conjqm2;
309     npt_rdata_partial_Ksum_t[group_idx] = akin_t;
310     npt_rdata_partial_Ksum_r[group_idx] = akin_r;
311     }
313 /*! \param rigid_data Rigid body data to step forward 1/2 step
314     \param d_group_members Device array listing the indicies of the mebers of the group to integrate
315     \param group_size Number of members in the group
316     \param d_net_force Particle net forces
317     \param box Box dimensions for periodic boundary condition handling
318     \param npt_rdata Thermostat/barostat data
319     \param deltaT Amount of real time to step forward in one time step
322 cudaError_t gpu_nph_rigid_step_one(const gpu_rigid_data_arrays& rigid_data,
323                                    unsigned int *d_group_members,
324                                    unsigned int group_size,
325                                    Scalar4 *d_net_force,
326                                    const BoxDim& box,
327                                    const gpu_npt_rigid_data& npt_rdata,
328                                    Scalar deltaT)
329     {
330     unsigned int n_bodies = rigid_data.n_bodies;
331     unsigned int n_group_bodies = rigid_data.n_group_bodies;
333     // setup the grid to run the kernel for rigid bodies
334     int block_size = 64;
335     int n_blocks = n_group_bodies / block_size + 1;
336     dim3 body_grid(n_blocks, 1, 1);
337     dim3 body_threads(block_size, 1, 1);
338     gpu_nph_rigid_step_one_body_kernel<<< body_grid, body_threads  >>>(rigid_data.com,
339                                                                        rigid_data.vel,
340                                                                        rigid_data.angmom,
341                                                                        rigid_data.angvel,
342                                                                        rigid_data.orientation,
343                                                                        rigid_data.body_image,
344                                                                        rigid_data.conjqm,
345                                                                        rigid_data.body_mass,
346                                                                        rigid_data.moment_inertia,
347                                                                        rigid_data.force,
348                                                                        rigid_data.torque,
349                                                                        rigid_data.body_indices,
350                                                                        n_group_bodies,
351                                                                        n_bodies,
352                                                                        npt_rdata.epsilon_dot,
353                                                                        npt_rdata.partial_Ksum_t,
354                                                                        npt_rdata.partial_Ksum_r,
355                                                                        npt_rdata.nf_t,
356                                                                        npt_rdata.nf_r,
357                                                                        npt_rdata.dimension,
358                                                                        box,
359                                                                        deltaT);
361     gpu_nph_rigid_remap_kernel<<< body_grid, body_threads >>>(rigid_data.com,
362                                                               rigid_data.body_indices,
363                                                               n_group_bodies,
364                                                               n_bodies,
365                                                               box,
366                                                               npt_rdata.dilation,
367                                                               npt_rdata.dimension,
368                                                               npt_rdata.new_box);
371     return cudaSuccess;
372     }
374 #pragma mark RIGID_STEP_TWO_KERNEL
375 //! Takes the 2nd 1/2 step forward in the velocity-verlet NPH integration scheme
377     \param rdata_vel Body velocity
378     \param rdata_angmom Angular momentum
379     \param rdata_angvel Angular velocity
380     \param rdata_orientation Quaternion
381     \param rdata_conjqm Conjugate quaternion momentum
382     \param d_rigid_mass Body mass
383     \param d_rigid_mi Body inertia moments
384     \param d_rigid_force Body forces
385     \param d_rigid_torque Body torques
386     \param d_rigid_group Body indices
387     \param n_group_bodies Number of rigid bodies in my group
388     \param n_bodies Total number of rigid bodies
389     \param npt_rdata_eta_dot_t0 Thermostat translational part
390     \param npt_rdata_eta_dot_r0 Thermostat rotational part
391     \param npt_rdata_epsilon_dot Barostat velocity
392     \param npt_rdata_partial_Ksum_t Body translational kinetic energy
393     \param npt_rdata_partial_Ksum_r Body rotation kinetic energy
394     \param npt_rdata_nf_t Translational degrees of freedom
395     \param npt_rdata_nf_r Translational degrees of freedom
396     \param npt_rdata_dimension System dimesion
397     \param deltaT Timestep
398     \param box Box dimensions for periodic boundary condition handling
401 extern "C" __global__ void gpu_nph_rigid_step_two_body_kernel(Scalar4* rdata_vel,
402                                                               Scalar4* rdata_angmom,
403                                                               Scalar4* rdata_angvel,
404                                                               Scalar4* rdata_orientation,
405                                                               Scalar4* rdata_conjqm,
406                                                               Scalar *d_rigid_mass,
407                                                               Scalar4 *d_rigid_mi,
408                                                               Scalar4 *d_rigid_force,
409                                                               Scalar4 *d_rigid_torque,
410                                                               unsigned int *d_rigid_group,
411                                                               unsigned int n_group_bodies,
412                                                               unsigned int n_bodies,
413                                                               Scalar npt_rdata_epsilon_dot,
414                                                               Scalar* npt_rdata_partial_Ksum_t,
415                                                               Scalar* npt_rdata_partial_Ksum_r,
416                                                               unsigned int npt_rdata_nf_t,
417                                                               unsigned int npt_rdata_nf_r,
418                                                               unsigned int npt_rdata_dimension,
419                                                               BoxDim box,
420                                                               Scalar deltaT)
421     {
422     unsigned int group_idx = blockIdx.x * blockDim.x + threadIdx.x;
424     if (group_idx >= n_group_bodies)
425         return;
427     Scalar body_mass;
428     Scalar4 moment_inertia, vel, ex_space, ey_space, ez_space, orientation, conjqm;
429     Scalar4 force, torque;
430     Scalar4 mbody, tbody, fquat;
432     Scalar dt_half = Scalar(0.5) * deltaT;
433     Scalar onednft, onednfr, tmp, scale_t, scale_r, akin_t, akin_r;
435     onednft = Scalar(1.0) + (Scalar) (npt_rdata_dimension) / (Scalar) (npt_rdata_nf_t+npt_rdata_nf_r);
436     onednfr = (Scalar) (npt_rdata_dimension) / (Scalar) (npt_rdata_nf_t+npt_rdata_nf_r);
438     tmp = Scalar(-1.0) * dt_half * onednft * npt_rdata_epsilon_dot;
439     scale_t = exp(tmp);
440     tmp = Scalar(-1.0) * dt_half * onednfr * npt_rdata_epsilon_dot;
441     scale_r = exp(tmp);
443     unsigned int idx_body = d_rigid_group[group_idx];
445     // Update body velocity and angmom
446     body_mass = d_rigid_mass[idx_body];
447     moment_inertia = d_rigid_mi[idx_body];
448     vel = rdata_vel[idx_body];
449     force = d_rigid_force[idx_body];
450     torque = d_rigid_torque[idx_body];
451     orientation = rdata_orientation[idx_body];
452     conjqm = rdata_conjqm[idx_body];
454     exyzFromQuaternion(orientation, ex_space, ey_space, ez_space);
456     Scalar dtfm = dt_half / body_mass;
458     // update the velocity
459     Scalar4 vel2;
460     vel2.x = scale_t * vel.x + dtfm * force.x;
461     vel2.y = scale_t * vel.y + dtfm * force.y;
462     vel2.z = scale_t * vel.z + dtfm * force.z;
463     vel2.w = 0;
465     tmp = vel2.x * vel2.x + vel2.y * vel2.y + vel2.z * vel2.z;
466     akin_t = body_mass * tmp;
468     // update angular momentum
469     matrix_dot(ex_space, ey_space, ez_space, torque, tbody);
470     quatvec(orientation, tbody, fquat);
472     Scalar4  conjqm2, angmom2;
473     conjqm2.x = scale_r * conjqm.x + deltaT * fquat.x;
474     conjqm2.y = scale_r * conjqm.y + deltaT * fquat.y;
475     conjqm2.z = scale_r * conjqm.z + deltaT * fquat.z;
476     conjqm2.w = scale_r * conjqm.w + deltaT * fquat.w;
478     invquatvec(orientation, conjqm2, mbody);
479     transpose_dot(ex_space, ey_space, ez_space, mbody, angmom2);
481     angmom2.x *= Scalar(0.5);
482     angmom2.y *= Scalar(0.5);
483     angmom2.z *= Scalar(0.5);
484     angmom2.w = 0;
486     // update angular velocity
487     Scalar4 angvel2;
488     computeAngularVelocity(angmom2, moment_inertia, ex_space, ey_space, ez_space, angvel2);
490     akin_r = angmom2.x * angvel2.x + angmom2.y * angvel2.y + angmom2.z * angvel2.z;
492     rdata_vel[idx_body] = vel2;
493     rdata_angmom[idx_body] = angmom2;
494     rdata_angvel[idx_body] = angvel2;
495     rdata_conjqm[idx_body] = conjqm2;
497     npt_rdata_partial_Ksum_t[group_idx] = akin_t;
498     npt_rdata_partial_Ksum_r[group_idx] = akin_r;
499     }
501 /*! \param rigid_data Rigid body data to step forward 1/2 step
502     \param d_group_members Device array listing the indicies of the mebers of the group to integrate
503     \param group_size Number of members in the group
504     \param d_net_force Particle net forces
505     \param d_net_virial Particle net virial
506     \param box Box dimensions for periodic boundary condition handling
507     \param npt_rdata Thermostat/barostat data
508     \param deltaT Amount of real time to step forward in one time step
511 cudaError_t gpu_nph_rigid_step_two( const gpu_rigid_data_arrays& rigid_data,
512                                     unsigned int *d_group_members,
513                                     unsigned int group_size,
514                                     Scalar4 *d_net_force,
515                                     Scalar *d_net_virial,
516                                     const BoxDim& box,
517                                     const gpu_npt_rigid_data& npt_rdata,
518                                     Scalar deltaT)
519     {
520     unsigned int n_bodies = rigid_data.n_bodies;
521     unsigned int n_group_bodies = rigid_data.n_group_bodies;
523     unsigned int block_size = 64;
524     unsigned int n_blocks = n_group_bodies / block_size + 1;
525     dim3 body_grid(n_blocks, 1, 1);
526     dim3 body_threads(block_size, 1, 1);
527     gpu_nph_rigid_step_two_body_kernel<<< body_grid, body_threads >>>(rigid_data.vel,
528                                                                     rigid_data.angmom,
529                                                                     rigid_data.angvel,
530                                                                     rigid_data.orientation,
531                                                                     rigid_data.conjqm,
532                                                                     rigid_data.body_mass,
533                                                                     rigid_data.moment_inertia,
534                                                                     rigid_data.force,
535                                                                     rigid_data.torque,
536                                                                     rigid_data.body_indices,
537                                                                     n_group_bodies,
538                                                                     n_bodies,
539                                                                     npt_rdata.epsilon_dot,
540                                                                     npt_rdata.partial_Ksum_t,
541                                                                     npt_rdata.partial_Ksum_r,
542                                                                     npt_rdata.nf_t,
543                                                                     npt_rdata.nf_r,
544                                                                     npt_rdata.dimension,
545                                                                     box,
546                                                                     deltaT);
549     return cudaSuccess;
550     }
552 #pragma mark RIGID_KINETIC_ENERGY_REDUCTION
554 //! Shared memory for kinetic energy reduction
555 extern __shared__ Scalar nph_rigid_sdata[];
557 /*! Summing the kinetic energy of rigid bodies
558     \param npt_rdata Thermostat data for rigid bodies
561 extern "C" __global__ void gpu_nph_rigid_reduce_ksum_kernel(Scalar* npt_rdata_partial_Ksum_t,
562                                                             Scalar* npt_rdata_partial_Ksum_r,
563                                                             Scalar* npt_rdata_Ksum_t,
564                                                             Scalar* npt_rdata_Ksum_r,
565                                                             unsigned int n_bodies)
566     {
567     int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
569     Scalar* body_ke_t = nph_rigid_sdata;
570     Scalar* body_ke_r = &nph_rigid_sdata[blockDim.x];
572     Scalar Ksum_t = 0, Ksum_r=0;
574     // sum up the values in the partial sum via a sliding window
575     for (int start = 0; start < n_bodies; start += blockDim.x)
576         {
577         if (start + threadIdx.x < n_bodies)
578             {
579             body_ke_t[threadIdx.x] = npt_rdata_partial_Ksum_t[start + threadIdx.x];
580             body_ke_r[threadIdx.x] = npt_rdata_partial_Ksum_r[start + threadIdx.x];
581             }
582         else
583             {
584             body_ke_t[threadIdx.x] = 0;
585             body_ke_r[threadIdx.x] = 0;
586             }
587         __syncthreads();
589         // reduce the sum within a block
590         int offset = blockDim.x >> 1;
591         while (offset > 0)
592             {
593             if (threadIdx.x < offset)
594                 {
595                 body_ke_t[threadIdx.x] += body_ke_t[threadIdx.x + offset];
596                 body_ke_r[threadIdx.x] += body_ke_r[threadIdx.x + offset];
597                 }
598             offset >>= 1;
599             __syncthreads();
600             }
602         // everybody sums up Ksum
603         Ksum_t += body_ke_t[0];
604         Ksum_r += body_ke_r[0];
605         }
607     __syncthreads();
610     if (global_idx == 0)
611         {
612         *npt_rdata_Ksum_t = Ksum_t;
613         *npt_rdata_Ksum_r = Ksum_r;
614         }
616     }
619     \param npt_rdata Thermostat/barostat data for rigid bodies
621 cudaError_t gpu_nph_rigid_reduce_ksum(const gpu_npt_rigid_data& npt_rdata)
622     {
623     // setup the grid to run the kernel
624     int block_size = 128;
625     dim3 grid( 1, 1, 1);
626     dim3 threads(block_size, 1, 1);
628     // run the kernel: double the block size to accomodate Ksum_t and Ksum_r
629     gpu_nph_rigid_reduce_ksum_kernel<<< grid, threads, 2 * block_size * sizeof(Scalar) >>>(npt_rdata.partial_Ksum_t,
630                                                                                           npt_rdata.partial_Ksum_r,
631                                                                                           npt_rdata.Ksum_t,
632                                                                                           npt_rdata.Ksum_r,
633                                                                                           npt_rdata.n_bodies);
635     return cudaSuccess;
636     }