Fix potential problem in Messenger related to MPI window
[hoomd-blue.git] / libhoomd / cuda / ComputeThermoGPU.cu
blob0e9766cd50a681da88697b8e43d0ecd9968d5017
1 /*
2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
4 the University of Michigan All rights reserved.
6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
7 copyright is held, by various Contributors who have granted The Regents of the
8 University of Michigan the right to modify and/or distribute such Contributions.
10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
11 and binary forms, provided you abide by the following conditions:
13 * Redistributions of source code must retain the above copyright notice, this
14 list of conditions, and the following disclaimer both in the code and
15 prominently in any materials provided with the distribution.
17 * Redistributions in binary form must reproduce the above copyright notice, this
18 list of conditions, and the following disclaimer in the documentation and/or
19 other materials provided with the distribution.
21 * All publications and presentations based on HOOMD-blue, including any reports
22 or published results obtained, in whole or in part, with HOOMD-blue, will
23 acknowledge its use according to the terms posted at the time of submission on:
24 http://codeblue.umich.edu/hoomd-blue/citations.html
26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
27 http://codeblue.umich.edu/hoomd-blue/
29 * Apart from the above required attributions, neither the name of the copyright
30 holder nor the names of HOOMD-blue's contributors may be used to endorse or
31 promote products derived from this software without specific prior written
32 permission.
34 Disclaimer
36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 // Maintainer: joaander
52 #include "ComputeThermoGPU.cuh"
54 #ifdef WIN32
55 #include <cassert>
56 #else
57 #include <assert.h>
58 #endif
60 //! Shared memory used in reducing the sums
61 extern __shared__ Scalar3 compute_thermo_sdata[];
62 //! Shared memory used in reducing the sums of the pressure tensor
63 extern __shared__ Scalar compute_pressure_tensor_sdata[];
65 /*! \file ComputeThermoGPU.cu
66     \brief Defines GPU kernel code for computing thermodynamic properties on the GPU. Used by ComputeThermoGPU.
69 //! Perform partial sums of the thermo properties on the GPU
70 /*! \param d_scratch Scratch space to hold partial sums. One element is written per block
71     \param d_net_force Net force / pe array from ParticleData
72     \param d_net_virial Net virial array from ParticleData
73     \param virial_pitch pitch of 2D virial array
74     \param d_velocity Particle velocity and mass array from ParticleData
75     \param d_group_members List of group members for which to sum properties
76     \param group_size Number of particles in the group
78     All partial sums are packaged up in a Scalar4 to keep pointer management down.
79      - 2*Kinetic energy is summed in .x
80      - Potential energy is summed in .y
81      - W is summed in .z
83     One thread is executed per group member. That thread reads in the values for its member into shared memory
84     and then the block performs a reduction in parallel to produce a partial sum output for the block. These
85     partial sums are written to d_scratch[blockIdx.x]. sizeof(Scalar3)*block_size of dynamic shared memory are needed
86     for this kernel to run.
89 __global__ void gpu_compute_thermo_partial_sums(Scalar4 *d_scratch,
90                                                 Scalar4 *d_net_force,
91                                                 Scalar *d_net_virial,
92                                                 const unsigned int virial_pitch,
93                                                 Scalar4 *d_velocity,
94                                                 unsigned int *d_group_members,
95                                                 unsigned int group_size)
96     {
97     // determine which particle this thread works on
98     int group_idx = blockIdx.x * blockDim.x + threadIdx.x;
100     Scalar3 my_element; // element of scratch space read in
101     if (group_idx < group_size)
102         {
103         unsigned int idx = d_group_members[group_idx];
105         // update positions to the next timestep and update velocities to the next half step
106         Scalar4 net_force = d_net_force[idx];
107         Scalar net_isotropic_virial;
108         // (1/3)*trace of virial tensor
109         net_isotropic_virial = Scalar(1.0/3.0)*
110                                (d_net_virial[0*virial_pitch+idx]   // xx
111                                +d_net_virial[3*virial_pitch+idx]   // yy
112                                +d_net_virial[5*virial_pitch+idx]); // zz
113         Scalar4 vel = d_velocity[idx];
114         Scalar mass = vel.w;
116         // compute our contribution to the sum
117         my_element.x = mass * (vel.x*vel.x + vel.y*vel.y + vel.z*vel.z);
118         my_element.y = net_force.w;
119         my_element.z = net_isotropic_virial;
120         }
121     else
122         {
123         // non-participating thread: contribute 0 to the sum
124         my_element = make_scalar3(0, 0, 0);
125         }
127     compute_thermo_sdata[threadIdx.x] = my_element;
128     __syncthreads();
130     // reduce the sum in parallel
131     int offs = blockDim.x >> 1;
132     while (offs > 0)
133         {
134         if (threadIdx.x < offs)
135             {
136             compute_thermo_sdata[threadIdx.x].x += compute_thermo_sdata[threadIdx.x + offs].x;
137             compute_thermo_sdata[threadIdx.x].y += compute_thermo_sdata[threadIdx.x + offs].y;
138             compute_thermo_sdata[threadIdx.x].z += compute_thermo_sdata[threadIdx.x + offs].z;
139             }
140         offs >>= 1;
141         __syncthreads();
142         }
144     // write out our partial sum
145     if (threadIdx.x == 0)
146         {
147         Scalar3 res = compute_thermo_sdata[0];
148         d_scratch[blockIdx.x] = make_scalar4(res.x, res.y, res.z, 0);
149         }
150     }
152 //! Perform partial sums of the pressure tensor on the GPU
153 /*! \param d_scratch Scratch space to hold partial sums. One element is written per block
154     \param d_net_force Net force / pe array from ParticleData
155     \param d_net_virial Net virial array from ParticleData
156     \param virial_pitch pitch of 2D virial array
157     \param d_velocity Particle velocity and mass array from ParticleData
158     \param d_group_members List of group members for which to sum properties
159     \param group_size Number of particles in the group
161     One thread is executed per group member. That thread reads in the six values (components of the presure tensor)
162     for its member into shared memory and then the block performs a reduction in parallel to produce a partial sum output for the block.
163     These partial sums are written to d_scratch[i*gridDim.x + blockIdx.x], where i=0..5 is the index of the component.
164     For this kernel to run, 6*sizeof(Scalar)*block_size of dynamic shared memory are needed.
167 __global__ void gpu_compute_pressure_tensor_partial_sums(Scalar *d_scratch,
168                                                 Scalar4 *d_net_force,
169                                                 Scalar *d_net_virial,
170                                                 const unsigned int virial_pitch,
171                                                 Scalar4 *d_velocity,
172                                                 unsigned int *d_group_members,
173                                                 unsigned int group_size)
174     {
175     // determine which particle this thread works on
176     int group_idx = blockIdx.x * blockDim.x + threadIdx.x;
178     Scalar my_element[6]; // element of scratch space read in
179     if (group_idx < group_size)
180         {
181         unsigned int idx = d_group_members[group_idx];
183         // compute contribution to pressure tensor and store it in my_element
184         Scalar4 vel = d_velocity[idx];
185         Scalar mass = vel.w;
186         my_element[0] = mass*vel.x*vel.x + d_net_virial[0*virial_pitch+idx];   // xx
187         my_element[1] = mass*vel.x*vel.y + d_net_virial[1*virial_pitch+idx];   // xy
188         my_element[2] = mass*vel.x*vel.z + d_net_virial[2*virial_pitch+idx];   // xz
189         my_element[3] = mass*vel.y*vel.y + d_net_virial[3*virial_pitch+idx];   // yy
190         my_element[4] = mass*vel.y*vel.z + d_net_virial[4*virial_pitch+idx];   // yz
191         my_element[5] = mass*vel.z*vel.z + d_net_virial[5*virial_pitch+idx];   // zz
192         }
193     else
194         {
195         // non-participating thread: contribute 0 to the sum
196         my_element[0] = 0;
197         my_element[1] = 0;
198         my_element[2] = 0;
199         my_element[3] = 0;
200         my_element[4] = 0;
201         my_element[5] = 0;
202         }
204     for (unsigned int i = 0; i < 6; i++)
205         compute_pressure_tensor_sdata[i*blockDim.x+threadIdx.x] = my_element[i];
207     __syncthreads();
209     // reduce the sum in parallel
210     int offs = blockDim.x >> 1;
211     while (offs > 0)
212         {
213         if (threadIdx.x < offs)
214             {
215             for (unsigned int i = 0; i < 6; i++)
216                 compute_pressure_tensor_sdata[i*blockDim.x+threadIdx.x] += compute_pressure_tensor_sdata[i*blockDim.x + threadIdx.x + offs];
217             }
218         offs >>= 1;
219         __syncthreads();
220         }
222     // write out our partial sum
223     if (threadIdx.x == 0)
224         {
225         for (unsigned int i = 0; i < 6; i++)
226             d_scratch[gridDim.x * i + blockIdx.x] = compute_pressure_tensor_sdata[i*blockDim.x];
227         }
228     }
230 //! Complete partial sums and compute final thermodynamic quantities (for pressure, only isotropic contribution)
231 /*! \param d_properties Property array to write final values
232     \param d_scratch Partial sums
233     \param ndof Number of degrees of freedom this group posesses
234     \param box Box the particles are in
235     \param D Dimensionality of the system
236     \param group_size Number of particles in the group
237     \param num_partial_sums Number of partial sums in \a d_scratch
238     \param external_virial External contribution to virial (1/3 trace)
241     Only one block is executed. In that block, the partial sums are read in and reduced to final values. From the final
242     sums, the thermodynamic properties are computed and written to d_properties.
244     sizeof(Scalar3)*block_size bytes of shared memory are needed for this kernel to run.
246 __global__ void gpu_compute_thermo_final_sums(Scalar *d_properties,
247                                               Scalar4 *d_scratch,
248                                               unsigned int ndof,
249                                               BoxDim box,
250                                               unsigned int D,
251                                               unsigned int group_size,
252                                               unsigned int num_partial_sums,
253                                               Scalar external_virial
254                                               )
255     {
256     Scalar3 final_sum = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
258     // sum up the values in the partial sum via a sliding window
259     for (int start = 0; start < num_partial_sums; start += blockDim.x)
260         {
261         __syncthreads();
262         if (start + threadIdx.x < num_partial_sums)
263             {
264             Scalar4 scratch = d_scratch[start + threadIdx.x];
265             compute_thermo_sdata[threadIdx.x] = make_scalar3(scratch.x, scratch.y, scratch.z);
266             }
267         else
268             compute_thermo_sdata[threadIdx.x] = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
269         __syncthreads();
271         // reduce the sum in parallel
272         int offs = blockDim.x >> 1;
273         while (offs > 0)
274             {
275             if (threadIdx.x < offs)
276                 {
277                 compute_thermo_sdata[threadIdx.x].x += compute_thermo_sdata[threadIdx.x + offs].x;
278                 compute_thermo_sdata[threadIdx.x].y += compute_thermo_sdata[threadIdx.x + offs].y;
279                 compute_thermo_sdata[threadIdx.x].z += compute_thermo_sdata[threadIdx.x + offs].z;
280                 }
281             offs >>= 1;
282             __syncthreads();
283             }
285         if (threadIdx.x == 0)
286             {
287             final_sum.x += compute_thermo_sdata[0].x;
288             final_sum.y += compute_thermo_sdata[0].y;
289             final_sum.z += compute_thermo_sdata[0].z;
290             }
291         }
293     if (threadIdx.x == 0)
294         {
295         // compute final quantities
296         Scalar ke_total = final_sum.x * Scalar(0.5);
297         Scalar pe_total = final_sum.y;
298         Scalar W = final_sum.z + external_virial;
300         // compute the temperature
301         Scalar temperature = Scalar(2.0) * Scalar(ke_total) / Scalar(ndof);
303         // compute the pressure
304         // volume/area & other 2D stuff needed
306         Scalar volume;
307         Scalar3 L = box.getL();
309         if (D == 2)
310             {
311             // "volume" is area in 2D
312             volume = L.x * L.y;
313             // W needs to be corrected since the 1/3 factor is built in
314             W *= Scalar(3.0)/Scalar(2.0);
315             }
316         else
317             {
318             volume = L.x * L.y * L.z;
319             }
321         // pressure: P = (N * K_B * T + W)/V
322         Scalar pressure =  (Scalar(2.0) * ke_total / Scalar(D) + W) / volume;
324         // fill out the GPUArray
325         d_properties[thermo_index::temperature] = temperature;
326         d_properties[thermo_index::pressure] = pressure;
327         d_properties[thermo_index::kinetic_energy] = Scalar(ke_total);
328         d_properties[thermo_index::potential_energy] = Scalar(pe_total);
329         }
330     }
332 //! Complete partial sums and compute final pressure tensor
333 /*! \param d_properties Property array to write final values
334     \param d_scratch Partial sums
335     \param box Box the particles are in
336     \param group_size Number of particles in the group
337     \param num_partial_sums Number of partial sums in \a d_scratch
339     \param external_virial_xx External contribution to virial (xx component)
340     \param external_virial_xy External contribution to virial (xy component)
341     \param external_virial_xz External contribution to virial (xz component)
342     \param external_virial_yy External contribution to virial (yy component)
343     \param external_virial_yz External contribution to virial (yz component)
344     \param external_virial_zz External contribution to virial (zz component)
346     Only one block is executed. In that block, the partial sums are read in and reduced to final values. From the final
347     sums, the thermodynamic properties are computed and written to d_properties.
349     6*sizeof(Scalar)*block_size bytes of shared memory are needed for this kernel to run.
351 __global__ void gpu_compute_pressure_tensor_final_sums(Scalar *d_properties,
352                                               Scalar *d_scratch,
353                                               BoxDim box,
354                                               unsigned int group_size,
355                                               unsigned int num_partial_sums,
356                                               Scalar external_virial_xx,
357                                               Scalar external_virial_xy,
358                                               Scalar external_virial_xz,
359                                               Scalar external_virial_yy,
360                                               Scalar external_virial_yz,
361                                               Scalar external_virial_zz,
362                                               bool twod)
363     {
364     Scalar final_sum[6];
366     final_sum[0] = external_virial_xx;
367     final_sum[1] = external_virial_xy;
368     final_sum[2] = external_virial_xz;
369     final_sum[3] = external_virial_yy;
370     final_sum[4] = external_virial_yz;
371     final_sum[5] = external_virial_zz;
373     // sum up the values in the partial sum via a sliding window
374     for (int start = 0; start < num_partial_sums; start += blockDim.x)
375         {
376         __syncthreads();
377         if (start + threadIdx.x < num_partial_sums)
378             {
379             for (unsigned int i = 0; i < 6; i++)
380                 compute_pressure_tensor_sdata[i * blockDim.x + threadIdx.x] = d_scratch[i*num_partial_sums + start + threadIdx.x];
381             }
382         else
383             for (unsigned int i = 0; i < 6; i++)
384                 compute_pressure_tensor_sdata[i * blockDim.x + threadIdx.x] = Scalar(0.0);
385         __syncthreads();
387         // reduce the sum in parallel
388         int offs = blockDim.x >> 1;
389         while (offs > 0)
390             {
391             if (threadIdx.x < offs)
392                 {
393                 for (unsigned int i = 0; i < 6; i++)
394                     compute_pressure_tensor_sdata[i*blockDim.x + threadIdx.x] += compute_pressure_tensor_sdata[i*blockDim.x + threadIdx.x + offs];
395                 }
396             offs >>= 1;
397             __syncthreads();
398             }
400         if (threadIdx.x == 0)
401             {
402             for (unsigned int i = 0; i < 6; i++)
403                 final_sum[i] += compute_pressure_tensor_sdata[i*blockDim.x];
404             }
405         }
407     if (threadIdx.x == 0)
408         {
409         // fill out the GPUArray
410         // we have thus far calculated the sum of the kinetic part of the pressure tensor
411         // and the virial part, the definition includes an inverse factor of the box volume
412         Scalar V = box.getVolume(twod);
414         d_properties[thermo_index::pressure_xx] = final_sum[0]/V;
415         d_properties[thermo_index::pressure_xy] = final_sum[1]/V;
416         d_properties[thermo_index::pressure_xz] = final_sum[2]/V;
417         d_properties[thermo_index::pressure_yy] = final_sum[3]/V;
418         d_properties[thermo_index::pressure_yz] = final_sum[4]/V;
419         d_properties[thermo_index::pressure_zz] = final_sum[5]/V;
420         }
421     }
423 //! Compute thermodynamic properties of a group on the GPU
424 /*! \param d_properties Array to write computed properties
425     \param d_vel particle velocities and masses on the GPU
426     \param d_group_members List of group members
427     \param group_size Number of group members
428     \param box Box the particles are in
429     \param args Additional arguments
430     \param compute_pressure_tensor whether to compute the full pressure tensor
432     This function drives gpu_compute_thermo_partial_sums and gpu_compute_thermo_final_sums, see them for details.
435 cudaError_t gpu_compute_thermo(Scalar *d_properties,
436                                Scalar4 *d_vel,
437                                unsigned int *d_group_members,
438                                unsigned int group_size,
439                                const BoxDim& box,
440                                const compute_thermo_args& args,
441                                const bool compute_pressure_tensor
442                                )
443     {
444     assert(d_properties);
445     assert(d_vel);
446     assert(d_group_members);
447     assert(args.d_net_force);
448     assert(args.d_net_virial);
449     assert(args.d_scratch);
451     dim3 grid(args.n_blocks, 1, 1);
452     dim3 threads(args.block_size, 1, 1);
453     unsigned int shared_bytes = sizeof(Scalar3)*args.block_size;
455     Scalar external_virial = Scalar(1.0/3.0)*(args.external_virial_xx
456                              + args.external_virial_yy
457                              + args.external_virial_zz);
459     gpu_compute_thermo_partial_sums<<<grid,threads, shared_bytes>>>(args.d_scratch,
460                                                                     args.d_net_force,
461                                                                     args.d_net_virial,
462                                                                     args.virial_pitch,
463                                                                     d_vel,
464                                                                     d_group_members,
465                                                                     group_size);
468     if (compute_pressure_tensor)
469         {
470         assert(args.d_scratch_pressure_tensor);
472         shared_bytes = 6 * sizeof(Scalar) * args.block_size;
473         // run the kernel
474         gpu_compute_pressure_tensor_partial_sums<<<grid, threads, shared_bytes>>>(args.d_scratch_pressure_tensor,
475                                                                                   args.d_net_force,
476                                                                                   args.d_net_virial,
477                                                                                   args.virial_pitch,
478                                                                                   d_vel,
479                                                                                   d_group_members,
480                                                                                   group_size);
481         }
483     // setup the grid to run the final kernel
484     int final_block_size = 512;
485     grid = dim3(1, 1, 1);
486     threads = dim3(final_block_size, 1, 1);
487     shared_bytes = sizeof(Scalar3)*final_block_size;
489     // run the kernel
490     gpu_compute_thermo_final_sums<<<grid, threads, shared_bytes>>>(d_properties,
491                                                                    args.d_scratch,
492                                                                    args.ndof,
493                                                                    box,
494                                                                    args.D,
495                                                                    group_size,
496                                                                    args.n_blocks,
497                                                                    external_virial);
499     if (compute_pressure_tensor)
500         {
501         shared_bytes = 6 * sizeof(Scalar) * final_block_size;
502         // run the kernel
503         gpu_compute_pressure_tensor_final_sums<<<grid, threads, shared_bytes>>>(d_properties,
504                                                                                args.d_scratch_pressure_tensor,
505                                                                                box,
506                                                                                group_size,
507                                                                                args.n_blocks,
508                                                                                args.external_virial_xx,
509                                                                                args.external_virial_xy,
510                                                                                args.external_virial_xz,
511                                                                                args.external_virial_yy,
512                                                                                args.external_virial_yz,
513                                                                                args.external_virial_zz,
514                                                                                args.D == 2
515                                                                                );
516         }
518     return cudaSuccess;
519     }