libhoomd/cuda/ComputeThermoGPU.cu

   1 /*
   2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
   3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
   4 the University of Michigan All rights reserved.
   5
   6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
   7 copyright is held, by various Contributors who have granted The Regents of the
   8 University of Michigan the right to modify and/or distribute such Contributions.
   9
  10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
  11 and binary forms, provided you abide by the following conditions:
  12
  13 * Redistributions of source code must retain the above copyright notice, this
  14 list of conditions, and the following disclaimer both in the code and
  15 prominently in any materials provided with the distribution.
  16
  17 * Redistributions in binary form must reproduce the above copyright notice, this
  18 list of conditions, and the following disclaimer in the documentation and/or
  19 other materials provided with the distribution.
  20
  21 * All publications and presentations based on HOOMD-blue, including any reports
  22 or published results obtained, in whole or in part, with HOOMD-blue, will
  23 acknowledge its use according to the terms posted at the time of submission on:
  24 http://codeblue.umich.edu/hoomd-blue/citations.html
  25
  26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
  27 http://codeblue.umich.edu/hoomd-blue/
  28
  29 * Apart from the above required attributions, neither the name of the copyright
  30 holder nor the names of HOOMD-blue's contributors may be used to endorse or
  31 promote products derived from this software without specific prior written
  32 permission.
  33
  34 Disclaimer
  35
  36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
  37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
  39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
  40
  41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
  46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  48 */
  49
  50 // Maintainer: joaander
  51
  52 #include "ComputeThermoGPU.cuh"
  53
  54 #ifdef WIN32
  55 #include <cassert>
  56 #else
  57 #include <assert.h>
  58 #endif
  59
  60 //! Shared memory used in reducing the sums
  61 extern __shared__ Scalar3 compute_thermo_sdata[];
  62 //! Shared memory used in reducing the sums of the pressure tensor
  63 extern __shared__ Scalar compute_pressure_tensor_sdata[];
  64
  65 /*! \file ComputeThermoGPU.cu
  66     \brief Defines GPU kernel code for computing thermodynamic properties on the GPU. Used by ComputeThermoGPU.
  67 */
  68
  69 //! Perform partial sums of the thermo properties on the GPU
  70 /*! \param d_scratch Scratch space to hold partial sums. One element is written per block
  71     \param d_net_force Net force / pe array from ParticleData
  72     \param d_net_virial Net virial array from ParticleData
  73     \param virial_pitch pitch of 2D virial array
  74     \param d_velocity Particle velocity and mass array from ParticleData
  75     \param d_group_members List of group members for which to sum properties
  76     \param group_size Number of particles in the group
  77
  78     All partial sums are packaged up in a Scalar4 to keep pointer management down.
  79      - 2*Kinetic energy is summed in .x
  80      - Potential energy is summed in .y
  81      - W is summed in .z
  82
  83     One thread is executed per group member. That thread reads in the values for its member into shared memory
  84     and then the block performs a reduction in parallel to produce a partial sum output for the block. These
  85     partial sums are written to d_scratch[blockIdx.x]. sizeof(Scalar3)*block_size of dynamic shared memory are needed
  86     for this kernel to run.
  87 */
  88
  89 __global__ void gpu_compute_thermo_partial_sums(Scalar4 *d_scratch,
  90                                                 Scalar4 *d_net_force,
  91                                                 Scalar *d_net_virial,
  92                                                 const unsigned int virial_pitch,
  93                                                 Scalar4 *d_velocity,
  94                                                 unsigned int *d_group_members,
  95                                                 unsigned int group_size)
  96     {
  97     // determine which particle this thread works on
  98     int group_idx = blockIdx.x * blockDim.x + threadIdx.x;
  99
 100     Scalar3 my_element; // element of scratch space read in
 101     if (group_idx < group_size)
 102         {
 103         unsigned int idx = d_group_members[group_idx];
 104
 105         // update positions to the next timestep and update velocities to the next half step
 106         Scalar4 net_force = d_net_force[idx];
 107         Scalar net_isotropic_virial;
 108         // (1/3)*trace of virial tensor
 109         net_isotropic_virial = Scalar(1.0/3.0)*
 110                                (d_net_virial[0*virial_pitch+idx]   // xx
 111                                +d_net_virial[3*virial_pitch+idx]   // yy
 112                                +d_net_virial[5*virial_pitch+idx]); // zz
 113         Scalar4 vel = d_velocity[idx];
 114         Scalar mass = vel.w;
 115
 116         // compute our contribution to the sum
 117         my_element.x = mass * (vel.x*vel.x + vel.y*vel.y + vel.z*vel.z);
 118         my_element.y = net_force.w;
 119         my_element.z = net_isotropic_virial;
 120         }
 121     else
 122         {
 123         // non-participating thread: contribute 0 to the sum
 124         my_element = make_scalar3(0, 0, 0);
 125         }
 126
 127     compute_thermo_sdata[threadIdx.x] = my_element;
 128     __syncthreads();
 129
 130     // reduce the sum in parallel
 131     int offs = blockDim.x >> 1;
 132     while (offs > 0)
 133         {
 134         if (threadIdx.x < offs)
 135             {
 136             compute_thermo_sdata[threadIdx.x].x += compute_thermo_sdata[threadIdx.x + offs].x;
 137             compute_thermo_sdata[threadIdx.x].y += compute_thermo_sdata[threadIdx.x + offs].y;
 138             compute_thermo_sdata[threadIdx.x].z += compute_thermo_sdata[threadIdx.x + offs].z;
 139             }
 140         offs >>= 1;
 141         __syncthreads();
 142         }
 143
 144     // write out our partial sum
 145     if (threadIdx.x == 0)
 146         {
 147         Scalar3 res = compute_thermo_sdata[0];
 148         d_scratch[blockIdx.x] = make_scalar4(res.x, res.y, res.z, 0);
 149         }
 150     }
 151
 152 //! Perform partial sums of the pressure tensor on the GPU
 153 /*! \param d_scratch Scratch space to hold partial sums. One element is written per block
 154     \param d_net_force Net force / pe array from ParticleData
 155     \param d_net_virial Net virial array from ParticleData
 156     \param virial_pitch pitch of 2D virial array
 157     \param d_velocity Particle velocity and mass array from ParticleData
 158     \param d_group_members List of group members for which to sum properties
 159     \param group_size Number of particles in the group
 160
 161     One thread is executed per group member. That thread reads in the six values (components of the presure tensor)
 162     for its member into shared memory and then the block performs a reduction in parallel to produce a partial sum output for the block.
 163     These partial sums are written to d_scratch[i*gridDim.x + blockIdx.x], where i=0..5 is the index of the component.
 164     For this kernel to run, 6*sizeof(Scalar)*block_size of dynamic shared memory are needed.
 165 */
 166
 167 __global__ void gpu_compute_pressure_tensor_partial_sums(Scalar *d_scratch,
 168                                                 Scalar4 *d_net_force,
 169                                                 Scalar *d_net_virial,
 170                                                 const unsigned int virial_pitch,
 171                                                 Scalar4 *d_velocity,
 172                                                 unsigned int *d_group_members,
 173                                                 unsigned int group_size)
 174     {
 175     // determine which particle this thread works on
 176     int group_idx = blockIdx.x * blockDim.x + threadIdx.x;
 177
 178     Scalar my_element[6]; // element of scratch space read in
 179     if (group_idx < group_size)
 180         {
 181         unsigned int idx = d_group_members[group_idx];
 182
 183         // compute contribution to pressure tensor and store it in my_element
 184         Scalar4 vel = d_velocity[idx];
 185         Scalar mass = vel.w;
 186         my_element[0] = mass*vel.x*vel.x + d_net_virial[0*virial_pitch+idx];   // xx
 187         my_element[1] = mass*vel.x*vel.y + d_net_virial[1*virial_pitch+idx];   // xy
 188         my_element[2] = mass*vel.x*vel.z + d_net_virial[2*virial_pitch+idx];   // xz
 189         my_element[3] = mass*vel.y*vel.y + d_net_virial[3*virial_pitch+idx];   // yy
 190         my_element[4] = mass*vel.y*vel.z + d_net_virial[4*virial_pitch+idx];   // yz
 191         my_element[5] = mass*vel.z*vel.z + d_net_virial[5*virial_pitch+idx];   // zz
 192         }
 193     else
 194         {
 195         // non-participating thread: contribute 0 to the sum
 196         my_element[0] = 0;
 197         my_element[1] = 0;
 198         my_element[2] = 0;
 199         my_element[3] = 0;
 200         my_element[4] = 0;
 201         my_element[5] = 0;
 202         }
 203
 204     for (unsigned int i = 0; i < 6; i++)
 205         compute_pressure_tensor_sdata[i*blockDim.x+threadIdx.x] = my_element[i];
 206
 207     __syncthreads();
 208
 209     // reduce the sum in parallel
 210     int offs = blockDim.x >> 1;
 211     while (offs > 0)
 212         {
 213         if (threadIdx.x < offs)
 214             {
 215             for (unsigned int i = 0; i < 6; i++)
 216                 compute_pressure_tensor_sdata[i*blockDim.x+threadIdx.x] += compute_pressure_tensor_sdata[i*blockDim.x + threadIdx.x + offs];
 217             }
 218         offs >>= 1;
 219         __syncthreads();
 220         }
 221
 222     // write out our partial sum
 223     if (threadIdx.x == 0)
 224         {
 225         for (unsigned int i = 0; i < 6; i++)
 226             d_scratch[gridDim.x * i + blockIdx.x] = compute_pressure_tensor_sdata[i*blockDim.x];
 227         }
 228     }
 229
 230 //! Complete partial sums and compute final thermodynamic quantities (for pressure, only isotropic contribution)
 231 /*! \param d_properties Property array to write final values
 232     \param d_scratch Partial sums
 233     \param ndof Number of degrees of freedom this group posesses
 234     \param box Box the particles are in
 235     \param D Dimensionality of the system
 236     \param group_size Number of particles in the group
 237     \param num_partial_sums Number of partial sums in \a d_scratch
 238     \param external_virial External contribution to virial (1/3 trace)
 239
 240
 241     Only one block is executed. In that block, the partial sums are read in and reduced to final values. From the final
 242     sums, the thermodynamic properties are computed and written to d_properties.
 243
 244     sizeof(Scalar3)*block_size bytes of shared memory are needed for this kernel to run.
 245 */
 246 __global__ void gpu_compute_thermo_final_sums(Scalar *d_properties,
 247                                               Scalar4 *d_scratch,
 248                                               unsigned int ndof,
 249                                               BoxDim box,
 250                                               unsigned int D,
 251                                               unsigned int group_size,
 252                                               unsigned int num_partial_sums,
 253                                               Scalar external_virial
 254                                               )
 255     {
 256     Scalar3 final_sum = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
 257
 258     // sum up the values in the partial sum via a sliding window
 259     for (int start = 0; start < num_partial_sums; start += blockDim.x)
 260         {
 261         __syncthreads();
 262         if (start + threadIdx.x < num_partial_sums)
 263             {
 264             Scalar4 scratch = d_scratch[start + threadIdx.x];
 265             compute_thermo_sdata[threadIdx.x] = make_scalar3(scratch.x, scratch.y, scratch.z);
 266             }
 267         else
 268             compute_thermo_sdata[threadIdx.x] = make_scalar3(Scalar(0.0), Scalar(0.0), Scalar(0.0));
 269         __syncthreads();
 270
 271         // reduce the sum in parallel
 272         int offs = blockDim.x >> 1;
 273         while (offs > 0)
 274             {
 275             if (threadIdx.x < offs)
 276                 {
 277                 compute_thermo_sdata[threadIdx.x].x += compute_thermo_sdata[threadIdx.x + offs].x;
 278                 compute_thermo_sdata[threadIdx.x].y += compute_thermo_sdata[threadIdx.x + offs].y;
 279                 compute_thermo_sdata[threadIdx.x].z += compute_thermo_sdata[threadIdx.x + offs].z;
 280                 }
 281             offs >>= 1;
 282             __syncthreads();
 283             }
 284
 285         if (threadIdx.x == 0)
 286             {
 287             final_sum.x += compute_thermo_sdata[0].x;
 288             final_sum.y += compute_thermo_sdata[0].y;
 289             final_sum.z += compute_thermo_sdata[0].z;
 290             }
 291         }
 292
 293     if (threadIdx.x == 0)
 294         {
 295         // compute final quantities
 296         Scalar ke_total = final_sum.x * Scalar(0.5);
 297         Scalar pe_total = final_sum.y;
 298         Scalar W = final_sum.z + external_virial;
 299
 300         // compute the temperature
 301         Scalar temperature = Scalar(2.0) * Scalar(ke_total) / Scalar(ndof);
 302
 303         // compute the pressure
 304         // volume/area & other 2D stuff needed
 305
 306         Scalar volume;
 307         Scalar3 L = box.getL();
 308
 309         if (D == 2)
 310             {
 311             // "volume" is area in 2D
 312             volume = L.x * L.y;
 313             // W needs to be corrected since the 1/3 factor is built in
 314             W *= Scalar(3.0)/Scalar(2.0);
 315             }
 316         else
 317             {
 318             volume = L.x * L.y * L.z;
 319             }
 320
 321         // pressure: P = (N * K_B * T + W)/V
 322         Scalar pressure =  (Scalar(2.0) * ke_total / Scalar(D) + W) / volume;
 323
 324         // fill out the GPUArray
 325         d_properties[thermo_index::temperature] = temperature;
 326         d_properties[thermo_index::pressure] = pressure;
 327         d_properties[thermo_index::kinetic_energy] = Scalar(ke_total);
 328         d_properties[thermo_index::potential_energy] = Scalar(pe_total);
 329         }
 330     }
 331
 332 //! Complete partial sums and compute final pressure tensor
 333 /*! \param d_properties Property array to write final values
 334     \param d_scratch Partial sums
 335     \param box Box the particles are in
 336     \param group_size Number of particles in the group
 337     \param num_partial_sums Number of partial sums in \a d_scratch
 338
 339     \param external_virial_xx External contribution to virial (xx component)
 340     \param external_virial_xy External contribution to virial (xy component)
 341     \param external_virial_xz External contribution to virial (xz component)
 342     \param external_virial_yy External contribution to virial (yy component)
 343     \param external_virial_yz External contribution to virial (yz component)
 344     \param external_virial_zz External contribution to virial (zz component)
 345
 346     Only one block is executed. In that block, the partial sums are read in and reduced to final values. From the final
 347     sums, the thermodynamic properties are computed and written to d_properties.
 348
 349     6*sizeof(Scalar)*block_size bytes of shared memory are needed for this kernel to run.
 350 */
 351 __global__ void gpu_compute_pressure_tensor_final_sums(Scalar *d_properties,
 352                                               Scalar *d_scratch,
 353                                               BoxDim box,
 354                                               unsigned int group_size,
 355                                               unsigned int num_partial_sums,
 356                                               Scalar external_virial_xx,
 357                                               Scalar external_virial_xy,
 358                                               Scalar external_virial_xz,
 359                                               Scalar external_virial_yy,
 360                                               Scalar external_virial_yz,
 361                                               Scalar external_virial_zz,
 362                                               bool twod)
 363     {
 364     Scalar final_sum[6];
 365
 366     final_sum[0] = external_virial_xx;
 367     final_sum[1] = external_virial_xy;
 368     final_sum[2] = external_virial_xz;
 369     final_sum[3] = external_virial_yy;
 370     final_sum[4] = external_virial_yz;
 371     final_sum[5] = external_virial_zz;
 372
 373     // sum up the values in the partial sum via a sliding window
 374     for (int start = 0; start < num_partial_sums; start += blockDim.x)
 375         {
 376         __syncthreads();
 377         if (start + threadIdx.x < num_partial_sums)
 378             {
 379             for (unsigned int i = 0; i < 6; i++)
 380                 compute_pressure_tensor_sdata[i * blockDim.x + threadIdx.x] = d_scratch[i*num_partial_sums + start + threadIdx.x];
 381             }
 382         else
 383             for (unsigned int i = 0; i < 6; i++)
 384                 compute_pressure_tensor_sdata[i * blockDim.x + threadIdx.x] = Scalar(0.0);
 385         __syncthreads();
 386
 387         // reduce the sum in parallel
 388         int offs = blockDim.x >> 1;
 389         while (offs > 0)
 390             {
 391             if (threadIdx.x < offs)
 392                 {
 393                 for (unsigned int i = 0; i < 6; i++)
 394                     compute_pressure_tensor_sdata[i*blockDim.x + threadIdx.x] += compute_pressure_tensor_sdata[i*blockDim.x + threadIdx.x + offs];
 395                 }
 396             offs >>= 1;
 397             __syncthreads();
 398             }
 399
 400         if (threadIdx.x == 0)
 401             {
 402             for (unsigned int i = 0; i < 6; i++)
 403                 final_sum[i] += compute_pressure_tensor_sdata[i*blockDim.x];
 404             }
 405         }
 406
 407     if (threadIdx.x == 0)
 408         {
 409         // fill out the GPUArray
 410         // we have thus far calculated the sum of the kinetic part of the pressure tensor
 411         // and the virial part, the definition includes an inverse factor of the box volume
 412         Scalar V = box.getVolume(twod);
 413
 414         d_properties[thermo_index::pressure_xx] = final_sum[0]/V;
 415         d_properties[thermo_index::pressure_xy] = final_sum[1]/V;
 416         d_properties[thermo_index::pressure_xz] = final_sum[2]/V;
 417         d_properties[thermo_index::pressure_yy] = final_sum[3]/V;
 418         d_properties[thermo_index::pressure_yz] = final_sum[4]/V;
 419         d_properties[thermo_index::pressure_zz] = final_sum[5]/V;
 420         }
 421     }
 422
 423 //! Compute thermodynamic properties of a group on the GPU
 424 /*! \param d_properties Array to write computed properties
 425     \param d_vel particle velocities and masses on the GPU
 426     \param d_group_members List of group members
 427     \param group_size Number of group members
 428     \param box Box the particles are in
 429     \param args Additional arguments
 430     \param compute_pressure_tensor whether to compute the full pressure tensor
 431
 432     This function drives gpu_compute_thermo_partial_sums and gpu_compute_thermo_final_sums, see them for details.
 433 */
 434
 435 cudaError_t gpu_compute_thermo(Scalar *d_properties,
 436                                Scalar4 *d_vel,
 437                                unsigned int *d_group_members,
 438                                unsigned int group_size,
 439                                const BoxDim& box,
 440                                const compute_thermo_args& args,
 441                                const bool compute_pressure_tensor
 442                                )
 443     {
 444     assert(d_properties);
 445     assert(d_vel);
 446     assert(d_group_members);
 447     assert(args.d_net_force);
 448     assert(args.d_net_virial);
 449     assert(args.d_scratch);
 450
 451     dim3 grid(args.n_blocks, 1, 1);
 452     dim3 threads(args.block_size, 1, 1);
 453     unsigned int shared_bytes = sizeof(Scalar3)*args.block_size;
 454
 455     Scalar external_virial = Scalar(1.0/3.0)*(args.external_virial_xx
 456                              + args.external_virial_yy
 457                              + args.external_virial_zz);
 458
 459     gpu_compute_thermo_partial_sums<<<grid,threads, shared_bytes>>>(args.d_scratch,
 460                                                                     args.d_net_force,
 461                                                                     args.d_net_virial,
 462                                                                     args.virial_pitch,
 463                                                                     d_vel,
 464                                                                     d_group_members,
 465                                                                     group_size);
 466
 467
 468     if (compute_pressure_tensor)
 469         {
 470         assert(args.d_scratch_pressure_tensor);
 471
 472         shared_bytes = 6 * sizeof(Scalar) * args.block_size;
 473         // run the kernel
 474         gpu_compute_pressure_tensor_partial_sums<<<grid, threads, shared_bytes>>>(args.d_scratch_pressure_tensor,
 475                                                                                   args.d_net_force,
 476                                                                                   args.d_net_virial,
 477                                                                                   args.virial_pitch,
 478                                                                                   d_vel,
 479                                                                                   d_group_members,
 480                                                                                   group_size);
 481         }
 482
 483     // setup the grid to run the final kernel
 484     int final_block_size = 512;
 485     grid = dim3(1, 1, 1);
 486     threads = dim3(final_block_size, 1, 1);
 487     shared_bytes = sizeof(Scalar3)*final_block_size;
 488
 489     // run the kernel
 490     gpu_compute_thermo_final_sums<<<grid, threads, shared_bytes>>>(d_properties,
 491                                                                    args.d_scratch,
 492                                                                    args.ndof,
 493                                                                    box,
 494                                                                    args.D,
 495                                                                    group_size,
 496                                                                    args.n_blocks,
 497                                                                    external_virial);
 498
 499     if (compute_pressure_tensor)
 500         {
 501         shared_bytes = 6 * sizeof(Scalar) * final_block_size;
 502         // run the kernel
 503         gpu_compute_pressure_tensor_final_sums<<<grid, threads, shared_bytes>>>(d_properties,
 504                                                                                args.d_scratch_pressure_tensor,
 505                                                                                box,
 506                                                                                group_size,
 507                                                                                args.n_blocks,
 508                                                                                args.external_virial_xx,
 509                                                                                args.external_virial_xy,
 510                                                                                args.external_virial_xz,
 511                                                                                args.external_virial_yy,
 512                                                                                args.external_virial_yz,
 513                                                                                args.external_virial_zz,
 514                                                                                args.D == 2
 515                                                                                );
 516         }
 517
 518     return cudaSuccess;
 519     }