/*
Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
(HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
the University of Michigan All rights reserved.

HOOMD-blue may contain modifications ("Contributions") provided, and to which
copyright is held, by various Contributors who have granted The Regents of the
University of Michigan the right to modify and/or distribute such Contributions.

You may redistribute, use, and create derivative works of HOOMD-blue, in source
and binary forms, provided you abide by the following conditions:

* Redistributions of source code must retain the above copyright notice, this
list of conditions, and the following disclaimer both in the code and
prominently in any materials provided with the distribution.

* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions, and the following disclaimer in the documentation and/or
other materials provided with the distribution.

* All publications and presentations based on HOOMD-blue, including any reports
or published results obtained, in whole or in part, with HOOMD-blue, will
acknowledge its use according to the terms posted at the time of submission on:
http://codeblue.umich.edu/hoomd-blue/citations.html

* Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
http://codeblue.umich.edu/hoomd-blue/

* Apart from the above required attributions, neither the name of the copyright
holder nor the names of HOOMD-blue's contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.

Disclaimer

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.

IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Maintainer: jglaser

#include "TwoStepNPTMTKGPU.cuh"

#include <assert.h>
/*! \file TwoStepNPTMTKGPU.cu
    \brief Defines GPU kernel code for NPT integration on the GPU using the
    Martyna-Tobias-Klein update equations. Used by TwoStepNPTMTKGPU.
*/
//! Shared memory used in reducing the sum of the squared velocities
extern __shared__ Scalar npt_mtk_sdata[];
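// Note: npt_mtk_sdata is dynamically sized. Every kernel that reads or writes it
// must be launched with the required size (sizeof(Scalar) * blockDim.x) passed
// as the third <<<>>> launch-configuration parameter, as gpu_npt_mtk_temperature()
// does below.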
//! Kernel to propagate the positions and velocities, first half of NPT update
__global__ void gpu_npt_mtk_step_one_kernel(Scalar4 *d_pos,
                             Scalar4 *d_vel,
                             const Scalar3 *d_accel,
                             unsigned int *d_group_members,
                             unsigned int group_size,
                             Scalar exp_thermo_fac,
                             Scalar mat_exp_v_xx,
                             Scalar mat_exp_v_xy,
                             Scalar mat_exp_v_xz,
                             Scalar mat_exp_v_yy,
                             Scalar mat_exp_v_yz,
                             Scalar mat_exp_v_zz,
                             Scalar mat_exp_v_int_xx,
                             Scalar mat_exp_v_int_xy,
                             Scalar mat_exp_v_int_xz,
                             Scalar mat_exp_v_int_yy,
                             Scalar mat_exp_v_int_yz,
                             Scalar mat_exp_v_int_zz,
                             Scalar mat_exp_r_xx,
                             Scalar mat_exp_r_xy,
                             Scalar mat_exp_r_xz,
                             Scalar mat_exp_r_yy,
                             Scalar mat_exp_r_yz,
                             Scalar mat_exp_r_zz,
                             Scalar mat_exp_r_int_xx,
                             Scalar mat_exp_r_int_xy,
                             Scalar mat_exp_r_int_xz,
                             Scalar mat_exp_r_int_yy,
                             Scalar mat_exp_r_int_yz,
                             Scalar mat_exp_r_int_zz,
                             Scalar deltaT,
                             bool rescale_all)
    {
    // determine which particle this thread works on
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        // fetch particle position
        Scalar4 pos = d_pos[idx];

        // fetch particle velocity and acceleration
        Scalar4 vel = d_vel[idx];
        Scalar3 v = make_scalar3(vel.x, vel.y, vel.z);
        Scalar3 accel = d_accel[idx];
        Scalar3 r = make_scalar3(pos.x, pos.y, pos.z);
        // apply thermostat update of velocity
        v *= exp_thermo_fac;

        // propagate velocity by half a time step and position by the full time step
        // by multiplying with an upper triangular matrix
        v.x = mat_exp_v_xx * v.x + mat_exp_v_xy * v.y + mat_exp_v_xz * v.z;
        v.y = mat_exp_v_yy * v.y + mat_exp_v_yz * v.z;
        v.z = mat_exp_v_zz * v.z;

        v.x += mat_exp_v_int_xx * accel.x + mat_exp_v_int_xy * accel.y + mat_exp_v_int_xz * accel.z;
        v.y += mat_exp_v_int_yy * accel.y + mat_exp_v_int_yz * accel.z;
        v.z += mat_exp_v_int_zz * accel.z;
        if (!rescale_all)
            {
            // rescale this group of particles
            r.x = mat_exp_r_xx * r.x + mat_exp_r_xy * r.y + mat_exp_r_xz * r.z;
            r.y = mat_exp_r_yy * r.y + mat_exp_r_yz * r.z;
            r.z = mat_exp_r_zz * r.z;
            }

        r.x += mat_exp_r_int_xx * v.x + mat_exp_r_int_xy * v.y + mat_exp_r_int_xz * v.z;
        r.y += mat_exp_r_int_yy * v.y + mat_exp_r_int_yz * v.z;
        r.z += mat_exp_r_int_zz * v.z;

        // write out the results
        d_pos[idx] = make_scalar4(r.x, r.y, r.z, pos.w);
        d_vel[idx] = make_scalar4(v.x, v.y, v.z, vel.w);
        }
    }
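// For reference, the update performed by the kernel above is, in matrix
// notation (a restatement of the kernel body, with M_v, M_r the upper
// triangular matrix exponentials and M_v_int, M_r_int their integrated
// counterparts):
//
//   v(t+dt/2) = M_v * (exp_thermo_fac * v(t)) + M_v_int * a(t)
//   r(t+dt)   = M_r * r(t) + M_r_int * v(t+dt/2)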
/*! \param d_pos array of particle positions
    \param d_vel array of particle velocities
    \param d_accel array of particle accelerations
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param exp_thermo_fac Update factor for thermostat
    \param mat_exp_v Matrix exponential for velocity update
    \param mat_exp_v_int Integrated matrix exp for velocity update
    \param mat_exp_r Matrix exponential for position update
    \param mat_exp_r_int Integrated matrix exp for position update
    \param deltaT Time to advance (for one full step)
    \param rescale_all True if all particles in the system should be rescaled at once

    This is just a kernel driver for gpu_npt_mtk_step_one_kernel(). See it for more details.
*/
cudaError_t gpu_npt_mtk_step_one(Scalar4 *d_pos,
                             Scalar4 *d_vel,
                             const Scalar3 *d_accel,
                             unsigned int *d_group_members,
                             unsigned int group_size,
                             Scalar exp_thermo_fac,
                             Scalar *mat_exp_v,
                             Scalar *mat_exp_v_int,
                             Scalar *mat_exp_r,
                             Scalar *mat_exp_r_int,
                             Scalar deltaT,
                             bool rescale_all)
    {
    // setup the grid to run the kernel
    unsigned int block_size = 256;
    dim3 grid( (group_size / block_size) + 1, 1, 1);
    dim3 threads(block_size, 1, 1);
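    // note: (group_size / block_size) + 1 rounds the grid size up, launching one
    // extra (idle) block when group_size is an exact multiple of block_size; the
    // group_idx < group_size check in the kernel keeps those threads inactive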
    // upper-triangular coefficients are packed in (xx,xy,xz,yy,yz,zz) order
    gpu_npt_mtk_step_one_kernel<<< grid, threads >>>(d_pos, d_vel, d_accel,
        d_group_members, group_size, exp_thermo_fac,
        mat_exp_v[0], mat_exp_v[1], mat_exp_v[2], mat_exp_v[3], mat_exp_v[4], mat_exp_v[5],
        mat_exp_v_int[0], mat_exp_v_int[1], mat_exp_v_int[2], mat_exp_v_int[3], mat_exp_v_int[4], mat_exp_v_int[5],
        mat_exp_r[0], mat_exp_r[1], mat_exp_r[2], mat_exp_r[3], mat_exp_r[4], mat_exp_r[5],
        mat_exp_r_int[0], mat_exp_r_int[1], mat_exp_r_int[2], mat_exp_r_int[3], mat_exp_r_int[4], mat_exp_r_int[5],
        deltaT, rescale_all);

    return cudaSuccess;
    }
/*! \param N number of particles in the system
    \param d_pos array of particle positions
    \param d_image array of particle images
    \param box The new box in which the particles now reside

    Wrap the positions of all particles back into the box.
*/
extern "C" __global__
void gpu_npt_mtk_wrap_kernel(const unsigned int N,
                             Scalar4 *d_pos,
                             int3 *d_image,
                             BoxDim box)
    {
    // determine which particle this thread works on
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // wrap ALL particles in the box
    if (idx < N)
        {
        // fetch particle position
        Scalar4 postype = d_pos[idx];
        Scalar3 pos = make_scalar3(postype.x, postype.y, postype.z);

        // read in the image flags
        int3 image = d_image[idx];

        // fix periodic boundary conditions
        box.wrap(pos, image);

        // write out the results
        d_pos[idx] = make_scalar4(pos.x, pos.y, pos.z, postype.w);
        d_image[idx] = image;
        }
    }
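// The image flags updated by box.wrap() record how many box lengths a particle
// has crossed in each direction, so unwrapped trajectories can be reconstructed
// later from (position, image) pairs.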
/*! \param N number of particles in the system
    \param d_pos array of particle positions
    \param d_image array of particle images
    \param box The new box in which the particles now reside

    This is just a kernel driver for gpu_npt_mtk_wrap_kernel(). See it for more details.
*/
cudaError_t gpu_npt_mtk_wrap(const unsigned int N,
                             Scalar4 *d_pos,
                             int3 *d_image,
                             const BoxDim& box)
    {
    // setup the grid to run the kernel
    unsigned int block_size = 256;
    dim3 grid( (N / block_size) + 1, 1, 1);
    dim3 threads(block_size, 1, 1);

    gpu_npt_mtk_wrap_kernel<<< grid, threads >>>(N, d_pos, d_image, box);

    return cudaSuccess;
    }
//! Kernel to propagate the positions and velocities, second half of NPT update
__global__ void gpu_npt_mtk_step_two_kernel(Scalar4 *d_vel,
                             Scalar3 *d_accel,
                             const Scalar4 *d_net_force,
                             unsigned int *d_group_members,
                             unsigned int group_size,
                             Scalar mat_exp_v_xx,
                             Scalar mat_exp_v_xy,
                             Scalar mat_exp_v_xz,
                             Scalar mat_exp_v_yy,
                             Scalar mat_exp_v_yz,
                             Scalar mat_exp_v_zz,
                             Scalar mat_exp_v_int_xx,
                             Scalar mat_exp_v_int_xy,
                             Scalar mat_exp_v_int_xz,
                             Scalar mat_exp_v_int_yy,
                             Scalar mat_exp_v_int_yz,
                             Scalar mat_exp_v_int_zz,
                             Scalar deltaT)
    {
    // determine which particle this thread works on
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        // fetch particle velocity and acceleration
        Scalar4 vel = d_vel[idx];

        // compute acceleration from the net force and the mass stored in vel.w
        Scalar minv = Scalar(1.0)/vel.w;
        Scalar4 net_force = d_net_force[idx];
        Scalar3 accel = make_scalar3(net_force.x, net_force.y, net_force.z);
        accel *= minv;
        Scalar3 v = make_scalar3(vel.x, vel.y, vel.z);

        // propagate velocity by half a time step by multiplying with an upper triangular matrix
        v.x = mat_exp_v_xx * v.x + mat_exp_v_xy * v.y + mat_exp_v_xz * v.z;
        v.y = mat_exp_v_yy * v.y + mat_exp_v_yz * v.z;
        v.z = mat_exp_v_zz * v.z;

        v.x += mat_exp_v_int_xx * accel.x + mat_exp_v_int_xy * accel.y + mat_exp_v_int_xz * accel.z;
        v.y += mat_exp_v_int_yy * accel.y + mat_exp_v_int_yz * accel.z;
        v.z += mat_exp_v_int_zz * accel.z;

        // write out velocity
        d_vel[idx] = make_scalar4(v.x, v.y, v.z, vel.w);

        // since we calculate the acceleration, we need to write it out for the next step
        d_accel[idx] = accel;
        }
    }
/*! \param d_vel array of particle velocities
    \param d_accel array of particle accelerations
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param mat_exp_v Matrix exponential for velocity update
    \param mat_exp_v_int Integrated matrix exp for velocity update
    \param d_net_force Net force on each particle
    \param deltaT Time to move forward in one whole step

    This is just a kernel driver for gpu_npt_mtk_step_two_kernel(). See it for more details.
*/
cudaError_t gpu_npt_mtk_step_two(Scalar4 *d_vel,
                             Scalar3 *d_accel,
                             unsigned int *d_group_members,
                             unsigned int group_size,
                             Scalar4 *d_net_force,
                             Scalar* mat_exp_v,
                             Scalar* mat_exp_v_int,
                             Scalar deltaT)
    {
    // setup the grid to run the kernel
    unsigned int block_size = 256;
    dim3 grid( (group_size / block_size) + 1, 1, 1);
    dim3 threads(block_size, 1, 1);

    gpu_npt_mtk_step_two_kernel<<< grid, threads >>>(d_vel, d_accel, d_net_force,
        d_group_members, group_size,
        mat_exp_v[0], mat_exp_v[1], mat_exp_v[2], mat_exp_v[3], mat_exp_v[4], mat_exp_v[5],
        mat_exp_v_int[0], mat_exp_v_int[1], mat_exp_v_int[2], mat_exp_v_int[3], mat_exp_v_int[4], mat_exp_v_int[5],
        deltaT);

    return cudaSuccess;
    }
//! GPU kernel to perform partial reduction of temperature
__global__ void gpu_npt_mtk_temperature_partial(unsigned int *d_group_members,
                                                unsigned int group_size,
                                                Scalar *d_scratch,
                                                Scalar4 *d_velocity)
    {
    // determine which particle this thread works on
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    Scalar mv2_element; // element of scratch space read in
    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        Scalar4 vel = d_velocity[idx];
        Scalar mass = vel.w;

        mv2_element = mass * (vel.x*vel.x + vel.y*vel.y + vel.z*vel.z);
        }
    else
        {
        // non-participating thread: contribute 0 to the sum
        mv2_element = Scalar(0.0);
        }
    npt_mtk_sdata[threadIdx.x] = mv2_element;
    __syncthreads();

    int offs = blockDim.x >> 1;
    while (offs > 0)
        {
        if (threadIdx.x < offs)
            npt_mtk_sdata[threadIdx.x] += npt_mtk_sdata[threadIdx.x + offs];
        offs >>= 1;
        __syncthreads();
        }

    // write out partial sum
    if (threadIdx.x == 0)
        d_scratch[blockIdx.x] = npt_mtk_sdata[0];
    }
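// The halving loop above implements a shared-memory tree reduction and assumes
// blockDim.x is a power of two; callers of gpu_npt_mtk_temperature() must
// therefore pass a power-of-two block_size.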
//! GPU kernel to perform final reduction of temperature
__global__ void gpu_npt_mtk_temperature_final_sum(Scalar *d_scratch,
                                                  Scalar *d_temperature,
                                                  unsigned int ndof,
                                                  unsigned int num_partial_sums)
    {
    Scalar final_sum(0.0);

    for (int start = 0; start < num_partial_sums; start += blockDim.x)
        {
        if (start + threadIdx.x < num_partial_sums)
            npt_mtk_sdata[threadIdx.x] = d_scratch[start + threadIdx.x];
        else
            npt_mtk_sdata[threadIdx.x] = Scalar(0.0);

        __syncthreads();

        // reduce the sum in parallel
        int offs = blockDim.x >> 1;
        while (offs > 0)
            {
            if (threadIdx.x < offs)
                npt_mtk_sdata[threadIdx.x] += npt_mtk_sdata[threadIdx.x + offs];
            offs >>= 1;
            __syncthreads();
            }

        if (threadIdx.x == 0)
            final_sum += npt_mtk_sdata[0];
        }

    if (threadIdx.x == 0)
        *d_temperature = final_sum/Scalar(ndof);
    }
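// The value written out is sum(m_i |v_i|^2) / ndof, i.e. the instantaneous
// kinetic temperature 2*K/N_dof in units where the Boltzmann constant is 1
// (HOOMD's reduced-unit convention).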
/*! \param d_temperature Device variable to store the temperature value (output)
    \param d_vel Array of particle velocities and masses
    \param d_scratch Temporary scratch space for reduction
    \param num_blocks Number of CUDA blocks used in reduction
    \param block_size Size of blocks used in reduction
    \param d_group_members Members of group for which the reduction is performed
    \param group_size Size of group
    \param ndof Number of degrees of freedom of group

    This function performs the reduction of the temperature on the GPU. It is just
    a driver function that calls the appropriate GPU kernels.
*/
cudaError_t gpu_npt_mtk_temperature(Scalar *d_temperature,
                                    Scalar4 *d_vel,
                                    Scalar *d_scratch,
                                    unsigned int num_blocks,
                                    unsigned int block_size,
                                    unsigned int *d_group_members,
                                    unsigned int group_size,
                                    unsigned int ndof)
    {
    assert(d_temperature);
    dim3 grid(num_blocks, 1, 1);
    dim3 threads(block_size, 1, 1);

    unsigned int shared_bytes = sizeof(Scalar)*block_size;

    // reduce squared velocity norm times mass, first pass
    gpu_npt_mtk_temperature_partial<<<grid, threads, shared_bytes>>>(
        d_group_members,
        group_size,
        d_scratch,
        d_vel);
    unsigned int final_block_size = 512;
    grid = dim3(1, 1, 1);
    threads = dim3(final_block_size, 1, 1);
    shared_bytes = sizeof(Scalar)*final_block_size;

    // reduction, second pass
    gpu_npt_mtk_temperature_final_sum<<<grid, threads, shared_bytes>>>(
        d_scratch,
        d_temperature,
        ndof,
        num_blocks);

    return cudaSuccess;
    }
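/* A minimal host-side sketch of driving this two-pass reduction; the buffer
   allocations are the only assumptions here, everything else follows the
   signatures above:

       unsigned int block_size = 256;   // must be a power of two
       unsigned int num_blocks = group_size / block_size + 1;
       Scalar *d_scratch, *d_temperature;
       cudaMalloc(&d_scratch, num_blocks * sizeof(Scalar));   // one partial sum per block
       cudaMalloc(&d_temperature, sizeof(Scalar));            // single output value
       gpu_npt_mtk_temperature(d_temperature, d_vel, d_scratch, num_blocks,
                               block_size, d_group_members, group_size, ndof);
*/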
/*! \param d_vel array of particle velocities and masses
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param exp_v_fac_thermo scaling factor (per direction) for velocity update generated by thermostat

    GPU kernel to thermostat velocities.
*/
__global__ void gpu_npt_mtk_thermostat_kernel(Scalar4 *d_vel,
                             unsigned int *d_group_members,
                             unsigned int group_size,
                             Scalar exp_v_fac_thermo)
    {
    // determine which particle this thread works on
    int group_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (group_idx < group_size)
        {
        unsigned int idx = d_group_members[group_idx];

        // fetch particle velocity
        Scalar4 vel = d_vel[idx];
        Scalar3 v = make_scalar3(vel.x, vel.y, vel.z);

        // rescale the velocity
        v *= exp_v_fac_thermo;

        // write out the results
        d_vel[idx] = make_scalar4(v.x, v.y, v.z, vel.w);
        }
    }
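// In a Nose-Hoover/MTK scheme the rescaling factor is typically
// exp_v_fac_thermo = exp(-xi*deltaT/2), with xi the thermostat velocity; it is
// computed by the caller and passed in, so this kernel is agnostic to the
// particular thermostat used.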
/*! \param d_vel array of particle velocities
    \param d_group_members Device array listing the indices of the members of the group to integrate
    \param group_size Number of members in the group
    \param exp_v_fac_thermo Rescaling factor
    \param block_size Kernel block size

    This is just a kernel driver for gpu_npt_mtk_thermostat_kernel(). See it for more details.
*/
cudaError_t gpu_npt_mtk_thermostat(Scalar4 *d_vel,
                             unsigned int *d_group_members,
                             unsigned int group_size,
                             Scalar exp_v_fac_thermo,
                             unsigned int block_size)
    {
    // setup the grid to run the kernel
    dim3 grid( (group_size / block_size) + 1, 1, 1);
    dim3 threads(block_size, 1, 1);

    gpu_npt_mtk_thermostat_kernel<<< grid, threads >>>(d_vel,
        d_group_members,
        group_size,
        exp_v_fac_thermo);

    return cudaSuccess;
    }
//! Kernel to rescale the positions of all particles by an upper triangular matrix
__global__ void gpu_npt_mtk_rescale_kernel(unsigned int N,
                             Scalar4 *d_postype,
                             Scalar mat_exp_r_xx, Scalar mat_exp_r_xy, Scalar mat_exp_r_xz,
                             Scalar mat_exp_r_yy, Scalar mat_exp_r_yz, Scalar mat_exp_r_zz)
    {
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;

    if (idx >= N) return;

    // fetch particle position
    Scalar4 postype = d_postype[idx];
    Scalar3 r = make_scalar3(postype.x, postype.y, postype.z);

    r.x = mat_exp_r_xx * r.x + mat_exp_r_xy * r.y + mat_exp_r_xz * r.z;
    r.y = mat_exp_r_yy * r.y + mat_exp_r_yz * r.z;
    r.z = mat_exp_r_zz * r.z;

    d_postype[idx] = make_scalar4(r.x, r.y, r.z, postype.w);
    }
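// This kernel applies the same upper-triangular box deformation to all N
// particles; it complements the rescale_all code path in
// gpu_npt_mtk_step_one_kernel(), which skips the per-group position rescaling
// so that it can be done here for the whole system in one pass.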
//! Kernel driver for gpu_npt_mtk_rescale_kernel()
void gpu_npt_mtk_rescale(unsigned int N, Scalar4 *d_postype,
                             Scalar mat_exp_r_xx, Scalar mat_exp_r_xy, Scalar mat_exp_r_xz,
                             Scalar mat_exp_r_yy, Scalar mat_exp_r_yz, Scalar mat_exp_r_zz)
    {
    unsigned int block_size = 256;
    dim3 grid( (N / block_size) + 1, 1, 1);
    dim3 threads(block_size, 1, 1);

    gpu_npt_mtk_rescale_kernel<<<grid, threads>>> (N, d_postype,
        mat_exp_r_xx, mat_exp_r_xy, mat_exp_r_xz,
        mat_exp_r_yy, mat_exp_r_yz, mat_exp_r_zz);
    }