libhoomd/cuda/TablePotentialGPU.cu

   1 /*
   2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
   3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
   4 the University of Michigan All rights reserved.
   5
   6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
   7 copyright is held, by various Contributors who have granted The Regents of the
   8 University of Michigan the right to modify and/or distribute such Contributions.
   9
  10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
  11 and binary forms, provided you abide by the following conditions:
  12
  13 * Redistributions of source code must retain the above copyright notice, this
  14 list of conditions, and the following disclaimer both in the code and
  15 prominently in any materials provided with the distribution.
  16
  17 * Redistributions in binary form must reproduce the above copyright notice, this
  18 list of conditions, and the following disclaimer in the documentation and/or
  19 other materials provided with the distribution.
  20
  21 * All publications and presentations based on HOOMD-blue, including any reports
  22 or published results obtained, in whole or in part, with HOOMD-blue, will
  23 acknowledge its use according to the terms posted at the time of submission on:
  24 http://codeblue.umich.edu/hoomd-blue/citations.html
  25
  26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
  27 http://codeblue.umich.edu/hoomd-blue/
  28
  29 * Apart from the above required attributions, neither the name of the copyright
  30 holder nor the names of HOOMD-blue's contributors may be used to endorse or
  31 promote products derived from this software without specific prior written
  32 permission.
  33
  34 Disclaimer
  35
  36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
  37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
  39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
  40
  41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
  46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  48 */
  49
  50 // Maintainer: joaander
  51
  52 #include "TablePotentialGPU.cuh"
  53 #include "TextureTools.h"
  54
  55 #include "Index1D.h"
  56
  57 #ifdef WIN32
  58 #include <cassert>
  59 #else
  60 #include <assert.h>
  61 #endif
  62
  63 /*! \file TablePotentialGPU.cu
  64     \brief Defines GPU kernel code for calculating the table pair forces. Used by TablePotentialGPU.
  65 */
  66
  67 //! Texture for reading particle positions
  68 scalar4_tex_t pdata_pos_tex;
  69
  70 //! Texture for reading table values
  71 scalar2_tex_t tables_tex;
  72
  73 /*!  This kernel is called to calculate the table pair forces on all N particles
  74
  75     \param d_force Device memory to write computed forces
  76     \param d_virial Device memory to write computed virials
  77     \param virial_pitch Pitch of 2D virial array
  78     \param N number of particles in system
  79     \param d_pos device array of particle positions
  80     \param box Box dimensions used to implement periodic boundary conditions
  81     \param d_n_neigh Device memory array listing the number of neighbors for each particle
  82     \param d_nlist Device memory array containing the neighbor list contents
  83     \param nli Indexer for indexing \a d_nlist
  84     \param d_params Parameters for each table associated with a type pair
  85     \param ntypes Number of particle types in the system
  86     \param table_width Number of points in each table
  87
  88     See TablePotential for information on the memory layout.
  89
  90     \b Details:
  91     * Table entries are read from tables_tex. Note that currently this is bound to a 1D memory region. Performance tests
  92       at a later date may result in this changing.
  93 */
  94 __global__ void gpu_compute_table_forces_kernel(Scalar4* d_force,
  95                                                 Scalar* d_virial,
  96                                                 const unsigned virial_pitch,
  97                                                 const unsigned int N,
  98                                                 const Scalar4 *d_pos,
  99                                                 const BoxDim box,
 100                                                 const unsigned int *d_n_neigh,
 101                                                 const unsigned int *d_nlist,
 102                                                 const Index2D nli,
 103                                                 const Scalar2 *d_tables,
 104                                                 const Scalar4 *d_params,
 105                                                 const unsigned int ntypes,
 106                                                 const unsigned int table_width)
 107     {
 108     // index calculation helpers
 109     Index2DUpperTriangular table_index(ntypes);
 110     Index2D table_value(table_width);
 111
 112     // read in params for easy and fast access in the kernel
 113     extern __shared__ Scalar4 s_params[];
 114     for (unsigned int cur_offset = 0; cur_offset < table_index.getNumElements(); cur_offset += blockDim.x)
 115         {
 116         if (cur_offset + threadIdx.x < table_index.getNumElements())
 117             s_params[cur_offset + threadIdx.x] = d_params[cur_offset + threadIdx.x];
 118         }
 119     __syncthreads();
 120
 121     // start by identifying which particle we are to handle
 122     unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
 123
 124     if (idx >= N)
 125         return;
 126
 127     // load in the length of the list
 128     unsigned int n_neigh = d_n_neigh[idx];
 129
 130     // read in the position of our particle. Texture reads of Scalar4's are faster than global reads on compute 1.0 hardware
 131     Scalar4 postype = texFetchScalar4(d_pos, pdata_pos_tex, idx);
 132     Scalar3 pos = make_scalar3(postype.x, postype.y, postype.z);
 133     unsigned int typei = __scalar_as_int(postype.w);
 134
 135     // initialize the force to 0
 136     Scalar4 force = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
 137     Scalar virialxx = Scalar(0.0);
 138     Scalar virialxy = Scalar(0.0);
 139     Scalar virialxz = Scalar(0.0);
 140     Scalar virialyy = Scalar(0.0);
 141     Scalar virialyz = Scalar(0.0);
 142     Scalar virialzz = Scalar(0.0);
 143
 144     // prefetch neighbor index
 145     unsigned int cur_neigh = 0;
 146     unsigned int next_neigh = d_nlist[nli(idx, 0)];
 147
 148     // loop over neighbors
 149     // on pre Fermi hardware, there is a bug that causes rare and random ULFs when simply looping over n_neigh
 150     // the workaround (activated via the template paramter) is to loop over nlist.height and put an if (i < n_neigh)
 151     // inside the loop
 152     #if (__CUDA_ARCH__ < 200)
 153     for (int neigh_idx = 0; neigh_idx < nli.getH(); neigh_idx++)
 154     #else
 155     for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++)
 156     #endif
 157         {
 158         #if (__CUDA_ARCH__ < 200)
 159         if (neigh_idx < n_neigh)
 160         #endif
 161             {
 162             // read the current neighbor index
 163             // prefetch the next value and set the current one
 164             cur_neigh = next_neigh;
 165             next_neigh = d_nlist[nli(idx, (neigh_idx+1))];
 166
 167             // get the neighbor's position
 168             Scalar4 neigh_postype = texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh);
 169             Scalar3 neigh_pos = make_scalar3(neigh_postype.x, neigh_postype.y, neigh_postype.z);
 170
 171             // calculate dr (with periodic boundary conditions)
 172             Scalar3 dx = pos - neigh_pos;
 173
 174             // apply periodic boundary conditions
 175             dx = box.minImage(dx);
 176
 177             // access needed parameters
 178             unsigned int typej = __scalar_as_int(neigh_postype.w);
 179             unsigned int cur_table_index = table_index(typei, typej);
 180             Scalar4 params = s_params[cur_table_index];
 181             Scalar rmin = params.x;
 182             Scalar rmax = params.y;
 183             Scalar delta_r = params.z;
 184
 185             // calculate r
 186             Scalar rsq = dot(dx, dx);
 187             Scalar r = sqrtf(rsq);
 188
 189             if (r < rmax && r >= rmin)
 190                 {
 191                 // precomputed term
 192                 Scalar value_f = (r - rmin) / delta_r;
 193
 194                 // compute index into the table and read in values
 195                 unsigned int value_i = floor(value_f);
 196                 Scalar2 VF0 = texFetchScalar2(d_tables, tables_tex, table_value(value_i, cur_table_index));
 197                 Scalar2 VF1 = texFetchScalar2(d_tables, tables_tex, table_value(value_i+1, cur_table_index));
 198
 199                 // unpack the data
 200                 Scalar V0 = VF0.x;
 201                 Scalar V1 = VF1.x;
 202                 Scalar F0 = VF0.y;
 203                 Scalar F1 = VF1.y;
 204
 205                 // compute the linear interpolation coefficient
 206                 Scalar f = value_f - Scalar(value_i);
 207
 208                 // interpolate to get V and F;
 209                 Scalar V = V0 + f * (V1 - V0);
 210                 Scalar F = F0 + f * (F1 - F0);
 211
 212                 // convert to standard variables used by the other pair computes in HOOMD-blue
 213                 Scalar forcemag_divr = Scalar(0.0);
 214                 if (r > Scalar(0.0))
 215                     forcemag_divr = F / r;
 216                 Scalar pair_eng = V;
 217                 // calculate the virial
 218                 Scalar force_div2r = Scalar(0.5) * forcemag_divr;
 219                 virialxx +=  dx.x * dx.x * force_div2r;
 220                 virialxy +=  dx.x * dx.y * force_div2r;
 221                 virialxz +=  dx.x * dx.z * force_div2r;
 222                 virialyy +=  dx.y * dx.y * force_div2r;
 223                 virialyz +=  dx.y * dx.z * force_div2r;
 224                 virialzz +=  dx.z * dx.z * force_div2r;
 225
 226                 // add up the force vector components (FLOPS: 7)
 227                 force.x += dx.x * forcemag_divr;
 228                 force.y += dx.y * forcemag_divr;
 229                 force.z += dx.z * forcemag_divr;
 230                 force.w += pair_eng;
 231                 }
 232             }
 233         }
 234
 235     // potential energy per particle must be halved
 236     force.w *= Scalar(0.5);
 237     // now that the force calculation is complete, write out the result
 238     d_force[idx] = force;
 239     d_virial[0*virial_pitch+idx] = virialxx;
 240     d_virial[1*virial_pitch+idx] = virialxy;
 241     d_virial[2*virial_pitch+idx] = virialxz;
 242     d_virial[3*virial_pitch+idx] = virialyy;
 243     d_virial[4*virial_pitch+idx] = virialyz;
 244     d_virial[5*virial_pitch+idx] = virialzz;
 245     }
 246
 247 /*! \param d_force Device memory to write computed forces
 248     \param d_virial Device memory to write computed virials
 249     \param virial_pitch pitch of 2D virial array
 250     \param N number of particles
 251     \param n_ghost number of ghost particles
 252     \param d_pos particle positions on the device
 253     \param box Box dimensions used to implement periodic boundary conditions
 254     \param d_n_neigh Device memory array listing the number of neighbors for each particle
 255     \param d_nlist Device memory array containing the neighbor list contents
 256     \param nli Indexer for indexing \a d_nlist
 257     \param d_tables Tables of the potential and force
 258     \param d_params Parameters for each table associated with a type pair
 259     \param ntypes Number of particle types in the system
 260     \param table_width Number of points in each table
 261     \param block_size Block size at which to run the kernel
 262     \param compute_capability Compute capability of the device (200, 300, 350)
 263
 264     \note This is just a kernel driver. See gpu_compute_table_forces_kernel for full documentation.
 265 */
 266 cudaError_t gpu_compute_table_forces(Scalar4* d_force,
 267                                      Scalar* d_virial,
 268                                      const unsigned int virial_pitch,
 269                                      const unsigned int N,
 270                                      const unsigned int n_ghost,
 271                                      const Scalar4 *d_pos,
 272                                      const BoxDim& box,
 273                                      const unsigned int *d_n_neigh,
 274                                      const unsigned int *d_nlist,
 275                                      const Index2D& nli,
 276                                      const Scalar2 *d_tables,
 277                                      const Scalar4 *d_params,
 278                                      const unsigned int ntypes,
 279                                      const unsigned int table_width,
 280                                      const unsigned int block_size,
 281                                      const unsigned int compute_capability)
 282     {
 283     assert(d_params);
 284     assert(d_tables);
 285     assert(ntypes > 0);
 286     assert(table_width > 1);
 287
 288     static unsigned int max_block_size = UINT_MAX;
 289     if (max_block_size == UINT_MAX)
 290         {
 291         cudaFuncAttributes attr;
 292         cudaFuncGetAttributes(&attr, (const void *)gpu_compute_table_forces_kernel);
 293         max_block_size = attr.maxThreadsPerBlock;
 294         }
 295
 296     unsigned int run_block_size = min(block_size, max_block_size);
 297
 298     // index calculation helper
 299     Index2DUpperTriangular table_index(ntypes);
 300
 301     // setup the grid to run the kernel
 302     dim3 grid( (int)ceil((double)N / (double)run_block_size), 1, 1);
 303     dim3 threads(run_block_size, 1, 1);
 304
 305     if (compute_capability < 350)
 306         {
 307         // bind the pdata position texture
 308         pdata_pos_tex.normalized = false;
 309         pdata_pos_tex.filterMode = cudaFilterModePoint;
 310         cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4) * (N+n_ghost));
 311         if (error != cudaSuccess)
 312             return error;
 313
 314         // bind the tables texture
 315         tables_tex.normalized = false;
 316         tables_tex.filterMode = cudaFilterModePoint;
 317         error = cudaBindTexture(0, tables_tex, d_tables, sizeof(Scalar2) * table_width * table_index.getNumElements());
 318         if (error != cudaSuccess)
 319             return error;
 320         }
 321
 322     gpu_compute_table_forces_kernel<<< grid, threads, sizeof(Scalar4)*table_index.getNumElements() >>>
 323             (d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, nli, d_tables, d_params, ntypes, table_width);
 324
 325     return cudaSuccess;
 326     }
 327
 328 // vim:syntax=cpp