2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
4 the University of Michigan All rights reserved.
6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
7 copyright is held, by various Contributors who have granted The Regents of the
8 University of Michigan the right to modify and/or distribute such Contributions.
10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
11 and binary forms, provided you abide by the following conditions:
13 * Redistributions of source code must retain the above copyright notice, this
14 list of conditions, and the following disclaimer both in the code and
15 prominently in any materials provided with the distribution.
17 * Redistributions in binary form must reproduce the above copyright notice, this
18 list of conditions, and the following disclaimer in the documentation and/or
19 other materials provided with the distribution.
21 * All publications and presentations based on HOOMD-blue, including any reports
22 or published results obtained, in whole or in part, with HOOMD-blue, will
23 acknowledge its use according to the terms posted at the time of submission on:
24 http://codeblue.umich.edu/hoomd-blue/citations.html
26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
27 http://codeblue.umich.edu/hoomd-blue/
* Apart from the above required attributions, neither the name of the copyright
holder nor the names of HOOMD-blue's contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.
36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Maintainer: joaander

#include "TablePotentialGPU.cuh"
#include "TextureTools.h"
/*! \file TablePotentialGPU.cu
    \brief Defines GPU kernel code for calculating the table pair forces. Used by TablePotentialGPU.
*/

//! Texture for reading particle positions
scalar4_tex_t pdata_pos_tex;

//! Texture for reading table values
scalar2_tex_t tables_tex;
/*! This kernel is called to calculate the table pair forces on all N particles

    \param d_force Device memory to write computed forces
    \param d_virial Device memory to write computed virials
    \param virial_pitch Pitch of 2D virial array
    \param N number of particles in system
    \param d_pos device array of particle positions
    \param box Box dimensions used to implement periodic boundary conditions
    \param d_n_neigh Device memory array listing the number of neighbors for each particle
    \param d_nlist Device memory array containing the neighbor list contents
    \param nli Indexer for indexing \a d_nlist
    \param d_tables Tables of the potential and force
    \param d_params Parameters for each table associated with a type pair
    \param ntypes Number of particle types in the system
    \param table_width Number of points in each table

    See TablePotential for information on the memory layout.

    \b Details:
    * Table entries are read from tables_tex. Note that currently this is bound to a 1D memory region. Performance tests
      at a later date may result in this changing.

    \pre The dynamic shared memory allocation must be at least
         sizeof(Scalar4) * Index2DUpperTriangular(ntypes).getNumElements() bytes (one Scalar4 of
         {rmin, rmax, delta_r} parameters per type pair).
*/
__global__ void gpu_compute_table_forces_kernel(Scalar4* d_force,
                                                Scalar* d_virial,
                                                const unsigned int virial_pitch,
                                                const unsigned int N,
                                                const Scalar4 *d_pos,
                                                const BoxDim box,
                                                const unsigned int *d_n_neigh,
                                                const unsigned int *d_nlist,
                                                const Index2D nli,
                                                const Scalar2 *d_tables,
                                                const Scalar4 *d_params,
                                                const unsigned int ntypes,
                                                const unsigned int table_width)
    {
    // index calculation helpers
    Index2DUpperTriangular table_index(ntypes);
    Index2D table_value(table_width);

    // read in params for easy and fast access in the kernel
    // the loop is strided by blockDim.x so blocks smaller than the parameter count still load everything
    extern __shared__ Scalar4 s_params[];
    for (unsigned int cur_offset = 0; cur_offset < table_index.getNumElements(); cur_offset += blockDim.x)
        {
        if (cur_offset + threadIdx.x < table_index.getNumElements())
            s_params[cur_offset + threadIdx.x] = d_params[cur_offset + threadIdx.x];
        }

    // barrier before any thread reads s_params: the whole block participates, no divergence here
    __syncthreads();

    // start by identifying which particle we are to handle
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // the grid is sized by ceil(N/block_size); threads past the end must not touch memory
    if (idx >= N)
        return;

    // load in the length of the neighbor list
    unsigned int n_neigh = d_n_neigh[idx];

    // read in the position of our particle. Texture reads of Scalar4's are faster than global reads on compute 1.0 hardware
    Scalar4 postype = texFetchScalar4(d_pos, pdata_pos_tex, idx);
    Scalar3 pos = make_scalar3(postype.x, postype.y, postype.z);
    unsigned int typei = __scalar_as_int(postype.w);

    // initialize the force to 0
    Scalar4 force = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
    Scalar virialxx = Scalar(0.0);
    Scalar virialxy = Scalar(0.0);
    Scalar virialxz = Scalar(0.0);
    Scalar virialyy = Scalar(0.0);
    Scalar virialyz = Scalar(0.0);
    Scalar virialzz = Scalar(0.0);

    // prefetch neighbor index
    unsigned int cur_neigh = 0;
    unsigned int next_neigh = d_nlist[nli(idx, 0)];

    // loop over neighbors
    // on pre Fermi hardware, there is a bug that causes rare and random ULFs when simply looping over n_neigh
    // the workaround is to loop over nlist.height and put an if (neigh_idx < n_neigh) inside the loop
#if (__CUDA_ARCH__ < 200)
    for (int neigh_idx = 0; neigh_idx < nli.getH(); neigh_idx++)
#else
    for (int neigh_idx = 0; neigh_idx < n_neigh; neigh_idx++)
#endif
        {
#if (__CUDA_ARCH__ < 200)
        if (neigh_idx < n_neigh)
#endif
            {
            // read the current neighbor index
            // prefetch the next value and set the current one
            // NOTE(review): this reads one slot past the last neighbor; assumes the nlist is
            // padded to height nli.getH() — confirm against the neighbor list builder
            cur_neigh = next_neigh;
            next_neigh = d_nlist[nli(idx, (neigh_idx+1))];

            // get the neighbor's position
            Scalar4 neigh_postype = texFetchScalar4(d_pos, pdata_pos_tex, cur_neigh);
            Scalar3 neigh_pos = make_scalar3(neigh_postype.x, neigh_postype.y, neigh_postype.z);

            // calculate dr (with periodic boundary conditions)
            Scalar3 dx = pos - neigh_pos;

            // apply periodic boundary conditions
            dx = box.minImage(dx);

            // access needed parameters for this type pair
            unsigned int typej = __scalar_as_int(neigh_postype.w);
            unsigned int cur_table_index = table_index(typei, typej);
            Scalar4 params = s_params[cur_table_index];
            Scalar rmin = params.x;
            Scalar rmax = params.y;
            Scalar delta_r = params.z;

            // compute the distance to the neighbor
            Scalar rsq = dot(dx, dx);
            Scalar r = sqrtf(rsq);

            // the table only covers [rmin, rmax); outside that range the pair contributes nothing
            if (r < rmax && r >= rmin)
                {
                // fractional position of r within the table
                Scalar value_f = (r - rmin) / delta_r;

                // compute index into the table and read in the two bracketing (V, F) entries
                unsigned int value_i = floor(value_f);
                Scalar2 VF0 = texFetchScalar2(d_tables, tables_tex, table_value(value_i, cur_table_index));
                Scalar2 VF1 = texFetchScalar2(d_tables, tables_tex, table_value(value_i+1, cur_table_index));

                // unpack the data
                Scalar V0 = VF0.x;
                Scalar V1 = VF1.x;
                Scalar F0 = VF0.y;
                Scalar F1 = VF1.y;

                // compute the linear interpolation coefficient
                Scalar f = value_f - Scalar(value_i);

                // interpolate to get V and F;
                Scalar V = V0 + f * (V1 - V0);
                Scalar F = F0 + f * (F1 - F0);

                // convert to standard variables used by the other pair computes in HOOMD-blue
                // guard the division so r == 0 (overlapping particles) does not produce inf/nan
                Scalar forcemag_divr = Scalar(0.0);
                if (r > Scalar(0.0))
                    forcemag_divr = F / r;
                Scalar pair_eng = V;

                // calculate the virial (the 0.5 splits the pair contribution between the two particles)
                Scalar force_div2r = Scalar(0.5) * forcemag_divr;
                virialxx += dx.x * dx.x * force_div2r;
                virialxy += dx.x * dx.y * force_div2r;
                virialxz += dx.x * dx.z * force_div2r;
                virialyy += dx.y * dx.y * force_div2r;
                virialyz += dx.y * dx.z * force_div2r;
                virialzz += dx.z * dx.z * force_div2r;

                // add up the force vector components (FLOPS: 7)
                force.x += dx.x * forcemag_divr;
                force.y += dx.y * forcemag_divr;
                force.z += dx.z * forcemag_divr;
                force.w += pair_eng;
                }
            }
        }

    // potential energy per particle must be halved (each pair is counted once per particle)
    force.w *= Scalar(0.5);

    // now that the force calculation is complete, write out the result
    d_force[idx] = force;
    d_virial[0*virial_pitch+idx] = virialxx;
    d_virial[1*virial_pitch+idx] = virialxy;
    d_virial[2*virial_pitch+idx] = virialxz;
    d_virial[3*virial_pitch+idx] = virialyy;
    d_virial[4*virial_pitch+idx] = virialyz;
    d_virial[5*virial_pitch+idx] = virialzz;
    }
/*! \param d_force Device memory to write computed forces
    \param d_virial Device memory to write computed virials
    \param virial_pitch pitch of 2D virial array
    \param N number of particles
    \param n_ghost number of ghost particles
    \param d_pos particle positions on the device
    \param box Box dimensions used to implement periodic boundary conditions
    \param d_n_neigh Device memory array listing the number of neighbors for each particle
    \param d_nlist Device memory array containing the neighbor list contents
    \param nli Indexer for indexing \a d_nlist
    \param d_tables Tables of the potential and force
    \param d_params Parameters for each table associated with a type pair
    \param ntypes Number of particle types in the system
    \param table_width Number of points in each table
    \param block_size Block size at which to run the kernel
    \param compute_capability Compute capability of the device (200, 300, 350)

    \returns cudaSuccess, or the first error returned while binding textures

    \note This is just a kernel driver. See gpu_compute_table_forces_kernel for full documentation.
*/
cudaError_t gpu_compute_table_forces(Scalar4* d_force,
                                     Scalar* d_virial,
                                     const unsigned int virial_pitch,
                                     const unsigned int N,
                                     const unsigned int n_ghost,
                                     const Scalar4 *d_pos,
                                     const BoxDim& box,
                                     const unsigned int *d_n_neigh,
                                     const unsigned int *d_nlist,
                                     const Index2D& nli,
                                     const Scalar2 *d_tables,
                                     const Scalar4 *d_params,
                                     const unsigned int ntypes,
                                     const unsigned int table_width,
                                     const unsigned int block_size,
                                     const unsigned int compute_capability)
    {
    assert(d_tables);
    assert(d_params);
    assert(ntypes > 0);
    assert(table_width > 1);

    // query the kernel's maximum block size once and cache it for later calls
    static unsigned int max_block_size = UINT_MAX;
    if (max_block_size == UINT_MAX)
        {
        cudaFuncAttributes attr;
        cudaFuncGetAttributes(&attr, (const void *)gpu_compute_table_forces_kernel);
        max_block_size = attr.maxThreadsPerBlock;
        }

    // never launch with more threads per block than the kernel supports
    unsigned int run_block_size = min(block_size, max_block_size);

    // index calculation helper
    Index2DUpperTriangular table_index(ntypes);

    // setup the grid to run the kernel (one thread per particle, rounded up)
    dim3 grid( (int)ceil((double)N / (double)run_block_size), 1, 1);
    dim3 threads(run_block_size, 1, 1);

    // on devices older than sm_35, reads in the kernel go through bound texture references
    if (compute_capability < 350)
        {
        // bind the pdata position texture (covers local + ghost particles)
        pdata_pos_tex.normalized = false;
        pdata_pos_tex.filterMode = cudaFilterModePoint;
        cudaError_t error = cudaBindTexture(0, pdata_pos_tex, d_pos, sizeof(Scalar4) * (N+n_ghost));
        if (error != cudaSuccess)
            return error;

        // bind the tables texture
        tables_tex.normalized = false;
        tables_tex.filterMode = cudaFilterModePoint;
        error = cudaBindTexture(0, tables_tex, d_tables, sizeof(Scalar2) * table_width * table_index.getNumElements());
        if (error != cudaSuccess)
            return error;
        }

    // dynamic shared memory holds one Scalar4 of table parameters per type pair
    gpu_compute_table_forces_kernel<<< grid, threads, sizeof(Scalar4)*table_index.getNumElements() >>>
        (d_force, d_virial, virial_pitch, N, d_pos, box, d_n_neigh, d_nlist, nli, d_tables, d_params, ntypes, table_width);

    return cudaSuccess;
    }