/*
Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
(HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
the University of Michigan All rights reserved.

HOOMD-blue may contain modifications ("Contributions") provided, and to which
copyright is held, by various Contributors who have granted The Regents of the
University of Michigan the right to modify and/or distribute such Contributions.

You may redistribute, use, and create derivate works of HOOMD-blue, in source
and binary forms, provided you abide by the following conditions:

* Redistributions of source code must retain the above copyright notice, this
list of conditions, and the following disclaimer both in the code and
prominently in any materials provided with the distribution.

* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions, and the following disclaimer in the documentation and/or
other materials provided with the distribution.

* All publications and presentations based on HOOMD-blue, including any reports
or published results obtained, in whole or in part, with HOOMD-blue, will
acknowledge its use according to the terms posted at the time of submission on:
http://codeblue.umich.edu/hoomd-blue/citations.html

* Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
http://codeblue.umich.edu/hoomd-blue/

* Apart from the above required attributions, neither the name of the copyright
holder nor the names of HOOMD-blue's contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.

Disclaimer

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.

IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// Maintainer: joaander

/*! \file NeighborListGPU.cc
    \brief Implementation of the NeighborListGPU class
*/

#include "NeighborListGPU.h"
#include "NeighborListGPU.cuh"

#include <boost/python.hpp>
using namespace boost::python;

#ifdef ENABLE_MPI
#include "Communicator.h"
#endif

#include <iostream>
using namespace std;

/*! \param num_iters Number of iterations to average for the benchmark
    \returns Milliseconds of execution time per calculation

    Calls filterNlist repeatedly to benchmark the neighbor list filter step.
*/
double NeighborListGPU::benchmarkFilter(unsigned int num_iters)
    {
    ClockSource t;
    // warm up run
    forceUpdate();
    compute(0);
    buildNlist(0);
    filterNlist();

#ifdef ENABLE_CUDA
    if (exec_conf->isCUDAEnabled())
        {
        cudaThreadSynchronize();
        CHECK_CUDA_ERROR();
        }
#endif

    // benchmark
    uint64_t start_time = t.getTime();
    for (unsigned int i = 0; i < num_iters; i++)
        filterNlist();

#ifdef ENABLE_CUDA
    if (exec_conf->isCUDAEnabled())
        cudaThreadSynchronize();
#endif
    uint64_t total_time_ns = t.getTime() - start_time;

    // convert the run time to milliseconds
    return double(total_time_ns) / 1e6 / double(num_iters);
    }

void NeighborListGPU::buildNlist(unsigned int timestep)
    {
    if (m_storage_mode != full)
        {
        m_exec_conf->msg->error() << "Only full mode nlists can be generated on the GPU" << endl;
        throw runtime_error("Error computing neighbor list");
        }

    if (m_filter_body || m_filter_diameter)
        {
        m_exec_conf->msg->error() << "NeighborListGPU does not currently support body or diameter exclusions." << endl;
        m_exec_conf->msg->error() << "Please contact the developers and notify them that you need this functionality" << endl;
        throw runtime_error("Error computing neighbor list");
        }

    // check that the simulation box is big enough
    const BoxDim& box = m_pdata->getBox();
    Scalar3 nearest_plane_distance = box.getNearestPlaneDistance();

    if (m_prof)
        m_prof->push(exec_conf, "compute");

    // acquire the particle data
    ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);

    // access the nlist data arrays
    ArrayHandle<unsigned int> d_nlist(m_nlist, access_location::device, access_mode::overwrite);
    ArrayHandle<unsigned int> d_n_neigh(m_n_neigh, access_location::device, access_mode::overwrite);
    ArrayHandle<Scalar4> d_last_pos(m_last_pos, access_location::device, access_mode::overwrite);

    // start by creating a temporary copy of r_cut squared
    Scalar rmax = m_r_cut + m_r_buff;
    // add d_max - 1.0, if diameter filtering is not already taking care of it
    if (!m_filter_diameter)
        rmax += m_d_max - Scalar(1.0);
    Scalar rmaxsq = rmax*rmax;
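
    // Note on the padding above (an inference from HOOMD's diameter-shifted pair potentials, not
    // stated in this file): shifting reduces the effective pair separation by up to
    // (d_i + d_j)/2 - 1, which is at most d_max - 1, so the search radius must grow by the same
    // amount to still catch every pair that could interact. For example, with r_cut = 3.0,
    // r_buff = 0.4 and d_max = 2.0 (hypothetical values), rmax = 3.4 + 1.0 = 4.4 and rmaxsq = 19.36.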

    if ((box.getPeriodic().x && nearest_plane_distance.x <= rmax * 2.0) ||
        (box.getPeriodic().y && nearest_plane_distance.y <= rmax * 2.0) ||
        (this->m_sysdef->getNDimensions() == 3 && box.getPeriodic().z && nearest_plane_distance.z <= rmax * 2.0))
        {
        m_exec_conf->msg->error() << "nlist: Simulation box is too small! Particles would be interacting with themselves." << endl;
        throw runtime_error("Error updating neighborlist bins");
        }
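
    // Worked illustration of the check above (hypothetical numbers): the minimum-image convention
    // needs every periodic box dimension to exceed 2*rmax. With rmax = 3.4, the nearest-plane
    // distance must be greater than 6.8 in each periodic direction, or a particle could count its
    // own periodic image as a neighbor.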

    // build the list with the all-pairs (N-squared) kernel
    gpu_compute_nlist_nsq(d_nlist.data,
                          d_n_neigh.data,
                          d_last_pos.data,
                          m_conditions.getDeviceFlags(),
                          m_nlist_indexer,
                          d_pos.data,
                          m_pdata->getN(),
                          m_pdata->getNGhosts(),
                          box,
                          rmaxsq);

    if (exec_conf->isCUDAErrorCheckingEnabled())
        CHECK_CUDA_ERROR();

    if (m_prof)
        m_prof->pop(exec_conf);
    }

void NeighborListGPU::scheduleDistanceCheck(unsigned int timestep)
    {
    // guard against unnecessary calls
    if (! shouldCheckDistance(timestep))
        {
        m_distcheck_scheduled = false;
        return;
        }

    // scan through the particle data arrays and calculate distances
    if (m_prof) m_prof->push(exec_conf, "dist-check");

    // access data
    ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);
    BoxDim box = m_pdata->getBox();
    ArrayHandle<Scalar4> d_last_pos(m_last_pos, access_location::device, access_mode::read);

    // get current global nearest plane distance
    Scalar3 L_g = m_pdata->getGlobalBox().getNearestPlaneDistance();

    // Cutoff distance for inclusion in neighbor list
    Scalar rmax = m_r_cut + m_r_buff;
    if (!m_filter_diameter)
        rmax += m_d_max - Scalar(1.0);

    // Find direction of maximum box length contraction (smallest eigenvalue of deformation tensor)
    Scalar3 lambda = L_g / m_last_L;
    Scalar lambda_min = (lambda.x < lambda.y) ? lambda.x : lambda.y;
    lambda_min = (lambda_min < lambda.z) ? lambda_min : lambda.z;

    // maximum displacement for each particle (after subtraction of homogeneous dilations)
    Scalar delta_max = (rmax*lambda_min - m_r_cut)/Scalar(2.0);
    Scalar maxshiftsq = delta_max > 0 ? delta_max*delta_max : 0;
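
    // Sketch of the rebuild criterion implied above (a reading of this code, with lambda_min
    // treated as an isotropic contraction): neighbors were recorded out to rmax = r_cut + r_buff
    // (+ diameter padding) at build time, and two particles closing on each other eat into that
    // margin at twice the single-particle displacement. Hence a rebuild is flagged once any
    // particle moves farther than (rmax*lambda_min - r_cut)/2 beyond the affine box deformation;
    // for an undeformed box with d_max = 1 this is the familiar r_buff/2 half-buffer rule.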

    ArrayHandle<unsigned int> d_flags(m_flags, access_location::device, access_mode::readwrite);

    gpu_nlist_needs_update_check_new(d_flags.data,
                                     d_last_pos.data,
                                     d_pos.data,
                                     m_pdata->getN(),
                                     box,
                                     maxshiftsq,
                                     lambda,
                                     ++m_checkn);

    if (exec_conf->isCUDAErrorCheckingEnabled())
        CHECK_CUDA_ERROR();

    m_distcheck_scheduled = true;
    m_last_schedule_tstep = timestep;

    // record synchronization point
    cudaEventRecord(m_event);

    if (m_prof) m_prof->pop(exec_conf);
    }

bool NeighborListGPU::distanceCheck(unsigned int timestep)
    {
    // check if we have scheduled a kernel for the current time step
    if (! m_distcheck_scheduled || m_last_schedule_tstep != timestep)
        scheduleDistanceCheck(timestep);

    m_distcheck_scheduled = false;

    ArrayHandleAsync<unsigned int> h_flags(m_flags, access_location::host, access_mode::read);

    // wait for kernel to complete
    cudaEventSynchronize(m_event);

    bool result = (*h_flags.data == m_checkn);
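
    // How the handshake above works (inferred from this code, not documented elsewhere in the
    // file): every scheduleDistanceCheck() call passes a freshly incremented m_checkn to the
    // kernel, which writes that value into m_flags only when some particle has exceeded the
    // allowed displacement. Comparing the asynchronously read flag against the current m_checkn
    // therefore says whether this scheduling round flagged a rebuild, while stale values from
    // earlier rounds compare unequal and are ignored.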

#ifdef ENABLE_MPI
    if (m_pdata->getDomainDecomposition())
        {
        if (m_prof) m_prof->push(m_exec_conf,"MPI allreduce");

        // check if the migrate criterion is fulfilled on any rank
        int local_result = result ? 1 : 0;
        int global_result = 0;
        MPI_Allreduce(&local_result,
                      &global_result,
                      1,
                      MPI_INT,
                      MPI_MAX,
                      m_exec_conf->getMPICommunicator());
        result = (global_result > 0);

        if (m_prof) m_prof->pop();
        }
#endif

    return result;
    }

/*! Calls gpu_nlist_filter() to filter the neighbor list on the GPU
*/
void NeighborListGPU::filterNlist()
    {
    if (m_prof)
        m_prof->push(exec_conf, "filter");

    // access data
    ArrayHandle<unsigned int> d_n_ex_idx(m_n_ex_idx, access_location::device, access_mode::read);
    ArrayHandle<unsigned int> d_ex_list_idx(m_ex_list_idx, access_location::device, access_mode::read);
    ArrayHandle<unsigned int> d_n_neigh(m_n_neigh, access_location::device, access_mode::readwrite);
    ArrayHandle<unsigned int> d_nlist(m_nlist, access_location::device, access_mode::readwrite);
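
    // The begin()/getParam()/end() calls below appear to follow HOOMD's Autotuner pattern: the
    // tuner times the bracketed kernel launch and getParam() supplies the launch configuration
    // (block size) currently being sampled, so the filter kernel is tuned at run time rather than
    // with a hard-coded block size.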

    m_tuner_filter->begin();
    gpu_nlist_filter(d_n_neigh.data,
                     d_nlist.data,
                     m_nlist_indexer,
                     d_n_ex_idx.data,
                     d_ex_list_idx.data,
                     m_ex_list_indexer,
                     m_pdata->getN(),
                     m_tuner_filter->getParam());
    if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
    m_tuner_filter->end();

    if (m_prof)
        m_prof->pop(exec_conf);
    }

//! Update the exclusion list on the GPU
void NeighborListGPU::updateExListIdx()
    {
    if (m_prof)
        m_prof->push(m_exec_conf,"update-ex");

    ArrayHandle<unsigned int> d_rtag(m_pdata->getRTags(), access_location::device, access_mode::read);
    ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::read);

    ArrayHandle<unsigned int> d_n_ex_tag(m_n_ex_tag, access_location::device, access_mode::read);
    ArrayHandle<unsigned int> d_ex_list_tag(m_ex_list_tag, access_location::device, access_mode::read);
    ArrayHandle<unsigned int> d_n_ex_idx(m_n_ex_idx, access_location::device, access_mode::overwrite);
    ArrayHandle<unsigned int> d_ex_list_idx(m_ex_list_idx, access_location::device, access_mode::overwrite);
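
    // Why two copies of the exclusion table (inferred from the argument names): exclusions are
    // stored per particle tag, which is stable across sorting and migration, while the kernels
    // consume them by current particle index. gpu_update_exclusion_list() uses the reverse-tag map
    // d_rtag to rebuild the index-ordered table (d_n_ex_idx, d_ex_list_idx) from the tag-ordered
    // one (d_n_ex_tag, d_ex_list_tag) whenever the ordering changes.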

    gpu_update_exclusion_list(d_tag.data,
                              d_rtag.data,
                              d_n_ex_tag.data,
                              d_ex_list_tag.data,
                              m_ex_list_indexer_tag,
                              d_n_ex_idx.data,
                              d_ex_list_idx.data,
                              m_ex_list_indexer,
                              m_pdata->getN());
    if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();

    if (m_prof)
        m_prof->pop(m_exec_conf);
    }

void export_NeighborListGPU()
    {
    class_<NeighborListGPU, boost::shared_ptr<NeighborListGPU>, bases<NeighborList>, boost::noncopyable >
        ("NeighborListGPU", init< boost::shared_ptr<SystemDefinition>, Scalar, Scalar >())
        .def("benchmarkFilter", &NeighborListGPU::benchmarkFilter)