2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
4 the University of Michigan All rights reserved.
6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
7 copyright is held, by various Contributors who have granted The Regents of the
8 University of Michigan the right to modify and/or distribute such Contributions.
10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
11 and binary forms, provided you abide by the following conditions:
13 * Redistributions of source code must retain the above copyright notice, this
14 list of conditions, and the following disclaimer both in the code and
15 prominently in any materials provided with the distribution.
17 * Redistributions in binary form must reproduce the above copyright notice, this
18 list of conditions, and the following disclaimer in the documentation and/or
19 other materials provided with the distribution.
21 * All publications and presentations based on HOOMD-blue, including any reports
22 or published results obtained, in whole or in part, with HOOMD-blue, will
23 acknowledge its use according to the terms posted at the time of submission on:
24 http://codeblue.umich.edu/hoomd-blue/citations.html
26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
27 http://codeblue.umich.edu/hoomd-blue/
29 * Apart from the above required attributions, neither the name of the copyright
30 holder nor the names of HOOMD-blue's contributors may be used to endorse or
31 promote products derived from this software without specific prior written
36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 // Maintainer: joaander
52 /*! \file NeighborListGPUBinned.cc
53 \brief Defines NeighborListGPUBinned
56 #include "NeighborListGPUBinned.h"
57 #include "NeighborListGPUBinned.cuh"
59 #include <boost/python.hpp>
60 using namespace boost::python
;
63 #include "Communicator.h"
// Constructor: attaches (or creates) a CellList for the GPU binned neighbor
// list and configures the kernel autotuner.
// NOTE(review): this extraction is incomplete -- the r_cut/r_buff parameters
// referenced by the initializer list, the opening/closing braces, and parts of
// the tuner parameter loop are missing (see the gaps in the residual line
// numbers). Verify against the upstream file before editing.
66 NeighborListGPUBinned::NeighborListGPUBinned(boost::shared_ptr
<SystemDefinition
> sysdef
,
69 boost::shared_ptr
<CellList
> cl
)
// forward the system definition and cutoffs to the base class; store the
// cell list handle and default the fixed tuning parameter to 0 (= autotune)
70 : NeighborListGPU(sysdef
, r_cut
, r_buff
), m_cl(cl
), m_param(0)
72 // create a default cell list if one was not specified
74 m_cl
= boost::shared_ptr
<CellList
>(new CellList(sysdef
));
// bin width must cover cutoff + buffer plus the diameter shift (d_max - 1)
76 m_cl
->setNominalWidth(r_cut
+ r_buff
+ m_d_max
- Scalar(1.0));
// TDB (type/diameter/body) cell data is only needed when filtering is active
78 m_cl
->setComputeTDB(false);
83 // default to 0 last allocated quantities
84 m_last_dim
= make_uint3(0,0,0);
90 // initialize autotuner
91 // the full block size and threads_per_particle matrix is searched,
92 // encoded as block_size*10000 + threads_per_particle
93 std::vector
<unsigned int> valid_params
;
94 for (unsigned int block_size
= 32; block_size
<= 1024; block_size
+= 32)
// NOTE(review): the declaration and update of `s` live on lines missing from
// this extraction -- presumably s steps through values up to the device
// warpSize; confirm against the upstream source.
97 while (s
<= this->m_exec_conf
->dev_prop
.warpSize
)
99 valid_params
.push_back(block_size
*10000 + s
);
// construct the tuner over the encoded parameter set; the literals 5 and
// 100000 are presumably the sample count and retune period -- verify against
// the Autotuner constructor
104 m_tuner
.reset(new Autotuner(valid_params
, 5, 100000, "nlist_binned", this->m_exec_conf
));
105 m_last_tuned_timestep
= 0;
108 // synchronize over MPI
109 m_tuner
->setSync(m_pdata
->getDomainDecomposition());
112 // When running on compute 1.x, textures are allocated with the height equal to the number of cells
113 // limit the number of cells to the maximum texture dimension
114 if (exec_conf
->getComputeCapability() < 200)
116 m_cl
->setMaxCells(exec_conf
->dev_prop
.maxTexture2D
[1]);
/*! Destructor: releases the cudaArray staging buffers used for the compute
    1.x texture path. Each handle is freed only if it was actually allocated.
*/
NeighborListGPUBinned::~NeighborListGPUBinned()
    {
    // free the old arrays
    if (dca_cell_adj != NULL)
        cudaFreeArray(dca_cell_adj);
    if (dca_cell_xyzf != NULL)
        cudaFreeArray(dca_cell_xyzf);
    if (dca_cell_tdb != NULL)
        cudaFreeArray(dca_cell_tdb);
    }
133 void NeighborListGPUBinned::setRCut(Scalar r_cut
, Scalar r_buff
)
135 NeighborListGPU::setRCut(r_cut
, r_buff
);
137 m_cl
->setNominalWidth(r_cut
+ r_buff
+ m_d_max
- Scalar(1.0));
140 void NeighborListGPUBinned::setMaximumDiameter(Scalar d_max
)
142 NeighborListGPU::setMaximumDiameter(d_max
);
144 // need to update the cell list settings appropriately
145 m_cl
->setNominalWidth(m_r_cut
+ m_r_buff
+ m_d_max
- Scalar(1.0));
148 void NeighborListGPUBinned::setFilterBody(bool filter_body
)
150 NeighborListGPU::setFilterBody(filter_body
);
152 // need to update the cell list settings appropriately
153 if (m_filter_body
|| m_filter_diameter
)
154 m_cl
->setComputeTDB(true);
156 m_cl
->setComputeTDB(false);
159 void NeighborListGPUBinned::setFilterDiameter(bool filter_diameter
)
161 NeighborListGPU::setFilterDiameter(filter_diameter
);
163 // need to update the cell list settings appropriately
164 if (m_filter_body
|| m_filter_diameter
)
165 m_cl
->setComputeTDB(true);
167 m_cl
->setComputeTDB(false);
// Builds the neighbor list on the GPU from the cell list for the given
// timestep. Two code paths exist: an autotuned shared-memory kernel for
// compute capability >= 2.0, and a texture-based fallback for compute 1.x.
// NOTE(review): this extraction is missing many source lines -- braces, most
// arguments of the two kernel launches, and several guards (see the gaps in
// the residual line numbers). Consult the upstream file before editing.
170 void NeighborListGPUBinned::buildNlist(unsigned int timestep
)
// only full-storage neighbor lists can be built on the GPU
172 if (m_storage_mode
!= full
)
174 m_exec_conf
->msg
->error() << "Only full mode nlists can be generated on the GPU" << endl
;
175 throw runtime_error("Error computing neighbor list");
// bring the cell list up to date before reading from it
178 m_cl
->compute(timestep
);
// NOTE(review): m_prof is dereferenced here without the null guard used
// further below -- possibly a guard line is missing from this extraction
181 m_prof
->push(exec_conf
, "compute");
183 // acquire the particle data
184 ArrayHandle
<Scalar4
> d_pos(m_pdata
->getPositions(), access_location::device
, access_mode::read
);
185 ArrayHandle
<Scalar
> d_diameter(m_pdata
->getDiameters(), access_location::device
, access_mode::read
);
186 ArrayHandle
<unsigned int> d_body(m_pdata
->getBodies(), access_location::device
, access_mode::read
);
188 const BoxDim
& box
= m_pdata
->getBox();
189 Scalar3 nearest_plane_distance
= box
.getNearestPlaneDistance();
191 // access the cell list data arrays
192 ArrayHandle
<unsigned int> d_cell_size(m_cl
->getCellSizeArray(), access_location::device
, access_mode::read
);
193 ArrayHandle
<Scalar4
> d_cell_xyzf(m_cl
->getXYZFArray(), access_location::device
, access_mode::read
);
194 ArrayHandle
<Scalar4
> d_cell_tdb(m_cl
->getTDBArray(), access_location::device
, access_mode::read
);
195 ArrayHandle
<unsigned int> d_cell_adj(m_cl
->getCellAdjArray(), access_location::device
, access_mode::read
);
// output arrays are opened for overwrite -- previous contents are discarded
197 ArrayHandle
<unsigned int> d_nlist(m_nlist
, access_location::device
, access_mode::overwrite
);
198 ArrayHandle
<unsigned int> d_n_neigh(m_n_neigh
, access_location::device
, access_mode::overwrite
);
199 ArrayHandle
<Scalar4
> d_last_pos(m_last_pos
, access_location::device
, access_mode::overwrite
);
201 // start by creating a temporary copy of r_cut squared
202 Scalar rmax
= m_r_cut
+ m_r_buff
;
203 // add d_max - 1.0, if diameter filtering is not already taking care of it
204 if (!m_filter_diameter
)
205 rmax
+= m_d_max
- Scalar(1.0);
206 Scalar rmaxsq
= rmax
*rmax
;
// sanity check: in any periodic direction the box must be wider than twice
// the interaction range, or particles would interact with their own images
208 if ((box
.getPeriodic().x
&& nearest_plane_distance
.x
<= rmax
* 2.0) ||
209 (box
.getPeriodic().y
&& nearest_plane_distance
.y
<= rmax
* 2.0) ||
210 (this->m_sysdef
->getNDimensions() == 3 && box
.getPeriodic().z
&& nearest_plane_distance
.z
<= rmax
* 2.0))
212 m_exec_conf
->msg
->error() << "nlist: Simulation box is too small! Particles would be interacting with themselves." << endl
;
213 throw runtime_error("Error updating neighborlist bins");
// --- compute capability >= 2.0: autotuned shared-memory kernel path ---
216 if (exec_conf
->getComputeCapability() >= 200)
218 // we should not call the tuner with MPI sync enabled
219 // if the kernel is launched more than once in the same timestep,
220 // since those kernel launches may occur only on some, not all MPI ranks
221 bool tune
= !m_param
&& m_last_tuned_timestep
!= timestep
;
222 // tune only when no fixed parameter is set and we have not tuned this step
223 if (tune
) this->m_tuner
->begin();
// m_param == 0 means autotune; otherwise use the user-fixed parameter
224 unsigned int param
= !m_param
? this->m_tuner
->getParam() : m_param
;
// decode block_size*10000 + threads_per_particle (see constructor)
225 unsigned int block_size
= param
/ 10000;
226 unsigned int threads_per_particle
= param
% 10000;
// NOTE(review): most arguments of this launch are missing from the
// extraction (residual lines 229-249 absent)
228 gpu_compute_nlist_binned_shared(d_nlist
.data
,
231 m_conditions
.getDeviceFlags(),
241 m_cl
->getCellIndexer(),
242 m_cl
->getCellListIndexer(),
243 m_cl
->getCellAdjIndexer(),
246 threads_per_particle
,
250 m_cl
->getGhostWidth(),
251 m_exec_conf
->getComputeCapability()/10);
252 if (exec_conf
->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
253 if (tune
) this->m_tuner
->end();
// remember the step so the tuner is invoked at most once per timestep
255 m_last_tuned_timestep
= timestep
;
// --- compute 1.x fallback: double precision is not supported here ---
259 #ifndef SINGLE_PRECISION
260 m_exec_conf
->msg
->error() << "NeighborListGPUBinned doesn't work in double precision on compute 1.x" << endl
;
261 throw runtime_error("Error computing neighbor list");
264 unsigned int ncell
= m_cl
->getDim().x
* m_cl
->getDim().y
* m_cl
->getDim().z
;
266 // update the cuda array allocations (note, this is smart enough to not reallocate when there has been no change)
267 if (needReallocateCudaArrays())
269 allocateCudaArrays();
// cell adjacency has a fixed 27 entries per cell (3x3x3 neighborhood)
270 cudaMemcpyToArray(dca_cell_adj
, 0, 0, d_cell_adj
.data
, sizeof(unsigned int)*ncell
*27, cudaMemcpyDeviceToDevice
);
273 // update the values in those arrays
274 if (m_prof
) m_prof
->push(exec_conf
, "copy");
275 cudaMemcpyToArray(dca_cell_xyzf
, 0, 0, d_cell_xyzf
.data
, sizeof(Scalar4
)*ncell
*m_last_cell_Nmax
, cudaMemcpyDeviceToDevice
);
// TDB data only needs staging when body/diameter filtering reads it
276 if (m_filter_body
|| m_filter_diameter
)
277 cudaMemcpyToArray(dca_cell_tdb
, 0, 0, d_cell_tdb
.data
, sizeof(Scalar4
)*ncell
*m_last_cell_Nmax
, cudaMemcpyDeviceToDevice
);
279 if (m_prof
) m_prof
->pop(exec_conf
);
281 if (exec_conf
->isCUDAErrorCheckingEnabled())
// NOTE(review): most arguments of this launch are missing from the
// extraction (residual lines 285-302 absent)
284 gpu_compute_nlist_binned_1x(d_nlist
.data
,
287 m_conditions
.getDeviceFlags(),
297 m_cl
->getCellIndexer(),
303 m_cl
->getGhostWidth());
305 if (exec_conf
->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
// NOTE(review): unguarded m_prof dereference; see matching push above
309 m_prof
->pop(exec_conf
);
// Determines whether the cudaArray staging buffers must be (re)allocated.
// The visible condition is true when the cell list dimensions and Nmax are
// unchanged since the last allocation and all three arrays already exist.
// NOTE(review): the return statements (residual lines 325-331) are missing
// from this extraction -- presumably false when the condition holds and true
// otherwise; verify against the upstream file.
312 bool NeighborListGPUBinned::needReallocateCudaArrays()
314 // quit now if the dimensions are the same as the last allocation
315 uint3 cur_dim
= m_cl
->getDim();
316 unsigned int cur_cell_Nmax
= m_cl
->getNmax();
318 if (cur_dim
.x
== m_last_dim
.x
&&
319 cur_dim
.y
== m_last_dim
.y
&&
320 cur_dim
.z
== m_last_dim
.z
&&
321 cur_cell_Nmax
== m_last_cell_Nmax
&&
322 dca_cell_adj
!= NULL
&&
323 dca_cell_xyzf
!= NULL
&&
324 dca_cell_tdb
!= NULL
)
// (Re)allocates the cudaArray staging buffers sized to the current cell list
// dimensions, freeing any previous allocations first, and records the new
// dimensions for needReallocateCudaArrays().
// NOTE(review): several lines are missing from this extraction (residual
// lines 350-352, 355, and the function end); the cudaMallocArray return
// values are unchecked in the visible text -- verify error handling upstream.
334 void NeighborListGPUBinned::allocateCudaArrays()
336 // record the current cell list dimensions to compare against on the next call
337 uint3 cur_dim
= m_cl
->getDim();
338 unsigned int cur_cell_Nmax
= m_cl
->getNmax();
340 m_last_dim
= cur_dim
;
341 m_last_cell_Nmax
= cur_cell_Nmax
;
343 // free the old arrays
344 if (dca_cell_adj
!= NULL
)
345 cudaFreeArray(dca_cell_adj
);
346 if (dca_cell_xyzf
!= NULL
)
347 cudaFreeArray(dca_cell_xyzf
);
348 if (dca_cell_tdb
!= NULL
)
349 cudaFreeArray(dca_cell_tdb
);
353 // allocate the new ones
354 unsigned int ncell
= cur_dim
.x
* cur_dim
.y
* cur_dim
.z
;
// xyzf and tdb arrays are both Scalar4: width = Nmax entries, height = ncell
356 cudaChannelFormatDesc xyzf_desc
= cudaCreateChannelDesc
< Scalar4
>();
357 cudaMallocArray(&dca_cell_xyzf
, &xyzf_desc
, cur_cell_Nmax
, ncell
);
358 cudaMallocArray(&dca_cell_tdb
, &xyzf_desc
, cur_cell_Nmax
, ncell
);
// adjacency array: 27 neighbor cells (3x3x3) per cell
359 cudaChannelFormatDesc adj_desc
= cudaCreateChannelDesc
< unsigned int >();
360 cudaMallocArray(&dca_cell_adj
, &adj_desc
, 27, ncell
);
365 void export_NeighborListGPUBinned()
367 class_
<NeighborListGPUBinned
, boost::shared_ptr
<NeighborListGPUBinned
>, bases
<NeighborListGPU
>, boost::noncopyable
>
368 ("NeighborListGPUBinned", init
< boost::shared_ptr
<SystemDefinition
>, Scalar
, Scalar
, boost::shared_ptr
<CellList
> >())
369 .def("setTuningParam", &NeighborListGPUBinned::setTuningParam
)