2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
4 the University of Michigan All rights reserved.
6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
7 copyright is held, by various Contributors who have granted The Regents of the
8 University of Michigan the right to modify and/or distribute such Contributions.
10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
11 and binary forms, provided you abide by the following conditions:
13 * Redistributions of source code must retain the above copyright notice, this
14 list of conditions, and the following disclaimer both in the code and
15 prominently in any materials provided with the distribution.
17 * Redistributions in binary form must reproduce the above copyright notice, this
18 list of conditions, and the following disclaimer in the documentation and/or
19 other materials provided with the distribution.
21 * All publications and presentations based on HOOMD-blue, including any reports
22 or published results obtained, in whole or in part, with HOOMD-blue, will
23 acknowledge its use according to the terms posted at the time of submission on:
24 http://codeblue.umich.edu/hoomd-blue/citations.html
26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
27 http://codeblue.umich.edu/hoomd-blue/
29 * Apart from the above required attributions, neither the name of the copyright
30 holder nor the names of HOOMD-blue's contributors may be used to endorse or
31 promote products derived from this software without specific prior written
36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 // Maintainer: joaander
52 /*! \file NeighborListGPUBinned.cc
53 \brief Defines NeighborListGPUBinned
56 #include "NeighborListGPUBinned.h"
57 #include "NeighborListGPUBinned.cuh"
59 #include <boost/python.hpp>
60 using namespace boost::python
;
63 #include "Communicator.h"
// Constructor: attaches (or creates) a CellList for the GPU binned neighbor
// list and configures the kernel autotuner.
// NOTE(review): this extraction is incomplete -- the r_cut/r_buff parameters
// referenced by the initializer list, the opening/closing braces, and parts of
// the tuner parameter loop are missing (see the gaps in the residual line
// numbers). Verify against the upstream file before editing.
66 NeighborListGPUBinned::NeighborListGPUBinned(boost::shared_ptr
<SystemDefinition
> sysdef
,
69 boost::shared_ptr
<CellList
> cl
)
// forward the system definition and cutoffs to the base class; store the
// cell list handle and default the fixed tuning parameter to 0 (= autotune)
70 : NeighborListGPU(sysdef
, r_cut
, r_buff
), m_cl(cl
), m_param(0)
72 // create a default cell list if one was not specified
74 m_cl
= boost::shared_ptr
<CellList
>(new CellList(sysdef
));
// bin width must cover cutoff + buffer plus the diameter shift (d_max - 1)
76 m_cl
->setNominalWidth(r_cut
+ r_buff
+ m_d_max
- Scalar(1.0));
// TDB (type/diameter/body) cell data is only needed when filtering is active
78 m_cl
->setComputeTDB(false);
83 // default to 0 last allocated quantities
84 m_last_dim
= make_uint3(0,0,0);
90 // initialize autotuner
91 // the full block size and threads_per_particle matrix is searched,
92 // encoded as block_size*10000 + threads_per_particle
93 std::vector
<unsigned int> valid_params
;
94 for (unsigned int block_size
= 32; block_size
<= 1024; block_size
+= 32)
// NOTE(review): the declaration and update of `s` live on lines missing from
// this extraction -- presumably s steps through values up to the device
// warpSize; confirm against the upstream source.
97 while (s
<= this->m_exec_conf
->dev_prop
.warpSize
)
99 valid_params
.push_back(block_size
*10000 + s
);
// construct the tuner over the encoded parameter set; the literals 5 and
// 100000 are presumably the sample count and retune period -- verify against
// the Autotuner constructor
104 m_tuner
.reset(new Autotuner(valid_params
, 5, 100000, "nlist_binned", this->m_exec_conf
));
105 m_last_tuned_timestep
= 0;
108 // synchronize over MPI
109 m_tuner
->setSync(m_pdata
->getDomainDecomposition());
112 // When running on compute 1.x, textures are allocated with the height equal to the number of cells
113 // limit the number of cells to the maximum texture dimension
114 if (exec_conf
->getComputeCapability() < 200)
116 m_cl
->setMaxCells(exec_conf
->dev_prop
.maxTexture2D
[1]);
/*! Destructor: releases the cudaArray staging buffers used for the compute
    1.x texture path. Each handle is freed only if it was actually allocated.
*/
NeighborListGPUBinned::~NeighborListGPUBinned()
    {
    // free the old arrays
    if (dca_cell_adj != NULL)
        cudaFreeArray(dca_cell_adj);
    if (dca_cell_xyzf != NULL)
        cudaFreeArray(dca_cell_xyzf);
    if (dca_cell_tdb != NULL)
        cudaFreeArray(dca_cell_tdb);
    }
133 void NeighborListGPUBinned::setRCut(Scalar r_cut
, Scalar r_buff
)
135 NeighborListGPU::setRCut(r_cut
, r_buff
);
137 m_cl
->setNominalWidth(r_cut
+ r_buff
+ m_d_max
- Scalar(1.0));
140 void NeighborListGPUBinned::setMaximumDiameter(Scalar d_max
)
142 NeighborListGPU::setMaximumDiameter(d_max
);
144 // need to update the cell list settings appropriately
145 m_cl
->setNominalWidth(m_r_cut
+ m_r_buff
+ m_d_max
- Scalar(1.0));
148 void NeighborListGPUBinned::setFilterBody(bool filter_body
)
150 NeighborListGPU::setFilterBody(filter_body
);
152 // need to update the cell list settings appropriately
153 if (m_filter_body
|| m_filter_diameter
)
154 m_cl
->setComputeTDB(true);
156 m_cl
->setComputeTDB(false);
159 void NeighborListGPUBinned::setFilterDiameter(bool filter_diameter
)
161 NeighborListGPU::setFilterDiameter(filter_diameter
);
163 // need to update the cell list settings appropriately
164 if (m_filter_body
|| m_filter_diameter
)
165 m_cl
->setComputeTDB(true);
167 m_cl
->setComputeTDB(false);
// Builds the neighbor list on the GPU from the cell list for the given
// timestep. Two code paths exist: an autotuned shared-memory kernel for
// compute capability >= 2.0, and a texture-based fallback for compute 1.x.
// NOTE(review): this extraction is missing many source lines -- braces, most
// arguments of the two kernel launches, and several guards (see the gaps in
// the residual line numbers). Consult the upstream file before editing.
170 void NeighborListGPUBinned::buildNlist(unsigned int timestep
)
// only full-storage neighbor lists can be built on the GPU
172 if (m_storage_mode
!= full
)
174 m_exec_conf
->msg
->error() << "Only full mode nlists can be generated on the GPU" << endl
;
175 throw runtime_error("Error computing neighbor list");
// bring the cell list up to date before reading from it
178 m_cl
->compute(timestep
);
// NOTE(review): m_prof is dereferenced here without the null guard used
// further below -- possibly a guard line is missing from this extraction
181 m_prof
->push(exec_conf
, "compute");
183 // acquire the particle data
184 ArrayHandle
<Scalar4
> d_pos(m_pdata
->getPositions(), access_location::device
, access_mode::read
);
185 ArrayHandle
<Scalar
> d_diameter(m_pdata
->getDiameters(), access_location::device
, access_mode::read
);
186 ArrayHandle
<unsigned int> d_body(m_pdata
->getBodies(), access_location::device
, access_mode::read
);
188 const BoxDim
& box
= m_pdata
->getBox();
189 Scalar3 nearest_plane_distance
= box
.getNearestPlaneDistance();
191 // access the cell list data arrays
192 ArrayHandle
<unsigned int> d_cell_size(m_cl
->getCellSizeArray(), access_location::device
, access_mode::read
);
193 ArrayHandle
<Scalar4
> d_cell_xyzf(m_cl
->getXYZFArray(), access_location::device
, access_mode::read
);
194 ArrayHandle
<Scalar4
> d_cell_tdb(m_cl
->getTDBArray(), access_location::device
, access_mode::read
);
195 ArrayHandle
<unsigned int> d_cell_adj(m_cl
->getCellAdjArray(), access_location::device
, access_mode::read
);
// output arrays are opened for overwrite -- previous contents are discarded
197 ArrayHandle
<unsigned int> d_nlist(m_nlist
, access_location::device
, access_mode::overwrite
);
198 ArrayHandle
<unsigned int> d_n_neigh(m_n_neigh
, access_location::device
, access_mode::overwrite
);
199 ArrayHandle
<Scalar4
> d_last_pos(m_last_pos
, access_location::device
, access_mode::overwrite
);
201 // start by creating a temporary copy of r_cut squared
202 Scalar rmax
= m_r_cut
+ m_r_buff
;
203 // add d_max - 1.0, if diameter filtering is not already taking care of it
204 if (!m_filter_diameter
)
205 rmax
+= m_d_max
- Scalar(1.0);
206 Scalar rmaxsq
= rmax
*rmax
;
// sanity check: in any periodic direction the box must be wider than twice
// the interaction range, or particles would interact with their own images
208 if ((box
.getPeriodic().x
&& nearest_plane_distance
.x
<= rmax
* 2.0) ||
209 (box
.getPeriodic().y
&& nearest_plane_distance
.y
<= rmax
* 2.0) ||
210 (this->m_sysdef
->getNDimensions() == 3 && box
.getPeriodic().z
&& nearest_plane_distance
.z
<= rmax
* 2.0))
212 m_exec_conf
->msg
->error() << "nlist: Simulation box is too small! Particles would be interacting with themselves." << endl
;
213 throw runtime_error("Error updating neighborlist bins");
// --- compute capability >= 2.0: autotuned shared-memory kernel path ---
216 if (exec_conf
->getComputeCapability() >= 200)
218 // we should not call the tuner with MPI sync enabled
219 // if the kernel is launched more than once in the same timestep,
220 // since those kernel launches may occur only on some, not all MPI ranks
221 bool tune
= !m_param
&& m_last_tuned_timestep
!= timestep
;
222 // tune only when no fixed parameter is set and we have not tuned this step
223 if (tune
) this->m_tuner
->begin();
// m_param == 0 means autotune; otherwise use the user-fixed parameter
224 unsigned int param
= !m_param
? this->m_tuner
->getParam() : m_param
;
// decode block_size*10000 + threads_per_particle (see constructor)
225 unsigned int block_size
= param
/ 10000;
226 unsigned int threads_per_particle
= param
% 10000;
// NOTE(review): most arguments of this launch are missing from the
// extraction (residual lines 229-249 absent)
228 gpu_compute_nlist_binned_shared(d_nlist
.data
,
231 m_conditions
.getDeviceFlags(),
241 m_cl
->getCellIndexer(),
242 m_cl
->getCellListIndexer(),
243 m_cl
->getCellAdjIndexer(),
246 threads_per_particle
,
250 m_cl
->getGhostWidth(),
251 m_exec_conf
->getComputeCapability()/10);
252 if (exec_conf
->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
253 if (tune
) this->m_tuner
->end();
// remember the step so the tuner is invoked at most once per timestep
255 m_last_tuned_timestep
= timestep
;
// --- compute 1.x fallback: double precision is not supported here ---
259 #ifndef SINGLE_PRECISION
260 m_exec_conf
->msg
->error() << "NeighborListGPUBinned doesn't work in double precision on compute 1.x" << endl
;
261 throw runtime_error("Error computing neighbor list");
264 unsigned int ncell
= m_cl
->getDim().x
* m_cl
->getDim().y
* m_cl
->getDim().z
;
266 // update the cuda array allocations (note, this is smart enough to not reallocate when there has been no change)
267 if (needReallocateCudaArrays())
269 allocateCudaArrays();
// cell adjacency has a fixed 27 entries per cell (3x3x3 neighborhood)
270 cudaMemcpyToArray(dca_cell_adj
, 0, 0, d_cell_adj
.data
, sizeof(unsigned int)*ncell
*27, cudaMemcpyDeviceToDevice
);
273 // update the values in those arrays
274 if (m_prof
) m_prof
->push(exec_conf
, "copy");
275 cudaMemcpyToArray(dca_cell_xyzf
, 0, 0, d_cell_xyzf
.data
, sizeof(Scalar4
)*ncell
*m_last_cell_Nmax
, cudaMemcpyDeviceToDevice
);
// TDB data only needs staging when body/diameter filtering reads it
276 if (m_filter_body
|| m_filter_diameter
)
277 cudaMemcpyToArray(dca_cell_tdb
, 0, 0, d_cell_tdb
.data
, sizeof(Scalar4
)*ncell
*m_last_cell_Nmax
, cudaMemcpyDeviceToDevice
);
279 if (m_prof
) m_prof
->pop(exec_conf
);
281 if (exec_conf
->isCUDAErrorCheckingEnabled())
// NOTE(review): most arguments of this launch are missing from the
// extraction (residual lines 285-302 absent)
284 gpu_compute_nlist_binned_1x(d_nlist
.data
,
287 m_conditions
.getDeviceFlags(),
297 m_cl
->getCellIndexer(),
303 m_cl
->getGhostWidth());
305 if (exec_conf
->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
// NOTE(review): unguarded m_prof dereference; see matching push above
309 m_prof
->pop(exec_conf
);
// Determines whether the cudaArray staging buffers must be (re)allocated.
// The visible condition is true when the cell list dimensions and Nmax are
// unchanged since the last allocation and all three arrays already exist.
// NOTE(review): the return statements (residual lines 325-331) are missing
// from this extraction -- presumably false when the condition holds and true
// otherwise; verify against the upstream file.
312 bool NeighborListGPUBinned::needReallocateCudaArrays()
314 // quit now if the dimensions are the same as the last allocation
315 uint3 cur_dim
= m_cl
->getDim();
316 unsigned int cur_cell_Nmax
= m_cl
->getNmax();
318 if (cur_dim
.x
== m_last_dim
.x
&&
319 cur_dim
.y
== m_last_dim
.y
&&
320 cur_dim
.z
== m_last_dim
.z
&&
321 cur_cell_Nmax
== m_last_cell_Nmax
&&
322 dca_cell_adj
!= NULL
&&
323 dca_cell_xyzf
!= NULL
&&
324 dca_cell_tdb
!= NULL
)
// (Re)allocates the cudaArray staging buffers sized to the current cell list
// dimensions, freeing any previous allocations first, and records the new
// dimensions for needReallocateCudaArrays().
// NOTE(review): several lines are missing from this extraction (residual
// lines 350-352, 355, and the function end); the cudaMallocArray return
// values are unchecked in the visible text -- verify error handling upstream.
334 void NeighborListGPUBinned::allocateCudaArrays()
336 // record the current cell list dimensions to compare against on the next call
337 uint3 cur_dim
= m_cl
->getDim();
338 unsigned int cur_cell_Nmax
= m_cl
->getNmax();
340 m_last_dim
= cur_dim
;
341 m_last_cell_Nmax
= cur_cell_Nmax
;
343 // free the old arrays
344 if (dca_cell_adj
!= NULL
)
345 cudaFreeArray(dca_cell_adj
);
346 if (dca_cell_xyzf
!= NULL
)
347 cudaFreeArray(dca_cell_xyzf
);
348 if (dca_cell_tdb
!= NULL
)
349 cudaFreeArray(dca_cell_tdb
);
353 // allocate the new ones
354 unsigned int ncell
= cur_dim
.x
* cur_dim
.y
* cur_dim
.z
;
// xyzf and tdb arrays are both Scalar4: width = Nmax entries, height = ncell
356 cudaChannelFormatDesc xyzf_desc
= cudaCreateChannelDesc
< Scalar4
>();
357 cudaMallocArray(&dca_cell_xyzf
, &xyzf_desc
, cur_cell_Nmax
, ncell
);
358 cudaMallocArray(&dca_cell_tdb
, &xyzf_desc
, cur_cell_Nmax
, ncell
);
// adjacency array: 27 neighbor cells (3x3x3) per cell
359 cudaChannelFormatDesc adj_desc
= cudaCreateChannelDesc
< unsigned int >();
360 cudaMallocArray(&dca_cell_adj
, &adj_desc
, 27, ncell
);
365 void export_NeighborListGPUBinned()
367 class_
<NeighborListGPUBinned
, boost::shared_ptr
<NeighborListGPUBinned
>, bases
<NeighborListGPU
>, boost::noncopyable
>
368 ("NeighborListGPUBinned", init
< boost::shared_ptr
<SystemDefinition
>, Scalar
, Scalar
, boost::shared_ptr
<CellList
> >())
369 .def("setTuningParam", &NeighborListGPUBinned::setTuningParam
)