libhoomd/computes_gpu/NeighborListGPUBinned.cu

   1 /*
   2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
   3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
   4 the University of Michigan All rights reserved.
   5
   6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
   7 copyright is held, by various Contributors who have granted The Regents of the
   8 University of Michigan the right to modify and/or distribute such Contributions.
   9
  10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
  11 and binary forms, provided you abide by the following conditions:
  12
  13 * Redistributions of source code must retain the above copyright notice, this
  14 list of conditions, and the following disclaimer both in the code and
  15 prominently in any materials provided with the distribution.
  16
  17 * Redistributions in binary form must reproduce the above copyright notice, this
  18 list of conditions, and the following disclaimer in the documentation and/or
  19 other materials provided with the distribution.
  20
  21 * All publications and presentations based on HOOMD-blue, including any reports
  22 or published results obtained, in whole or in part, with HOOMD-blue, will
  23 acknowledge its use according to the terms posted at the time of submission on:
  24 http://codeblue.umich.edu/hoomd-blue/citations.html
  25
  26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
  27 http://codeblue.umich.edu/hoomd-blue/
  28
  29 * Apart from the above required attributions, neither the name of the copyright
  30 holder nor the names of HOOMD-blue's contributors may be used to endorse or
  31 promote products derived from this software without specific prior written
  32 permission.
  33
  34 Disclaimer
  35
  36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
  37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
  39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
  40
  41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
  46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  48 */
  49
  50 // Maintainer: joaander
  51
  52 #include "NeighborListGPUBinned.cuh"
  53 #include "TextureTools.h"
  54
  55 /*! \file NeighborListGPUBinned.cu
  56     \brief Defines GPU kernel code for O(N) neighbor list generation on the GPU
  57 */
  58
  59 //! Texture for reading d_cell_xyzf
  60 scalar4_tex_t cell_xyzf_1d_tex;
  61
  62 //! Warp-centric scan
  63 template<int NT>
  64 struct warp_scan
  65     {
  66     #if __CUDA_ARCH__ >= 300
  67     enum { capacity = 0 }; // uses no shared memory
  68     #else
  69     enum { capacity = NT > 1 ? (2 * NT + 1) : 1};
  70     #endif
  71
  72     __device__ static int Scan(int tid, unsigned char x, volatile unsigned char *shared, unsigned char* total)
  73         {
  74         #if __CUDA_ARCH__ >= 300
  75         // Kepler version
  76         unsigned int laneid;
  77         //This command gets the lane ID within the current warp
  78         asm("mov.u32 %0, %%laneid;" : "=r"(laneid));
  79
  80         int first = laneid - tid;
  81
  82         #pragma unroll
  83         for(int offset = 1; offset < NT; offset += offset)
  84             {
  85             int y = __shfl(x,(first + tid - offset) &(WARP_SIZE -1));
  86             if(tid >= offset) x += y;
  87             }
  88
  89         // all threads get the total from the last thread in the cta
  90         *total = __shfl(x,first + NT - 1);
  91
  92         // shift by one (exclusive scan)
  93         int y = __shfl(x,(first + tid - 1) &(WARP_SIZE-1));
  94         x = tid ? y : 0;
  95
  96         #else // __CUDA_ARCH__ >= 300
  97
  98         shared[tid] = x;
  99         int first = 0;
 100         // no syncthreads here (inside warp)
 101
 102         for(int offset = 1; offset < NT; offset += offset)
 103             {
 104             if(tid >= offset)
 105                 x = shared[first + tid - offset] + x;
 106             first = NT - first;
 107             shared[first + tid] = x;
 108             // no syncthreads here (inside warp)
 109             }
 110         *total = shared[first + NT - 1];
 111
 112         // shift by one (exclusive scan)
 113         x = tid ? shared[first + tid - 1] : 0;
 114         #endif
 115         // no syncthreads here (inside warp)
 116         return x;
 117         }
 118     };
 119
 120 //! Kernel call for generating neighbor list on the GPU (shared memory version)
 121 /*! \tparam flags Set bit 1 to enable body filtering. Set bit 2 to enable diameter filtering.
 122     \param d_nlist Neighbor list data structure to write
 123     \param d_n_neigh Number of neighbors to write
 124     \param d_last_updated_pos Particle positions at this update are written to this array
 125     \param d_conditions Conditions array for writing overflow condition
 126     \param nli Indexer to access \a d_nlist
 127     \param d_pos Particle positions
 128     \param d_body Particle body indices
 129     \param d_diameter Particle diameters
 130     \param N Number of particles
 131     \param d_cell_size Number of particles in each cell
 132     \param d_cell_xyzf Cell contents (xyzf array from CellList with flag=type)
 133     \param d_cell_tdb Cell contents (tdb array from CellList with)
 134     \param d_cell_adj Cell adjacency list
 135     \param ci Cell indexer for indexing cells
 136     \param cli Cell list indexer for indexing into d_cell_xyzf
 137     \param cadji Adjacent cell indexer listing the 27 neighboring cells
 138     \param box Simulation box dimensions
 139     \param r_maxsq The maximum radius for which to include particles as neighbors, squared
 140     \param r_max The maximum radius for which to include particles as neighbors
 141     \param ghost_width Width of ghost cell layer
 142
 143     \note optimized for Fermi
 144 */
 145 template<unsigned char flags, int threads_per_particle>
 146 __global__ void gpu_compute_nlist_binned_shared_kernel(unsigned int *d_nlist,
 147                                                     unsigned int *d_n_neigh,
 148                                                     Scalar4 *d_last_updated_pos,
 149                                                     unsigned int *d_conditions,
 150                                                     const Index2D nli,
 151                                                     const Scalar4 *d_pos,
 152                                                     const unsigned int *d_body,
 153                                                     const Scalar *d_diameter,
 154                                                     const unsigned int N,
 155                                                     const unsigned int *d_cell_size,
 156                                                     const Scalar4 *d_cell_xyzf,
 157                                                     const Scalar4 *d_cell_tdb,
 158                                                     const unsigned int *d_cell_adj,
 159                                                     const Index3D ci,
 160                                                     const Index2D cli,
 161                                                     const Index2D cadji,
 162                                                     const BoxDim box,
 163                                                     const Scalar r_maxsq,
 164                                                     const Scalar r_max,
 165                                                     const Scalar3 ghost_width)
 166     {
 167     bool filter_body = flags & 1;
 168     bool filter_diameter = flags & 2;
 169
 170     // each set of threads_per_particle threads is going to compute the neighbor list for a single particle
 171     int my_pidx;
 172     if (gridDim.y > 1)
 173         {
 174         // fermi workaround
 175         my_pidx = (blockIdx.x + blockIdx.y*65535) * (blockDim.x/threads_per_particle) + threadIdx.x/threads_per_particle;
 176         }
 177     else
 178         {
 179         my_pidx = blockIdx.x * (blockDim.x/threads_per_particle) + threadIdx.x/threads_per_particle;
 180         }
 181
 182     // return early if out of bounds
 183     if (my_pidx >= N) return;
 184
 185     // first, determine which bin this particle belongs to
 186     Scalar4 my_postype = d_pos[my_pidx];
 187     Scalar3 my_pos = make_scalar3(my_postype.x, my_postype.y, my_postype.z);
 188
 189     unsigned int my_body = d_body[my_pidx];
 190     Scalar my_diameter = d_diameter[my_pidx];
 191
 192     Scalar3 f = box.makeFraction(my_pos, ghost_width);
 193
 194     // find the bin each particle belongs in
 195     int ib = (int)(f.x * ci.getW());
 196     int jb = (int)(f.y * ci.getH());
 197     int kb = (int)(f.z * ci.getD());
 198
 199     uchar3 periodic = box.getPeriodic();
 200
 201     // need to handle the case where the particle is exactly at the box hi
 202     if (ib == ci.getW() && periodic.x)
 203         ib = 0;
 204     if (jb == ci.getH() && periodic.y)
 205         jb = 0;
 206     if (kb == ci.getD() && periodic.z)
 207         kb = 0;
 208
 209     int my_cell = ci(ib,jb,kb);
 210
 211     // shared memory (volatile is required, since we are doing warp-centric)
 212     volatile extern __shared__ unsigned char sh[];
 213
 214     // index of current neighbor
 215     unsigned int cur_adj = 0;
 216
 217     // current cell
 218     unsigned int neigh_cell = d_cell_adj[cadji(cur_adj, my_cell)];
 219
 220     // size of current cell
 221     unsigned int neigh_size = d_cell_size[neigh_cell];
 222
 223     // offset of cta in shared memory
 224     int cta_offs = (threadIdx.x/threads_per_particle)*warp_scan<threads_per_particle>::capacity;
 225
 226     // current index in cell
 227     int cur_offset = threadIdx.x % threads_per_particle;
 228
 229     bool done = false;
 230
 231     // total number of neighbors
 232     unsigned int nneigh = 0;
 233
 234     while (! done)
 235         {
 236         // initalize with default
 237         unsigned int neighbor;
 238         unsigned char has_neighbor = 0;
 239
 240         // advance neighbor cell
 241         while (cur_offset >= neigh_size && !done )
 242             {
 243             cur_offset -= neigh_size;
 244             cur_adj++;
 245             if (cur_adj < cadji.getW())
 246                 {
 247                 neigh_cell = d_cell_adj[cadji(cur_adj, my_cell)];
 248                 neigh_size = d_cell_size[neigh_cell];
 249                 }
 250             else
 251                 // we are past the end of the cell neighbors
 252                 done = true;
 253             }
 254
 255         // if the first thread in the cta has no work, terminate the loop
 256         if (done && !(threadIdx.x % threads_per_particle)) break;
 257
 258         if (!done)
 259             {
 260             Scalar4 cur_xyzf = texFetchScalar4(d_cell_xyzf, cell_xyzf_1d_tex, cli(cur_offset, neigh_cell));
 261
 262             Scalar4 cur_tdb = make_scalar4(0, 0, 0, 0);
 263             if (filter_diameter || filter_body)
 264                 cur_tdb = d_cell_tdb[cli(cur_offset, neigh_cell)];
 265
 266             // advance cur_offset
 267             cur_offset += threads_per_particle;
 268
 269             unsigned int neigh_body = __scalar_as_int(cur_tdb.z);
 270             Scalar neigh_diameter = cur_tdb.y;
 271
 272             Scalar3 neigh_pos = make_scalar3(cur_xyzf.x,
 273                                            cur_xyzf.y,
 274                                            cur_xyzf.z);
 275             int cur_neigh = __scalar_as_int(cur_xyzf.w);
 276
 277             // compute the distance between the two particles
 278             Scalar3 dx = my_pos - neigh_pos;
 279
 280             // wrap the periodic boundary conditions
 281             dx = box.minImage(dx);
 282
 283             // compute dr squared
 284             Scalar drsq = dot(dx,dx);
 285
 286             bool excluded = (my_pidx == cur_neigh);
 287
 288             if (filter_body && my_body != 0xffffffff)
 289                 excluded = excluded | (my_body == neigh_body);
 290
 291             Scalar sqshift(0.0);
 292             if (filter_diameter)
 293                 {
 294                 // compute the shift in radius to accept neighbors based on their diameters
 295                 Scalar delta = (my_diameter + neigh_diameter) * Scalar(0.5) - Scalar(1.0);
 296                 // r^2 < (r_max + delta)^2
 297                 // r^2 < r_maxsq + delta^2 + 2*r_max*delta
 298                 sqshift = (delta + Scalar(2.0) * r_max) * delta;
 299                 }
 300
 301             // store result in shared memory
 302             if (drsq <= (r_maxsq + sqshift) && !excluded)
 303                 {
 304                 neighbor = cur_neigh;
 305                 has_neighbor = 1;
 306                 }
 307             }
 308
 309         // no syncthreads here, we assume threads_per_particle < warp size
 310
 311         // scan over flags
 312         unsigned char n;
 313         int k = warp_scan<threads_per_particle>::Scan(threadIdx.x % threads_per_particle,
 314             has_neighbor, &sh[cta_offs], &n);
 315
 316         if (has_neighbor && nneigh + k < nli.getH())
 317             d_nlist[nli(my_pidx, nneigh + k)] = neighbor;
 318
 319         // increment total neighbor count
 320         nneigh += n;
 321         } // end while
 322
 323     if (threadIdx.x % threads_per_particle == 0)
 324         {
 325         // flag if we need to grow the neighbor list
 326         if (nneigh >= nli.getH())
 327             atomicMax(&d_conditions[0], nneigh);
 328
 329         d_n_neigh[my_pidx] = nneigh;
 330         d_last_updated_pos[my_pidx] = my_postype;
 331         }
 332     }
 333
 334 //! determine maximum possible block size
 335 template<typename T>
 336 int get_max_block_size(T func)
 337     {
 338     cudaFuncAttributes attr;
 339     cudaFuncGetAttributes(&attr, (const void*)func);
 340     int max_threads = attr.maxThreadsPerBlock;
 341     // number of threads has to be multiple of warp size
 342     max_threads -= max_threads % max_threads_per_particle;
 343     return max_threads;
 344     }
 345
 346 void gpu_nlist_binned_bind_texture(const Scalar4 *d_cell_xyzf, unsigned int n_elements)
 347     {
 348     // bind the position texture
 349     cell_xyzf_1d_tex.normalized = false;
 350     cell_xyzf_1d_tex.filterMode = cudaFilterModePoint;
 351     cudaBindTexture(0, cell_xyzf_1d_tex, d_cell_xyzf, sizeof(Scalar4)*n_elements);
 352     }
 353
 354 //! recursive template to launch neighborlist with given template parameters
 355 /* \tparam cur_tpp Number of threads per particle (assumed to be power of two) */
 356 template<int cur_tpp>
 357 inline void launcher(unsigned int *d_nlist,
 358               unsigned int *d_n_neigh,
 359               Scalar4 *d_last_updated_pos,
 360               unsigned int *d_conditions,
 361               const Index2D nli,
 362               const Scalar4 *d_pos,
 363               const unsigned int *d_body,
 364               const Scalar *d_diameter,
 365               const unsigned int N,
 366               const unsigned int *d_cell_size,
 367               const Scalar4 *d_cell_xyzf,
 368               const Scalar4 *d_cell_tdb,
 369               const unsigned int *d_cell_adj,
 370               const Index3D ci,
 371               const Index2D cli,
 372               const Index2D cadji,
 373               const BoxDim box,
 374               const Scalar r_maxsq,
 375               const Scalar r_max,
 376               const Scalar3 ghost_width,
 377               const unsigned int compute_capability,
 378               unsigned int tpp,
 379               bool filter_diameter,
 380               bool filter_body,
 381               unsigned int block_size)
 382     {
 383     unsigned int shared_size = 0;
 384
 385     if (tpp == cur_tpp && cur_tpp != 0)
 386         {
 387         if (!filter_diameter && !filter_body)
 388             {
 389             static unsigned int max_block_size = UINT_MAX;
 390             if (max_block_size == UINT_MAX)
 391                 max_block_size = get_max_block_size(gpu_compute_nlist_binned_shared_kernel<0,cur_tpp>);
 392             if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements());
 393
 394             block_size = block_size < max_block_size ? block_size : max_block_size;
 395             dim3 grid(N / (block_size/tpp) + 1);
 396             if (compute_capability < 30 && grid.x > 65535)
 397                 {
 398                 grid.y = grid.x/65535 + 1;
 399                 grid.x = 65535;
 400                 }
 401
 402             if (compute_capability < 30) shared_size = warp_scan<cur_tpp>::capacity*sizeof(unsigned char)*(block_size/cur_tpp);
 403
 404             gpu_compute_nlist_binned_shared_kernel<0,cur_tpp><<<grid, block_size,shared_size>>>(d_nlist,
 405                                                                              d_n_neigh,
 406                                                                              d_last_updated_pos,
 407                                                                              d_conditions,
 408                                                                              nli,
 409                                                                              d_pos,
 410                                                                              d_body,
 411                                                                              d_diameter,
 412                                                                              N,
 413                                                                              d_cell_size,
 414                                                                              d_cell_xyzf,
 415                                                                              d_cell_tdb,
 416                                                                              d_cell_adj,
 417                                                                              ci,
 418                                                                              cli,
 419                                                                              cadji,
 420                                                                              box,
 421                                                                              r_maxsq,
 422                                                                              sqrtf(r_maxsq),
 423                                                                              ghost_width);
 424             }
 425         else if (!filter_diameter && filter_body)
 426             {
 427             static unsigned int max_block_size = UINT_MAX;
 428             if (max_block_size == UINT_MAX)
 429                 max_block_size = get_max_block_size(gpu_compute_nlist_binned_shared_kernel<1,cur_tpp>);
 430             if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements());
 431
 432             block_size = block_size < max_block_size ? block_size : max_block_size;
 433             dim3 grid(N / (block_size/tpp) + 1);
 434             if (compute_capability < 30 && grid.x > 65535)
 435                 {
 436                 grid.y = grid.x/65535 + 1;
 437                 grid.x = 65535;
 438                 }
 439
 440             if (compute_capability < 30) shared_size = warp_scan<cur_tpp>::capacity*sizeof(unsigned char)*(block_size/cur_tpp);
 441
 442             gpu_compute_nlist_binned_shared_kernel<1,cur_tpp><<<grid, block_size,shared_size>>>(d_nlist,
 443                                                                              d_n_neigh,
 444                                                                              d_last_updated_pos,
 445                                                                              d_conditions,
 446                                                                              nli,
 447                                                                              d_pos,
 448                                                                              d_body,
 449                                                                              d_diameter,
 450                                                                              N,
 451                                                                              d_cell_size,
 452                                                                              d_cell_xyzf,
 453                                                                              d_cell_tdb,
 454                                                                              d_cell_adj,
 455                                                                              ci,
 456                                                                              cli,
 457                                                                              cadji,
 458                                                                              box,
 459                                                                              r_maxsq,
 460                                                                              sqrtf(r_maxsq),
 461                                                                              ghost_width);
 462             }
 463         else if (filter_diameter && !filter_body)
 464             {
 465             static unsigned int max_block_size = UINT_MAX;
 466             if (max_block_size == UINT_MAX)
 467                 max_block_size = get_max_block_size(gpu_compute_nlist_binned_shared_kernel<2,cur_tpp>);
 468             if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements());
 469
 470             block_size = block_size < max_block_size ? block_size : max_block_size;
 471             dim3 grid(N / (block_size/tpp) + 1);
 472             if (compute_capability < 30 && grid.x > 65535)
 473                 {
 474                 grid.y = grid.x/65535 + 1;
 475                 grid.x = 65535;
 476                 }
 477
 478             if (compute_capability < 30) shared_size = warp_scan<cur_tpp>::capacity*sizeof(unsigned char)*(block_size/cur_tpp);
 479
 480             gpu_compute_nlist_binned_shared_kernel<2,cur_tpp><<<grid, block_size,shared_size>>>(d_nlist,
 481                                                                              d_n_neigh,
 482                                                                              d_last_updated_pos,
 483                                                                              d_conditions,
 484                                                                              nli,
 485                                                                              d_pos,
 486                                                                              d_body,
 487                                                                              d_diameter,
 488                                                                              N,
 489                                                                              d_cell_size,
 490                                                                              d_cell_xyzf,
 491                                                                              d_cell_tdb,
 492                                                                              d_cell_adj,
 493                                                                              ci,
 494                                                                              cli,
 495                                                                              cadji,
 496                                                                              box,
 497                                                                              r_maxsq,
 498                                                                              sqrtf(r_maxsq),
 499                                                                              ghost_width);
 500             }
 501         else if (filter_diameter && filter_body)
 502             {
 503             static unsigned int max_block_size = UINT_MAX;
 504             if (max_block_size == UINT_MAX)
 505                 max_block_size = get_max_block_size(gpu_compute_nlist_binned_shared_kernel<3,cur_tpp>);
 506             if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements());
 507
 508             block_size = block_size < max_block_size ? block_size : max_block_size;
 509             dim3 grid(N / (block_size/tpp) + 1);
 510             if (compute_capability < 30 && grid.x > 65535)
 511                 {
 512                 grid.y = grid.x/65535 + 1;
 513                 grid.x = 65535;
 514                 }
 515
 516             if (compute_capability < 30) shared_size = warp_scan<cur_tpp>::capacity*sizeof(unsigned char)*(block_size/cur_tpp);
 517
 518             gpu_compute_nlist_binned_shared_kernel<3,cur_tpp><<<grid, block_size,shared_size>>>(d_nlist,
 519                                                                              d_n_neigh,
 520                                                                              d_last_updated_pos,
 521                                                                              d_conditions,
 522                                                                              nli,
 523                                                                              d_pos,
 524                                                                              d_body,
 525                                                                              d_diameter,
 526                                                                              N,
 527                                                                              d_cell_size,
 528                                                                              d_cell_xyzf,
 529                                                                              d_cell_tdb,
 530                                                                              d_cell_adj,
 531                                                                              ci,
 532                                                                              cli,
 533                                                                              cadji,
 534                                                                              box,
 535                                                                              r_maxsq,
 536                                                                              sqrtf(r_maxsq),
 537                                                                              ghost_width);
 538             }
 539         }
 540     else
 541         {
 542         launcher<cur_tpp/2>(d_nlist,
 543                      d_n_neigh,
 544                      d_last_updated_pos,
 545                      d_conditions,
 546                      nli,
 547                      d_pos,
 548                      d_body,
 549                      d_diameter,
 550                      N,
 551                      d_cell_size,
 552                      d_cell_xyzf,
 553                      d_cell_tdb,
 554                      d_cell_adj,
 555                      ci,
 556                      cli,
 557                      cadji,
 558                      box,
 559                      r_maxsq,
 560                      sqrtf(r_maxsq),
 561                      ghost_width,
 562                      compute_capability,
 563                      tpp,
 564                      filter_diameter,
 565                      filter_body,
 566                      block_size
 567                      );
 568         }
 569     }
 570
 571 //! template specialization to terminate recursion
 572 template<>
 573 inline void launcher<min_threads_per_particle/2>(unsigned int *d_nlist,
 574               unsigned int *d_n_neigh,
 575               Scalar4 *d_last_updated_pos,
 576               unsigned int *d_conditions,
 577               const Index2D nli,
 578               const Scalar4 *d_pos,
 579               const unsigned int *d_body,
 580               const Scalar *d_diameter,
 581               const unsigned int N,
 582               const unsigned int *d_cell_size,
 583               const Scalar4 *d_cell_xyzf,
 584               const Scalar4 *d_cell_tdb,
 585               const unsigned int *d_cell_adj,
 586               const Index3D ci,
 587               const Index2D cli,
 588               const Index2D cadji,
 589               const BoxDim box,
 590               const Scalar r_maxsq,
 591               const Scalar r_max,
 592               const Scalar3 ghost_width,
 593               const unsigned int compute_capability,
 594               unsigned int tpp,
 595               bool filter_diameter,
 596               bool filter_body,
 597               unsigned int block_size)
 598     { }
 599
 600 cudaError_t gpu_compute_nlist_binned_shared(unsigned int *d_nlist,
 601                                      unsigned int *d_n_neigh,
 602                                      Scalar4 *d_last_updated_pos,
 603                                      unsigned int *d_conditions,
 604                                      const Index2D& nli,
 605                                      const Scalar4 *d_pos,
 606                                      const unsigned int *d_body,
 607                                      const Scalar *d_diameter,
 608                                      const unsigned int N,
 609                                      const unsigned int *d_cell_size,
 610                                      const Scalar4 *d_cell_xyzf,
 611                                      const Scalar4 *d_cell_tdb,
 612                                      const unsigned int *d_cell_adj,
 613                                      const Index3D& ci,
 614                                      const Index2D& cli,
 615                                      const Index2D& cadji,
 616                                      const BoxDim& box,
 617                                      const Scalar r_maxsq,
 618                                      const unsigned int threads_per_particle,
 619                                      const unsigned int block_size,
 620                                      bool filter_body,
 621                                      bool filter_diameter,
 622                                      const Scalar3& ghost_width,
 623                                      const unsigned int compute_capability)
 624     {
 625     launcher<max_threads_per_particle>(d_nlist,
 626                                    d_n_neigh,
 627                                    d_last_updated_pos,
 628                                    d_conditions,
 629                                    nli,
 630                                    d_pos,
 631                                    d_body,
 632                                    d_diameter,
 633                                    N,
 634                                    d_cell_size,
 635                                    d_cell_xyzf,
 636                                    d_cell_tdb,
 637                                    d_cell_adj,
 638                                    ci,
 639                                    cli,
 640                                    cadji,
 641                                    box,
 642                                    r_maxsq,
 643                                    sqrtf(r_maxsq),
 644                                    ghost_width,
 645                                    compute_capability,
 646                                    threads_per_particle,
 647                                    filter_diameter,
 648                                    filter_body,
 649                                    block_size
 650                                    );
 651
 652     return cudaSuccess;
 653     }
 654
 655 // don't compile the 1x nlist kernel in double precision builds
 656 #ifdef SINGLE_PRECISION
 657 //! Texture for reading d_cell_adj
 658 texture<unsigned int, 2, cudaReadModeElementType> cell_adj_tex;
 659 //! Texture for reading d_cell_size
 660 texture<unsigned int, 1, cudaReadModeElementType> cell_size_tex;
 661 //! Texture for reading d_cell_xyzf
 662 texture<Scalar4, 2, cudaReadModeElementType> cell_xyzf_tex;
 663 //! Texture for reading d_cell_tdb
 664 texture<Scalar4, 2, cudaReadModeElementType> cell_tdb_tex;
 665
 666 //! Kernel call for generating neighbor list on the GPU
 667 /*! \tparam filter_flags Set bit 1 to enable body filtering. Set bit 2 to enable diameter filtering.
 668     \param d_nlist Neighbor list data structure to write
 669     \param d_n_neigh Number of neighbors to write
 670     \param d_last_updated_pos Particle positions at this update are written to this array
 671     \param d_conditions Conditions array for writing overflow condition
 672     \param nli Indexer to access \a d_nlist
 673     \param d_pos Particle positions
 674     \param d_body Particle body indices
 675     \param d_diameter Particle diameters
 676     \param N Number of particles
 677     \param ci Cell indexer for indexing cells
 678     \param box Simulation box dimensions
 679     \param r_maxsq The maximum radius for which to include particles as neighbors, squared
 680     \param r_max The maximum radius for which to include particles as neighbors
 681     \param ghost_width Width of ghost cell layer
 682
 683     \note optimized for compute 1.x devices
 684 */
 685 template<unsigned char filter_flags>
 686 __global__ void gpu_compute_nlist_binned_1x_kernel(unsigned int *d_nlist,
 687                                                    unsigned int *d_n_neigh,
 688                                                    Scalar4 *d_last_updated_pos,
 689                                                    unsigned int *d_conditions,
 690                                                    const Index2D nli,
 691                                                    const Scalar4 *d_pos,
 692                                                    const unsigned int *d_body,
 693                                                    const Scalar *d_diameter,
 694                                                    const unsigned int N,
 695                                                    const Index3D ci,
 696                                                    const BoxDim box,
 697                                                    const float r_maxsq,
 698                                                    const float r_max,
 699                                                    const Scalar3 ghost_width)
 700     {
 701     bool filter_body = filter_flags & 1;
 702     bool filter_diameter = filter_flags & 2;
 703
 704     // each thread is going to compute the neighbor list for a single particle
 705     int my_pidx = blockDim.x * blockIdx.x + threadIdx.x;
 706
 707     // count the number of neighbors needed
 708     unsigned int n_neigh_needed = 0;
 709
 710     // quit early if we are past the end of the array
 711     if (my_pidx >= N)
 712         return;
 713
 714     // first, determine which bin this particle belongs to
 715     Scalar4 my_postype = d_pos[my_pidx];
 716     Scalar3 my_pos = make_scalar3(my_postype.x, my_postype.y, my_postype.z);
 717
 718     unsigned int my_body = d_body[my_pidx];
 719     Scalar my_diameter = d_diameter[my_pidx];
 720
 721     // get periodic flags
 722     uchar3 periodic = box.getPeriodic();
 723
 724     // find the bin each particle belongs in
 725     Scalar3 f = box.makeFraction(my_pos,ghost_width);
 726     unsigned int ib = (unsigned int)(f.x * ci.getW());
 727     unsigned int jb = (unsigned int)(f.y * ci.getH());
 728     unsigned int kb = (unsigned int)(f.z * ci.getD());
 729
 730     // need to handle the case where the particle is exactly at the box hi
 731     if (ib == ci.getW() && periodic.x)
 732         ib = 0;
 733     if (jb == ci.getH() && periodic.y)
 734         jb = 0;
 735     if (kb == ci.getD() && periodic.z)
 736         kb = 0;
 737
 738     int my_cell = ci(ib,jb,kb);
 739
 740     // each thread will determine the neighborlist of a single particle
 741     // count number of neighbors found so far in n_neigh
 742     int n_neigh = 0;
 743
 744     // loop over all adjacent bins
 745     for (unsigned int cur_adj = 0; cur_adj < 27; cur_adj++)
 746         {
 747         int neigh_cell = tex2D(cell_adj_tex, cur_adj, my_cell);
 748         unsigned int size = tex1Dfetch(cell_size_tex, neigh_cell);
 749
 750         Scalar4 next_xyzf = tex2D(cell_xyzf_tex, 0, neigh_cell);
 751
 752         // now, we are set to loop through the array
 753         for (int cur_offset = 0; cur_offset < size; cur_offset++)
 754             {
 755             Scalar4 cur_xyzf = next_xyzf;
 756             next_xyzf = tex2D(cell_xyzf_tex, cur_offset+1, neigh_cell);
 757             Scalar4 cur_tdb = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
 758             if (filter_diameter || filter_body)
 759                 cur_tdb = tex2D(cell_tdb_tex, cur_offset, neigh_cell);
 760             unsigned int neigh_body = __scalar_as_int(cur_tdb.z);
 761             Scalar neigh_diameter = cur_tdb.y;
 762
 763             Scalar3 neigh_pos = make_scalar3(cur_xyzf.x,
 764                                            cur_xyzf.y,
 765                                            cur_xyzf.z);
 766             int cur_neigh = __scalar_as_int(cur_xyzf.w);
 767
 768             // compute the distance between the two particles
 769             Scalar3 dx = my_pos - neigh_pos;
 770
 771             // wrap the periodic boundary conditions
 772             dx = box.minImage(dx);
 773             // compute dr squared
 774             Scalar drsq = dot(dx,dx);
 775
 776
 777             bool excluded = (my_pidx == cur_neigh);
 778
 779             if (filter_body && my_body != 0xffffffff)
 780                 excluded = excluded | (my_body == neigh_body);
 781
 782             Scalar sqshift = Scalar(0.0);
 783             if (filter_diameter)
 784                 {
 785                 // compute the shift in radius to accept neighbors based on their diameters
 786                 Scalar delta = (my_diameter + neigh_diameter) * Scalar(0.5) - Scalar(1.0);
 787                 // r^2 < (r_max + delta)^2
 788                 // r^2 < r_maxsq + delta^2 + 2*r_max*delta
 789                 sqshift = (delta + Scalar(2.0) * r_max) * delta;
 790                 }
 791
 792             if (drsq <= (r_maxsq + sqshift) && !excluded)
 793                 {
 794                 if (n_neigh < nli.getH())
 795                     d_nlist[nli(my_pidx, n_neigh)] = cur_neigh;
 796                 else
 797                     n_neigh_needed = n_neigh+1;
 798
 799                 n_neigh++;
 800                 }
 801             }
 802         }
 803
 804     d_n_neigh[my_pidx] = n_neigh;
 805     d_last_updated_pos[my_pidx] = my_postype;
 806
 807     if (n_neigh_needed > 0)
 808         atomicMax(&d_conditions[0], n_neigh_needed);
 809     }
 810 #endif  // #ifdef SINGLE_PRECISION
 811
 812 cudaError_t gpu_compute_nlist_binned_1x(unsigned int *d_nlist,
 813                                         unsigned int *d_n_neigh,
 814                                         Scalar4 *d_last_updated_pos,
 815                                         unsigned int *d_conditions,
 816                                         const Index2D& nli,
 817                                         const Scalar4 *d_pos,
 818                                         const unsigned int *d_body,
 819                                         const Scalar *d_diameter,
 820                                         const unsigned int N,
 821                                         const unsigned int *d_cell_size,
 822                                         const cudaArray *dca_cell_xyzf,
 823                                         const cudaArray *dca_cell_tdb,
 824                                         const cudaArray *dca_cell_adj,
 825                                         const Index3D& ci,
 826                                         const BoxDim& box,
 827                                         const Scalar r_maxsq,
 828                                         const unsigned int block_size,
 829                                         bool filter_body,
 830                                         bool filter_diameter,
 831                                         const Scalar3& ghost_width)
 832     {
 833     // don't compile the 1x nlist kernel in double precision builds
 834     #ifdef SINGLE_PRECISION
 835     int n_blocks = (int)ceil(double(N)/double(block_size));
 836
 837     cudaError_t err = cudaBindTextureToArray(cell_adj_tex, dca_cell_adj);
 838     if (err != cudaSuccess)
 839         return err;
 840
 841     err = cudaBindTextureToArray(cell_xyzf_tex, dca_cell_xyzf);
 842     if (err != cudaSuccess)
 843         return err;
 844
 845     err = cudaBindTextureToArray(cell_tdb_tex, dca_cell_tdb);
 846     if (err != cudaSuccess)
 847         return err;
 848
 849     err = cudaBindTexture(0, cell_size_tex, d_cell_size, sizeof(unsigned int)*ci.getNumElements());
 850     if (err != cudaSuccess)
 851         return err;
 852
 853     if (!filter_diameter && !filter_body)
 854         {
 855         gpu_compute_nlist_binned_1x_kernel<0><<<n_blocks, block_size>>>(d_nlist,
 856                                                                         d_n_neigh,
 857                                                                         d_last_updated_pos,
 858                                                                         d_conditions,
 859                                                                         nli,
 860                                                                         d_pos,
 861                                                                         d_body,
 862                                                                         d_diameter,
 863                                                                         N,
 864                                                                         ci,
 865                                                                         box,
 866                                                                         r_maxsq,
 867                                                                         sqrtf(r_maxsq),
 868                                                                         ghost_width);
 869         }
 870     if (!filter_diameter && filter_body)
 871         {
 872         gpu_compute_nlist_binned_1x_kernel<1><<<n_blocks, block_size>>>(d_nlist,
 873                                                                         d_n_neigh,
 874                                                                         d_last_updated_pos,
 875                                                                         d_conditions,
 876                                                                         nli,
 877                                                                         d_pos,
 878                                                                         d_body,
 879                                                                         d_diameter,
 880                                                                         N,
 881                                                                         ci,
 882                                                                         box,
 883                                                                         r_maxsq,
 884                                                                         sqrtf(r_maxsq),
 885                                                                         ghost_width);
 886         }
 887     if (filter_diameter && !filter_body)
 888         {
 889         gpu_compute_nlist_binned_1x_kernel<2><<<n_blocks, block_size>>>(d_nlist,
 890                                                                         d_n_neigh,
 891                                                                         d_last_updated_pos,
 892                                                                         d_conditions,
 893                                                                         nli,
 894                                                                         d_pos,
 895                                                                         d_body,
 896                                                                         d_diameter,
 897                                                                         N,
 898                                                                         ci,
 899                                                                         box,
 900                                                                         r_maxsq,
 901                                                                         sqrtf(r_maxsq),
 902                                                                         ghost_width);
 903         }
 904     if (filter_diameter && filter_body)
 905         {
 906         gpu_compute_nlist_binned_1x_kernel<3><<<n_blocks, block_size>>>(d_nlist,
 907                                                                         d_n_neigh,
 908                                                                         d_last_updated_pos,
 909                                                                         d_conditions,
 910                                                                         nli,
 911                                                                         d_pos,
 912                                                                         d_body,
 913                                                                         d_diameter,
 914                                                                         N,
 915                                                                         ci,
 916                                                                         box,
 917                                                                         r_maxsq,
 918                                                                         sqrtf(r_maxsq),
 919                                                                         ghost_width );
 920         }
 921     #endif // #ifdef SINGLE_PRECISION
 922
 923     return cudaSuccess;
 924     }