Enable parallel tests.
[hoomd-blue.git] / libhoomd / computes_gpu / NeighborListGPUBinned.cu
blob4bf01638fa69bcd2153e448491ba01e6fe2b9ae6
1 /*
2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
4 the University of Michigan All rights reserved.
6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
7 copyright is held, by various Contributors who have granted The Regents of the
8 University of Michigan the right to modify and/or distribute such Contributions.
10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
11 and binary forms, provided you abide by the following conditions:
13 * Redistributions of source code must retain the above copyright notice, this
14 list of conditions, and the following disclaimer both in the code and
15 prominently in any materials provided with the distribution.
17 * Redistributions in binary form must reproduce the above copyright notice, this
18 list of conditions, and the following disclaimer in the documentation and/or
19 other materials provided with the distribution.
21 * All publications and presentations based on HOOMD-blue, including any reports
22 or published results obtained, in whole or in part, with HOOMD-blue, will
23 acknowledge its use according to the terms posted at the time of submission on:
24 http://codeblue.umich.edu/hoomd-blue/citations.html
26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
27 http://codeblue.umich.edu/hoomd-blue/
29 * Apart from the above required attributions, neither the name of the copyright
30 holder nor the names of HOOMD-blue's contributors may be used to endorse or
31 promote products derived from this software without specific prior written
32 permission.
34 Disclaimer
36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 // Maintainer: joaander
52 #include "NeighborListGPUBinned.cuh"
53 #include "TextureTools.h"
55 /*! \file NeighborListGPUBinned.cu
56     \brief Defines GPU kernel code for O(N) neighbor list generation on the GPU
59 //! Texture for reading d_cell_xyzf
60 scalar4_tex_t cell_xyzf_1d_tex;
62 //! Warp-centric scan
63 template<int NT>
64 struct warp_scan
65     {
66     #if __CUDA_ARCH__ >= 300
67     enum { capacity = 0 }; // uses no shared memory
68     #else
69     enum { capacity = NT > 1 ? (2 * NT + 1) : 1};
70     #endif
72     __device__ static int Scan(int tid, unsigned char x, volatile unsigned char *shared, unsigned char* total)
73         {
74         #if __CUDA_ARCH__ >= 300
75         // Kepler version
76         unsigned int laneid;
77         //This command gets the lane ID within the current warp
78         asm("mov.u32 %0, %%laneid;" : "=r"(laneid));
80         int first = laneid - tid;
82         #pragma unroll
83         for(int offset = 1; offset < NT; offset += offset)
84             {
85             int y = __shfl(x,(first + tid - offset) &(WARP_SIZE -1));
86             if(tid >= offset) x += y;
87             }
89         // all threads get the total from the last thread in the cta
90         *total = __shfl(x,first + NT - 1);
92         // shift by one (exclusive scan)
93         int y = __shfl(x,(first + tid - 1) &(WARP_SIZE-1));
94         x = tid ? y : 0;
96         #else // __CUDA_ARCH__ >= 300
98         shared[tid] = x;
99         int first = 0;
100         // no syncthreads here (inside warp)
102         for(int offset = 1; offset < NT; offset += offset)
103             {
104             if(tid >= offset)
105                 x = shared[first + tid - offset] + x;
106             first = NT - first;
107             shared[first + tid] = x;
108             // no syncthreads here (inside warp)
109             }
110         *total = shared[first + NT - 1];
112         // shift by one (exclusive scan)
113         x = tid ? shared[first + tid - 1] : 0;
114         #endif
115         // no syncthreads here (inside warp)
116         return x;
117         }
118     };
120 //! Kernel call for generating neighbor list on the GPU (shared memory version)
121 /*! \tparam flags Set bit 1 to enable body filtering. Set bit 2 to enable diameter filtering.
122     \param d_nlist Neighbor list data structure to write
123     \param d_n_neigh Number of neighbors to write
124     \param d_last_updated_pos Particle positions at this update are written to this array
125     \param d_conditions Conditions array for writing overflow condition
126     \param nli Indexer to access \a d_nlist
127     \param d_pos Particle positions
128     \param d_body Particle body indices
129     \param d_diameter Particle diameters
130     \param N Number of particles
131     \param d_cell_size Number of particles in each cell
132     \param d_cell_xyzf Cell contents (xyzf array from CellList with flag=type)
133     \param d_cell_tdb Cell contents (tdb array from CellList with)
134     \param d_cell_adj Cell adjacency list
135     \param ci Cell indexer for indexing cells
136     \param cli Cell list indexer for indexing into d_cell_xyzf
137     \param cadji Adjacent cell indexer listing the 27 neighboring cells
138     \param box Simulation box dimensions
139     \param r_maxsq The maximum radius for which to include particles as neighbors, squared
140     \param r_max The maximum radius for which to include particles as neighbors
141     \param ghost_width Width of ghost cell layer
143     \note optimized for Fermi
145 template<unsigned char flags, int threads_per_particle>
146 __global__ void gpu_compute_nlist_binned_shared_kernel(unsigned int *d_nlist,
147                                                     unsigned int *d_n_neigh,
148                                                     Scalar4 *d_last_updated_pos,
149                                                     unsigned int *d_conditions,
150                                                     const Index2D nli,
151                                                     const Scalar4 *d_pos,
152                                                     const unsigned int *d_body,
153                                                     const Scalar *d_diameter,
154                                                     const unsigned int N,
155                                                     const unsigned int *d_cell_size,
156                                                     const Scalar4 *d_cell_xyzf,
157                                                     const Scalar4 *d_cell_tdb,
158                                                     const unsigned int *d_cell_adj,
159                                                     const Index3D ci,
160                                                     const Index2D cli,
161                                                     const Index2D cadji,
162                                                     const BoxDim box,
163                                                     const Scalar r_maxsq,
164                                                     const Scalar r_max,
165                                                     const Scalar3 ghost_width)
166     {
167     bool filter_body = flags & 1;
168     bool filter_diameter = flags & 2;
170     // each set of threads_per_particle threads is going to compute the neighbor list for a single particle
171     int my_pidx;
172     if (gridDim.y > 1)
173         {
174         // fermi workaround
175         my_pidx = (blockIdx.x + blockIdx.y*65535) * (blockDim.x/threads_per_particle) + threadIdx.x/threads_per_particle;
176         }
177     else
178         {
179         my_pidx = blockIdx.x * (blockDim.x/threads_per_particle) + threadIdx.x/threads_per_particle;
180         }
182     // return early if out of bounds
183     if (my_pidx >= N) return;
185     // first, determine which bin this particle belongs to
186     Scalar4 my_postype = d_pos[my_pidx];
187     Scalar3 my_pos = make_scalar3(my_postype.x, my_postype.y, my_postype.z);
189     unsigned int my_body = d_body[my_pidx];
190     Scalar my_diameter = d_diameter[my_pidx];
192     Scalar3 f = box.makeFraction(my_pos, ghost_width);
194     // find the bin each particle belongs in
195     int ib = (int)(f.x * ci.getW());
196     int jb = (int)(f.y * ci.getH());
197     int kb = (int)(f.z * ci.getD());
199     uchar3 periodic = box.getPeriodic();
201     // need to handle the case where the particle is exactly at the box hi
202     if (ib == ci.getW() && periodic.x)
203         ib = 0;
204     if (jb == ci.getH() && periodic.y)
205         jb = 0;
206     if (kb == ci.getD() && periodic.z)
207         kb = 0;
209     int my_cell = ci(ib,jb,kb);
211     // shared memory (volatile is required, since we are doing warp-centric)
212     volatile extern __shared__ unsigned char sh[];
214     // index of current neighbor
215     unsigned int cur_adj = 0;
217     // current cell
218     unsigned int neigh_cell = d_cell_adj[cadji(cur_adj, my_cell)];
220     // size of current cell
221     unsigned int neigh_size = d_cell_size[neigh_cell];
223     // offset of cta in shared memory
224     int cta_offs = (threadIdx.x/threads_per_particle)*warp_scan<threads_per_particle>::capacity;
226     // current index in cell
227     int cur_offset = threadIdx.x % threads_per_particle;
229     bool done = false;
231     // total number of neighbors
232     unsigned int nneigh = 0;
234     while (! done)
235         {
236         // initalize with default
237         unsigned int neighbor;
238         unsigned char has_neighbor = 0;
240         // advance neighbor cell
241         while (cur_offset >= neigh_size && !done )
242             {
243             cur_offset -= neigh_size;
244             cur_adj++;
245             if (cur_adj < cadji.getW())
246                 {
247                 neigh_cell = d_cell_adj[cadji(cur_adj, my_cell)];
248                 neigh_size = d_cell_size[neigh_cell];
249                 }
250             else
251                 // we are past the end of the cell neighbors
252                 done = true;
253             }
255         // if the first thread in the cta has no work, terminate the loop
256         if (done && !(threadIdx.x % threads_per_particle)) break;
258         if (!done)
259             {
260             Scalar4 cur_xyzf = texFetchScalar4(d_cell_xyzf, cell_xyzf_1d_tex, cli(cur_offset, neigh_cell));
262             Scalar4 cur_tdb = make_scalar4(0, 0, 0, 0);
263             if (filter_diameter || filter_body)
264                 cur_tdb = d_cell_tdb[cli(cur_offset, neigh_cell)];
266             // advance cur_offset
267             cur_offset += threads_per_particle;
269             unsigned int neigh_body = __scalar_as_int(cur_tdb.z);
270             Scalar neigh_diameter = cur_tdb.y;
272             Scalar3 neigh_pos = make_scalar3(cur_xyzf.x,
273                                            cur_xyzf.y,
274                                            cur_xyzf.z);
275             int cur_neigh = __scalar_as_int(cur_xyzf.w);
277             // compute the distance between the two particles
278             Scalar3 dx = my_pos - neigh_pos;
280             // wrap the periodic boundary conditions
281             dx = box.minImage(dx);
283             // compute dr squared
284             Scalar drsq = dot(dx,dx);
286             bool excluded = (my_pidx == cur_neigh);
288             if (filter_body && my_body != 0xffffffff)
289                 excluded = excluded | (my_body == neigh_body);
291             Scalar sqshift(0.0);
292             if (filter_diameter)
293                 {
294                 // compute the shift in radius to accept neighbors based on their diameters
295                 Scalar delta = (my_diameter + neigh_diameter) * Scalar(0.5) - Scalar(1.0);
296                 // r^2 < (r_max + delta)^2
297                 // r^2 < r_maxsq + delta^2 + 2*r_max*delta
298                 sqshift = (delta + Scalar(2.0) * r_max) * delta;
299                 }
301             // store result in shared memory
302             if (drsq <= (r_maxsq + sqshift) && !excluded)
303                 {
304                 neighbor = cur_neigh;
305                 has_neighbor = 1;
306                 }
307             }
309         // no syncthreads here, we assume threads_per_particle < warp size
311         // scan over flags
312         unsigned char n;
313         int k = warp_scan<threads_per_particle>::Scan(threadIdx.x % threads_per_particle,
314             has_neighbor, &sh[cta_offs], &n);
316         if (has_neighbor && nneigh + k < nli.getH())
317             d_nlist[nli(my_pidx, nneigh + k)] = neighbor;
319         // increment total neighbor count
320         nneigh += n;
321         } // end while
323     if (threadIdx.x % threads_per_particle == 0)
324         {
325         // flag if we need to grow the neighbor list
326         if (nneigh >= nli.getH())
327             atomicMax(&d_conditions[0], nneigh);
329         d_n_neigh[my_pidx] = nneigh;
330         d_last_updated_pos[my_pidx] = my_postype;
331         }
332     }
334 //! determine maximum possible block size
335 template<typename T>
336 int get_max_block_size(T func)
337     {
338     cudaFuncAttributes attr;
339     cudaFuncGetAttributes(&attr, (const void*)func);
340     int max_threads = attr.maxThreadsPerBlock;
341     // number of threads has to be multiple of warp size
342     max_threads -= max_threads % max_threads_per_particle;
343     return max_threads;
344     }
346 void gpu_nlist_binned_bind_texture(const Scalar4 *d_cell_xyzf, unsigned int n_elements)
347     {
348     // bind the position texture
349     cell_xyzf_1d_tex.normalized = false;
350     cell_xyzf_1d_tex.filterMode = cudaFilterModePoint;
351     cudaBindTexture(0, cell_xyzf_1d_tex, d_cell_xyzf, sizeof(Scalar4)*n_elements);
352     }
354 //! recursive template to launch neighborlist with given template parameters
355 /* \tparam cur_tpp Number of threads per particle (assumed to be power of two) */
356 template<int cur_tpp>
357 inline void launcher(unsigned int *d_nlist,
358               unsigned int *d_n_neigh,
359               Scalar4 *d_last_updated_pos,
360               unsigned int *d_conditions,
361               const Index2D nli,
362               const Scalar4 *d_pos,
363               const unsigned int *d_body,
364               const Scalar *d_diameter,
365               const unsigned int N,
366               const unsigned int *d_cell_size,
367               const Scalar4 *d_cell_xyzf,
368               const Scalar4 *d_cell_tdb,
369               const unsigned int *d_cell_adj,
370               const Index3D ci,
371               const Index2D cli,
372               const Index2D cadji,
373               const BoxDim box,
374               const Scalar r_maxsq,
375               const Scalar r_max,
376               const Scalar3 ghost_width,
377               const unsigned int compute_capability,
378               unsigned int tpp,
379               bool filter_diameter,
380               bool filter_body,
381               unsigned int block_size)
382     {
383     unsigned int shared_size = 0;
385     if (tpp == cur_tpp && cur_tpp != 0)
386         {
387         if (!filter_diameter && !filter_body)
388             {
389             static unsigned int max_block_size = UINT_MAX;
390             if (max_block_size == UINT_MAX)
391                 max_block_size = get_max_block_size(gpu_compute_nlist_binned_shared_kernel<0,cur_tpp>);
392             if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements());
394             block_size = block_size < max_block_size ? block_size : max_block_size;
395             dim3 grid(N / (block_size/tpp) + 1);
396             if (compute_capability < 30 && grid.x > 65535)
397                 {
398                 grid.y = grid.x/65535 + 1;
399                 grid.x = 65535;
400                 }
402             if (compute_capability < 30) shared_size = warp_scan<cur_tpp>::capacity*sizeof(unsigned char)*(block_size/cur_tpp);
404             gpu_compute_nlist_binned_shared_kernel<0,cur_tpp><<<grid, block_size,shared_size>>>(d_nlist,
405                                                                              d_n_neigh,
406                                                                              d_last_updated_pos,
407                                                                              d_conditions,
408                                                                              nli,
409                                                                              d_pos,
410                                                                              d_body,
411                                                                              d_diameter,
412                                                                              N,
413                                                                              d_cell_size,
414                                                                              d_cell_xyzf,
415                                                                              d_cell_tdb,
416                                                                              d_cell_adj,
417                                                                              ci,
418                                                                              cli,
419                                                                              cadji,
420                                                                              box,
421                                                                              r_maxsq,
422                                                                              sqrtf(r_maxsq),
423                                                                              ghost_width);
424             }
425         else if (!filter_diameter && filter_body)
426             {
427             static unsigned int max_block_size = UINT_MAX;
428             if (max_block_size == UINT_MAX)
429                 max_block_size = get_max_block_size(gpu_compute_nlist_binned_shared_kernel<1,cur_tpp>);
430             if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements());
432             block_size = block_size < max_block_size ? block_size : max_block_size;
433             dim3 grid(N / (block_size/tpp) + 1);
434             if (compute_capability < 30 && grid.x > 65535)
435                 {
436                 grid.y = grid.x/65535 + 1;
437                 grid.x = 65535;
438                 }
440             if (compute_capability < 30) shared_size = warp_scan<cur_tpp>::capacity*sizeof(unsigned char)*(block_size/cur_tpp);
442             gpu_compute_nlist_binned_shared_kernel<1,cur_tpp><<<grid, block_size,shared_size>>>(d_nlist,
443                                                                              d_n_neigh,
444                                                                              d_last_updated_pos,
445                                                                              d_conditions,
446                                                                              nli,
447                                                                              d_pos,
448                                                                              d_body,
449                                                                              d_diameter,
450                                                                              N,
451                                                                              d_cell_size,
452                                                                              d_cell_xyzf,
453                                                                              d_cell_tdb,
454                                                                              d_cell_adj,
455                                                                              ci,
456                                                                              cli,
457                                                                              cadji,
458                                                                              box,
459                                                                              r_maxsq,
460                                                                              sqrtf(r_maxsq),
461                                                                              ghost_width);
462             }
463         else if (filter_diameter && !filter_body)
464             {
465             static unsigned int max_block_size = UINT_MAX;
466             if (max_block_size == UINT_MAX)
467                 max_block_size = get_max_block_size(gpu_compute_nlist_binned_shared_kernel<2,cur_tpp>);
468             if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements());
470             block_size = block_size < max_block_size ? block_size : max_block_size;
471             dim3 grid(N / (block_size/tpp) + 1);
472             if (compute_capability < 30 && grid.x > 65535)
473                 {
474                 grid.y = grid.x/65535 + 1;
475                 grid.x = 65535;
476                 }
478             if (compute_capability < 30) shared_size = warp_scan<cur_tpp>::capacity*sizeof(unsigned char)*(block_size/cur_tpp);
480             gpu_compute_nlist_binned_shared_kernel<2,cur_tpp><<<grid, block_size,shared_size>>>(d_nlist,
481                                                                              d_n_neigh,
482                                                                              d_last_updated_pos,
483                                                                              d_conditions,
484                                                                              nli,
485                                                                              d_pos,
486                                                                              d_body,
487                                                                              d_diameter,
488                                                                              N,
489                                                                              d_cell_size,
490                                                                              d_cell_xyzf,
491                                                                              d_cell_tdb,
492                                                                              d_cell_adj,
493                                                                              ci,
494                                                                              cli,
495                                                                              cadji,
496                                                                              box,
497                                                                              r_maxsq,
498                                                                              sqrtf(r_maxsq),
499                                                                              ghost_width);
500             }
501         else if (filter_diameter && filter_body)
502             {
503             static unsigned int max_block_size = UINT_MAX;
504             if (max_block_size == UINT_MAX)
505                 max_block_size = get_max_block_size(gpu_compute_nlist_binned_shared_kernel<3,cur_tpp>);
506             if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements());
508             block_size = block_size < max_block_size ? block_size : max_block_size;
509             dim3 grid(N / (block_size/tpp) + 1);
510             if (compute_capability < 30 && grid.x > 65535)
511                 {
512                 grid.y = grid.x/65535 + 1;
513                 grid.x = 65535;
514                 }
516             if (compute_capability < 30) shared_size = warp_scan<cur_tpp>::capacity*sizeof(unsigned char)*(block_size/cur_tpp);
518             gpu_compute_nlist_binned_shared_kernel<3,cur_tpp><<<grid, block_size,shared_size>>>(d_nlist,
519                                                                              d_n_neigh,
520                                                                              d_last_updated_pos,
521                                                                              d_conditions,
522                                                                              nli,
523                                                                              d_pos,
524                                                                              d_body,
525                                                                              d_diameter,
526                                                                              N,
527                                                                              d_cell_size,
528                                                                              d_cell_xyzf,
529                                                                              d_cell_tdb,
530                                                                              d_cell_adj,
531                                                                              ci,
532                                                                              cli,
533                                                                              cadji,
534                                                                              box,
535                                                                              r_maxsq,
536                                                                              sqrtf(r_maxsq),
537                                                                              ghost_width);
538             }
539         }
540     else
541         {
542         launcher<cur_tpp/2>(d_nlist,
543                      d_n_neigh,
544                      d_last_updated_pos,
545                      d_conditions,
546                      nli,
547                      d_pos,
548                      d_body,
549                      d_diameter,
550                      N,
551                      d_cell_size,
552                      d_cell_xyzf,
553                      d_cell_tdb,
554                      d_cell_adj,
555                      ci,
556                      cli,
557                      cadji,
558                      box,
559                      r_maxsq,
560                      sqrtf(r_maxsq),
561                      ghost_width,
562                      compute_capability,
563                      tpp,
564                      filter_diameter,
565                      filter_body,
566                      block_size
567                      );
568         }
569     }
571 //! template specialization to terminate recursion
572 template<>
573 inline void launcher<min_threads_per_particle/2>(unsigned int *d_nlist,
574               unsigned int *d_n_neigh,
575               Scalar4 *d_last_updated_pos,
576               unsigned int *d_conditions,
577               const Index2D nli,
578               const Scalar4 *d_pos,
579               const unsigned int *d_body,
580               const Scalar *d_diameter,
581               const unsigned int N,
582               const unsigned int *d_cell_size,
583               const Scalar4 *d_cell_xyzf,
584               const Scalar4 *d_cell_tdb,
585               const unsigned int *d_cell_adj,
586               const Index3D ci,
587               const Index2D cli,
588               const Index2D cadji,
589               const BoxDim box,
590               const Scalar r_maxsq,
591               const Scalar r_max,
592               const Scalar3 ghost_width,
593               const unsigned int compute_capability,
594               unsigned int tpp,
595               bool filter_diameter,
596               bool filter_body,
597               unsigned int block_size)
598     { }
600 cudaError_t gpu_compute_nlist_binned_shared(unsigned int *d_nlist,
601                                      unsigned int *d_n_neigh,
602                                      Scalar4 *d_last_updated_pos,
603                                      unsigned int *d_conditions,
604                                      const Index2D& nli,
605                                      const Scalar4 *d_pos,
606                                      const unsigned int *d_body,
607                                      const Scalar *d_diameter,
608                                      const unsigned int N,
609                                      const unsigned int *d_cell_size,
610                                      const Scalar4 *d_cell_xyzf,
611                                      const Scalar4 *d_cell_tdb,
612                                      const unsigned int *d_cell_adj,
613                                      const Index3D& ci,
614                                      const Index2D& cli,
615                                      const Index2D& cadji,
616                                      const BoxDim& box,
617                                      const Scalar r_maxsq,
618                                      const unsigned int threads_per_particle,
619                                      const unsigned int block_size,
620                                      bool filter_body,
621                                      bool filter_diameter,
622                                      const Scalar3& ghost_width,
623                                      const unsigned int compute_capability)
624     {
625     launcher<max_threads_per_particle>(d_nlist,
626                                    d_n_neigh,
627                                    d_last_updated_pos,
628                                    d_conditions,
629                                    nli,
630                                    d_pos,
631                                    d_body,
632                                    d_diameter,
633                                    N,
634                                    d_cell_size,
635                                    d_cell_xyzf,
636                                    d_cell_tdb,
637                                    d_cell_adj,
638                                    ci,
639                                    cli,
640                                    cadji,
641                                    box,
642                                    r_maxsq,
643                                    sqrtf(r_maxsq),
644                                    ghost_width,
645                                    compute_capability,
646                                    threads_per_particle,
647                                    filter_diameter,
648                                    filter_body,
649                                    block_size
650                                    );
652     return cudaSuccess;
653     }
655 // don't compile the 1x nlist kernel in double precision builds
656 #ifdef SINGLE_PRECISION
657 //! Texture for reading d_cell_adj
658 texture<unsigned int, 2, cudaReadModeElementType> cell_adj_tex;
659 //! Texture for reading d_cell_size
660 texture<unsigned int, 1, cudaReadModeElementType> cell_size_tex;
661 //! Texture for reading d_cell_xyzf
662 texture<Scalar4, 2, cudaReadModeElementType> cell_xyzf_tex;
663 //! Texture for reading d_cell_tdb
664 texture<Scalar4, 2, cudaReadModeElementType> cell_tdb_tex;
666 //! Kernel call for generating neighbor list on the GPU
667 /*! \tparam filter_flags Set bit 1 to enable body filtering. Set bit 2 to enable diameter filtering.
668     \param d_nlist Neighbor list data structure to write
669     \param d_n_neigh Number of neighbors to write
670     \param d_last_updated_pos Particle positions at this update are written to this array
671     \param d_conditions Conditions array for writing overflow condition
672     \param nli Indexer to access \a d_nlist
673     \param d_pos Particle positions
674     \param d_body Particle body indices
675     \param d_diameter Particle diameters
676     \param N Number of particles
677     \param ci Cell indexer for indexing cells
678     \param box Simulation box dimensions
679     \param r_maxsq The maximum radius for which to include particles as neighbors, squared
680     \param r_max The maximum radius for which to include particles as neighbors
681     \param ghost_width Width of ghost cell layer
683     \note optimized for compute 1.x devices
685 template<unsigned char filter_flags>
686 __global__ void gpu_compute_nlist_binned_1x_kernel(unsigned int *d_nlist,
687                                                    unsigned int *d_n_neigh,
688                                                    Scalar4 *d_last_updated_pos,
689                                                    unsigned int *d_conditions,
690                                                    const Index2D nli,
691                                                    const Scalar4 *d_pos,
692                                                    const unsigned int *d_body,
693                                                    const Scalar *d_diameter,
694                                                    const unsigned int N,
695                                                    const Index3D ci,
696                                                    const BoxDim box,
697                                                    const float r_maxsq,
698                                                    const float r_max,
699                                                    const Scalar3 ghost_width)
700     {
701     bool filter_body = filter_flags & 1;
702     bool filter_diameter = filter_flags & 2;
704     // each thread is going to compute the neighbor list for a single particle
705     int my_pidx = blockDim.x * blockIdx.x + threadIdx.x;
707     // count the number of neighbors needed
708     unsigned int n_neigh_needed = 0;
710     // quit early if we are past the end of the array
711     if (my_pidx >= N)
712         return;
714     // first, determine which bin this particle belongs to
715     Scalar4 my_postype = d_pos[my_pidx];
716     Scalar3 my_pos = make_scalar3(my_postype.x, my_postype.y, my_postype.z);
718     unsigned int my_body = d_body[my_pidx];
719     Scalar my_diameter = d_diameter[my_pidx];
721     // get periodic flags
722     uchar3 periodic = box.getPeriodic();
724     // find the bin each particle belongs in
725     Scalar3 f = box.makeFraction(my_pos,ghost_width);
726     unsigned int ib = (unsigned int)(f.x * ci.getW());
727     unsigned int jb = (unsigned int)(f.y * ci.getH());
728     unsigned int kb = (unsigned int)(f.z * ci.getD());
730     // need to handle the case where the particle is exactly at the box hi
731     if (ib == ci.getW() && periodic.x)
732         ib = 0;
733     if (jb == ci.getH() && periodic.y)
734         jb = 0;
735     if (kb == ci.getD() && periodic.z)
736         kb = 0;
738     int my_cell = ci(ib,jb,kb);
740     // each thread will determine the neighborlist of a single particle
741     // count number of neighbors found so far in n_neigh
742     int n_neigh = 0;
744     // loop over all adjacent bins
745     for (unsigned int cur_adj = 0; cur_adj < 27; cur_adj++)
746         {
747         int neigh_cell = tex2D(cell_adj_tex, cur_adj, my_cell);
748         unsigned int size = tex1Dfetch(cell_size_tex, neigh_cell);
750         Scalar4 next_xyzf = tex2D(cell_xyzf_tex, 0, neigh_cell);
752         // now, we are set to loop through the array
753         for (int cur_offset = 0; cur_offset < size; cur_offset++)
754             {
755             Scalar4 cur_xyzf = next_xyzf;
756             next_xyzf = tex2D(cell_xyzf_tex, cur_offset+1, neigh_cell);
757             Scalar4 cur_tdb = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
758             if (filter_diameter || filter_body)
759                 cur_tdb = tex2D(cell_tdb_tex, cur_offset, neigh_cell);
760             unsigned int neigh_body = __scalar_as_int(cur_tdb.z);
761             Scalar neigh_diameter = cur_tdb.y;
763             Scalar3 neigh_pos = make_scalar3(cur_xyzf.x,
764                                            cur_xyzf.y,
765                                            cur_xyzf.z);
766             int cur_neigh = __scalar_as_int(cur_xyzf.w);
768             // compute the distance between the two particles
769             Scalar3 dx = my_pos - neigh_pos;
771             // wrap the periodic boundary conditions
772             dx = box.minImage(dx);
773             // compute dr squared
774             Scalar drsq = dot(dx,dx);
777             bool excluded = (my_pidx == cur_neigh);
779             if (filter_body && my_body != 0xffffffff)
780                 excluded = excluded | (my_body == neigh_body);
782             Scalar sqshift = Scalar(0.0);
783             if (filter_diameter)
784                 {
785                 // compute the shift in radius to accept neighbors based on their diameters
786                 Scalar delta = (my_diameter + neigh_diameter) * Scalar(0.5) - Scalar(1.0);
787                 // r^2 < (r_max + delta)^2
788                 // r^2 < r_maxsq + delta^2 + 2*r_max*delta
789                 sqshift = (delta + Scalar(2.0) * r_max) * delta;
790                 }
792             if (drsq <= (r_maxsq + sqshift) && !excluded)
793                 {
794                 if (n_neigh < nli.getH())
795                     d_nlist[nli(my_pidx, n_neigh)] = cur_neigh;
796                 else
797                     n_neigh_needed = n_neigh+1;
799                 n_neigh++;
800                 }
801             }
802         }
804     d_n_neigh[my_pidx] = n_neigh;
805     d_last_updated_pos[my_pidx] = my_postype;
807     if (n_neigh_needed > 0)
808         atomicMax(&d_conditions[0], n_neigh_needed);
809     }
810 #endif  // #ifdef SINGLE_PRECISION
812 cudaError_t gpu_compute_nlist_binned_1x(unsigned int *d_nlist,
813                                         unsigned int *d_n_neigh,
814                                         Scalar4 *d_last_updated_pos,
815                                         unsigned int *d_conditions,
816                                         const Index2D& nli,
817                                         const Scalar4 *d_pos,
818                                         const unsigned int *d_body,
819                                         const Scalar *d_diameter,
820                                         const unsigned int N,
821                                         const unsigned int *d_cell_size,
822                                         const cudaArray *dca_cell_xyzf,
823                                         const cudaArray *dca_cell_tdb,
824                                         const cudaArray *dca_cell_adj,
825                                         const Index3D& ci,
826                                         const BoxDim& box,
827                                         const Scalar r_maxsq,
828                                         const unsigned int block_size,
829                                         bool filter_body,
830                                         bool filter_diameter,
831                                         const Scalar3& ghost_width)
832     {
833     // don't compile the 1x nlist kernel in double precision builds
834     #ifdef SINGLE_PRECISION
835     int n_blocks = (int)ceil(double(N)/double(block_size));
837     cudaError_t err = cudaBindTextureToArray(cell_adj_tex, dca_cell_adj);
838     if (err != cudaSuccess)
839         return err;
841     err = cudaBindTextureToArray(cell_xyzf_tex, dca_cell_xyzf);
842     if (err != cudaSuccess)
843         return err;
845     err = cudaBindTextureToArray(cell_tdb_tex, dca_cell_tdb);
846     if (err != cudaSuccess)
847         return err;
849     err = cudaBindTexture(0, cell_size_tex, d_cell_size, sizeof(unsigned int)*ci.getNumElements());
850     if (err != cudaSuccess)
851         return err;
853     if (!filter_diameter && !filter_body)
854         {
855         gpu_compute_nlist_binned_1x_kernel<0><<<n_blocks, block_size>>>(d_nlist,
856                                                                         d_n_neigh,
857                                                                         d_last_updated_pos,
858                                                                         d_conditions,
859                                                                         nli,
860                                                                         d_pos,
861                                                                         d_body,
862                                                                         d_diameter,
863                                                                         N,
864                                                                         ci,
865                                                                         box,
866                                                                         r_maxsq,
867                                                                         sqrtf(r_maxsq),
868                                                                         ghost_width);
869         }
870     if (!filter_diameter && filter_body)
871         {
872         gpu_compute_nlist_binned_1x_kernel<1><<<n_blocks, block_size>>>(d_nlist,
873                                                                         d_n_neigh,
874                                                                         d_last_updated_pos,
875                                                                         d_conditions,
876                                                                         nli,
877                                                                         d_pos,
878                                                                         d_body,
879                                                                         d_diameter,
880                                                                         N,
881                                                                         ci,
882                                                                         box,
883                                                                         r_maxsq,
884                                                                         sqrtf(r_maxsq),
885                                                                         ghost_width);
886         }
887     if (filter_diameter && !filter_body)
888         {
889         gpu_compute_nlist_binned_1x_kernel<2><<<n_blocks, block_size>>>(d_nlist,
890                                                                         d_n_neigh,
891                                                                         d_last_updated_pos,
892                                                                         d_conditions,
893                                                                         nli,
894                                                                         d_pos,
895                                                                         d_body,
896                                                                         d_diameter,
897                                                                         N,
898                                                                         ci,
899                                                                         box,
900                                                                         r_maxsq,
901                                                                         sqrtf(r_maxsq),
902                                                                         ghost_width);
903         }
904     if (filter_diameter && filter_body)
905         {
906         gpu_compute_nlist_binned_1x_kernel<3><<<n_blocks, block_size>>>(d_nlist,
907                                                                         d_n_neigh,
908                                                                         d_last_updated_pos,
909                                                                         d_conditions,
910                                                                         nli,
911                                                                         d_pos,
912                                                                         d_body,
913                                                                         d_diameter,
914                                                                         N,
915                                                                         ci,
916                                                                         box,
917                                                                         r_maxsq,
918                                                                         sqrtf(r_maxsq),
919                                                                         ghost_width );
920         }
921     #endif // #ifdef SINGLE_PRECISION
923     return cudaSuccess;
924     }