/*
Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
(HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
the University of Michigan All rights reserved.

HOOMD-blue may contain modifications ("Contributions") provided, and to which
copyright is held, by various Contributors who have granted The Regents of the
University of Michigan the right to modify and/or distribute such Contributions.

You may redistribute, use, and create derivative works of HOOMD-blue, in source
and binary forms, provided you abide by the following conditions:
* Redistributions of source code must retain the above copyright notice, this
list of conditions, and the following disclaimer both in the code and
prominently in any materials provided with the distribution.

* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions, and the following disclaimer in the documentation and/or
other materials provided with the distribution.

* All publications and presentations based on HOOMD-blue, including any reports
or published results obtained, in whole or in part, with HOOMD-blue, will
acknowledge its use according to the terms posted at the time of submission on:
http://codeblue.umich.edu/hoomd-blue/citations.html

* Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
http://codeblue.umich.edu/hoomd-blue/
* Apart from the above required attributions, neither the name of the copyright
holder nor the names of HOOMD-blue's contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.

IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Maintainer: joaander

#include "NeighborListGPUBinned.cuh"
#include "TextureTools.h"

/*! \file NeighborListGPUBinned.cu
    \brief Defines GPU kernel code for O(N) neighbor list generation on the GPU
*/
//! Texture for reading d_cell_xyzf
scalar4_tex_t cell_xyzf_1d_tex;

//! Warp-centric exclusive scan used to compact the neighbor flags found by one particle group
template<int NT>
struct warp_scan
    {
#if __CUDA_ARCH__ >= 300
    enum { capacity = 0 }; // uses no shared memory (shuffle based)
#else
    enum { capacity = NT > 1 ? (2 * NT + 1) : 1}; // number of shared memory slots needed per group
#endif
    //! Exclusive prefix sum of the per-thread counts within one group of NT threads
    __device__ static int Scan(int tid, unsigned char x, volatile unsigned char *shared, unsigned char *total)
        {
#if __CUDA_ARCH__ >= 300
        unsigned int laneid;
        // this command gets the lane ID within the current warp
        asm("mov.u32 %0, %%laneid;" : "=r"(laneid));

        int first = laneid - tid;

        for (int offset = 1; offset < NT; offset += offset)
            {
            int y = __shfl(x, (first + tid - offset) & (WARP_SIZE - 1));
            if (tid >= offset) x += y;
            }

        // all threads get the total from the last thread in the cta
        *total = __shfl(x, first + NT - 1);

        // shift by one (exclusive scan)
        int y = __shfl(x, (first + tid - 1) & (WARP_SIZE - 1));

        return (tid ? y : 0);
#else // __CUDA_ARCH__ >= 300
        shared[tid] = x;
        int first = 0;
        // no syncthreads here (inside warp)

        for (int offset = 1; offset < NT; offset += offset)
            {
            if (tid >= offset)
                x = shared[first + tid - offset] + x;
            first = NT - first;
            shared[first + tid] = x;
            // no syncthreads here (inside warp)
            }

        *total = shared[first + NT - 1];

        // shift by one (exclusive scan)
        x = tid ? shared[first + tid - 1] : 0;

        // no syncthreads here (inside warp)
        return x;
#endif
        }
    };
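
/*! \par Illustration (editor's sketch, not part of the original kernels)
    warp_scan<NT>::Scan() computes, for one group of NT cooperating threads, an exclusive
    prefix sum of the per-thread flags plus the group total. A minimal host-side reference
    of the same computation, using the hypothetical name reference_exclusive_scan:

    \code
    // For flags = {1,0,1,1} and NT = 4 this fills offsets = {0,1,1,2} and returns 3 --
    // exactly the write offsets and the per-pass neighbor count used by the kernel below.
    static int reference_exclusive_scan(const unsigned char *flags, int *offsets, int NT)
        {
        int running = 0;
        for (int tid = 0; tid < NT; ++tid)
            {
            offsets[tid] = running;   // exclusive: does not include flags[tid]
            running += flags[tid];
            }
        return running;               // total number of set flags in the group
        }
    \endcode
*/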
//! Kernel call for generating the neighbor list on the GPU (shared memory version)
/*! \tparam flags Set bit 1 to enable body filtering. Set bit 2 to enable diameter filtering.
    \param d_nlist Neighbor list data structure to write
    \param d_n_neigh Number of neighbors to write
    \param d_last_updated_pos Particle positions at this update are written to this array
    \param d_conditions Conditions array for writing overflow condition
    \param nli Indexer to access \a d_nlist
    \param d_pos Particle positions
    \param d_body Particle body indices
    \param d_diameter Particle diameters
    \param N Number of particles
    \param d_cell_size Number of particles in each cell
    \param d_cell_xyzf Cell contents (xyzf array from CellList with flag=type)
    \param d_cell_tdb Cell contents (tdb array from CellList containing type, diameter, and body)
    \param d_cell_adj Cell adjacency list
    \param ci Cell indexer for indexing cells
    \param cli Cell list indexer for indexing into d_cell_xyzf
    \param cadji Adjacent cell indexer listing the 27 neighboring cells
    \param box Simulation box dimensions
    \param r_maxsq The maximum radius for which to include particles as neighbors, squared
    \param r_max The maximum radius for which to include particles as neighbors
    \param ghost_width Width of ghost cell layer

    \note optimized for Fermi
*/
template<unsigned char flags, int threads_per_particle>
__global__ void gpu_compute_nlist_binned_shared_kernel(unsigned int *d_nlist,
                                                       unsigned int *d_n_neigh,
                                                       Scalar4 *d_last_updated_pos,
                                                       unsigned int *d_conditions,
                                                       const Index2D nli,
                                                       const Scalar4 *d_pos,
                                                       const unsigned int *d_body,
                                                       const Scalar *d_diameter,
                                                       const unsigned int N,
                                                       const unsigned int *d_cell_size,
                                                       const Scalar4 *d_cell_xyzf,
                                                       const Scalar4 *d_cell_tdb,
                                                       const unsigned int *d_cell_adj,
                                                       const Index3D ci,
                                                       const Index2D cli,
                                                       const Index2D cadji,
                                                       const BoxDim box,
                                                       const Scalar r_maxsq,
                                                       const Scalar r_max,
                                                       const Scalar3 ghost_width)
    {
    bool filter_body = flags & 1;
    bool filter_diameter = flags & 2;

    // each set of threads_per_particle threads is going to compute the neighbor list for a single particle
    int my_pidx;
    if (gridDim.y > 1)
        {
        // fermi workaround: fold the second grid dimension back into a linear block index
        my_pidx = (blockIdx.x + blockIdx.y*65535) * (blockDim.x/threads_per_particle) + threadIdx.x/threads_per_particle;
        }
    else
        {
        my_pidx = blockIdx.x * (blockDim.x/threads_per_particle) + threadIdx.x/threads_per_particle;
        }

    // return early if out of bounds
    if (my_pidx >= N) return;
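
    // [Illustration, assuming a hypothetical launch with blockDim.x = 128 and threads_per_particle = 4]
    // each block covers 128/4 = 32 particles: threads 0-3 cooperate on the block's first particle,
    // threads 4-7 on the second, and so on. The blockIdx.y*65535 term only matters for the
    // pre-compute-3.0 workaround that folds grids wider than 65535 blocks into a second dimension.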
    // first, determine which bin this particle belongs to
    Scalar4 my_postype = d_pos[my_pidx];
    Scalar3 my_pos = make_scalar3(my_postype.x, my_postype.y, my_postype.z);

    unsigned int my_body = d_body[my_pidx];
    Scalar my_diameter = d_diameter[my_pidx];

    Scalar3 f = box.makeFraction(my_pos, ghost_width);

    // find the bin each particle belongs in
    int ib = (int)(f.x * ci.getW());
    int jb = (int)(f.y * ci.getH());
    int kb = (int)(f.z * ci.getD());

    uchar3 periodic = box.getPeriodic();

    // need to handle the case where the particle is exactly at the box hi
    if (ib == ci.getW() && periodic.x)
        ib = 0;
    if (jb == ci.getH() && periodic.y)
        jb = 0;
    if (kb == ci.getD() && periodic.z)
        kb = 0;

    int my_cell = ci(ib,jb,kb);
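
    // [Illustration, assuming a hypothetical box spanning x in [-5,5), no ghost layer, and ci.getW() = 4]
    // a particle at x = 2.5 gives f.x = (2.5 + 5)/10 = 0.75 and ib = (int)(0.75*4) = 3; a particle
    // sitting exactly at the box hi gives f.x = 1.0 and ib = 4, which the periodic wrap above folds
    // back into bin 0 so the cell indexer is never read out of range.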
    // shared memory (volatile is required, since we are doing warp-centric)
    volatile extern __shared__ unsigned char sh[];

    // index of current neighbor
    unsigned int cur_adj = 0;

    // current neighboring cell
    unsigned int neigh_cell = d_cell_adj[cadji(cur_adj, my_cell)];

    // size of current cell
    unsigned int neigh_size = d_cell_size[neigh_cell];

    // offset of cta in shared memory
    int cta_offs = (threadIdx.x/threads_per_particle)*warp_scan<threads_per_particle>::capacity;

    // current index in cell
    int cur_offset = threadIdx.x % threads_per_particle;

    // set to true when the end of the adjacent cells has been reached
    bool done = false;

    // total number of neighbors
    unsigned int nneigh = 0;

    // loop over all candidate neighbors in the adjacent cells
    while (! done)
        {
        // initialize with default
        unsigned int neighbor;
        unsigned char has_neighbor = 0;

        // advance neighbor cell
        while (cur_offset >= neigh_size && !done)
            {
            cur_offset -= neigh_size;
            cur_adj++;
            if (cur_adj < cadji.getW())
                {
                neigh_cell = d_cell_adj[cadji(cur_adj, my_cell)];
                neigh_size = d_cell_size[neigh_cell];
                }
            else
                // we are past the end of the cell neighbors
                done = true;
            }

        // if the first thread in the cta has no work, terminate the loop
        if (done && !(threadIdx.x % threads_per_particle)) break;
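
        // [Illustration, assuming threads_per_particle = 4] the four threads of a group start at
        // cur_offset = 0,1,2,3 and advance by threads_per_particle each pass, so together they sweep
        // the contents of the adjacent cells as one concatenated list; whenever cur_offset runs past
        // the current cell's size, the loop above carries the remainder over into the next adjacent cell.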
        // check for a neighbor if this thread still has work to do
        if (!done)
            {
            Scalar4 cur_xyzf = texFetchScalar4(d_cell_xyzf, cell_xyzf_1d_tex, cli(cur_offset, neigh_cell));

            Scalar4 cur_tdb = make_scalar4(0, 0, 0, 0);
            if (filter_diameter || filter_body)
                cur_tdb = d_cell_tdb[cli(cur_offset, neigh_cell)];

            // advance cur_offset
            cur_offset += threads_per_particle;

            unsigned int neigh_body = __scalar_as_int(cur_tdb.z);
            Scalar neigh_diameter = cur_tdb.y;

            Scalar3 neigh_pos = make_scalar3(cur_xyzf.x,
                                             cur_xyzf.y,
                                             cur_xyzf.z);
            int cur_neigh = __scalar_as_int(cur_xyzf.w);

            // compute the distance between the two particles
            Scalar3 dx = my_pos - neigh_pos;

            // wrap the periodic boundary conditions
            dx = box.minImage(dx);

            // compute dr squared
            Scalar drsq = dot(dx,dx);

            bool excluded = (my_pidx == cur_neigh);

            if (filter_body && my_body != 0xffffffff)
                excluded = excluded | (my_body == neigh_body);

            Scalar sqshift = Scalar(0.0);
            if (filter_diameter)
                {
                // compute the shift in radius to accept neighbors based on their diameters
                Scalar delta = (my_diameter + neigh_diameter) * Scalar(0.5) - Scalar(1.0);

                // r^2 < (r_max + delta)^2
                // r^2 < r_maxsq + delta^2 + 2*r_max*delta
                sqshift = (delta + Scalar(2.0) * r_max) * delta;
                }
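
            // [Illustration] the acceptance test below is r^2 <= r_maxsq + sqshift with
            // sqshift = delta^2 + 2*r_max*delta, which is exactly (r_max + delta)^2 expanded;
            // for a hypothetical pair with diameters 2.0 and 1.0, delta = 0.5 and the effective
            // cutoff grows from r_max to r_max + 0.5.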
            // store result in shared memory
            if (drsq <= (r_maxsq + sqshift) && !excluded)
                {
                neighbor = cur_neigh;
                has_neighbor = 1;
                }
            }

        // no syncthreads here, we assume threads_per_particle < warp size
        unsigned char n;
        int k = warp_scan<threads_per_particle>::Scan(threadIdx.x % threads_per_particle,
                                                      has_neighbor, &sh[cta_offs], &n);

        if (has_neighbor && nneigh + k < nli.getH())
            d_nlist[nli(my_pidx, nneigh + k)] = neighbor;

        // increment total neighbor count
        nneigh += n;
        }

    if (threadIdx.x % threads_per_particle == 0)
        {
        // flag if we need to grow the neighbor list
        if (nneigh >= nli.getH())
            atomicMax(&d_conditions[0], nneigh);

        d_n_neigh[my_pidx] = nneigh;
        d_last_updated_pos[my_pidx] = my_postype;
        }
    }
//! determine maximum possible block size
template<typename T>
int get_max_block_size(T func)
    {
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, (const void*)func);
    int max_threads = attr.maxThreadsPerBlock;
    // number of threads has to be a multiple of the warp size
    max_threads -= max_threads % max_threads_per_particle;
    return max_threads;
    }
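
/*! \par Illustration (editor's sketch, not part of the original code)
    A minimal sketch of how get_max_block_size() might be used when choosing a launch
    configuration; the variable names and the <0,4> template arguments are hypothetical.

    \code
    int kernel_max = get_max_block_size(gpu_compute_nlist_binned_shared_kernel<0,4>);
    unsigned int run_block_size = requested_block_size < (unsigned int)kernel_max ?
                                  requested_block_size : (unsigned int)kernel_max;
    \endcode
*/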
//! Bind the cell list position data to the 1D texture used on pre-sm_35 devices
void gpu_nlist_binned_bind_texture(const Scalar4 *d_cell_xyzf, unsigned int n_elements)
    {
    // bind the position texture
    cell_xyzf_1d_tex.normalized = false;
    cell_xyzf_1d_tex.filterMode = cudaFilterModePoint;
    cudaBindTexture(0, cell_xyzf_1d_tex, d_cell_xyzf, sizeof(Scalar4)*n_elements);
    }
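
/*! \par Illustration (editor's sketch, not part of the original code)
    The launcher below maps the runtime threads_per_particle value onto a compile-time template
    parameter by recursing from max_threads_per_particle down through the powers of two and only
    launching when the runtime value matches. The same pattern in a minimal, self-contained form
    with hypothetical names:

    \code
    template<int cur_tpp>
    inline void dispatch(unsigned int tpp)
        {
        if (tpp == cur_tpp)
            {
            // launch the kernel instantiated with <cur_tpp> threads per particle here
            }
        else
            {
            dispatch<cur_tpp/2>(tpp); // try the next smaller power of two
            }
        }

    //! terminate the template recursion
    template<>
    inline void dispatch<0>(unsigned int tpp)
        {
        }

    // dispatch<32>(tpp) then tries 32, 16, 8, ... until the runtime value matches
    \endcode
*/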
//! recursive template to launch the neighbor list kernel with the given template parameters
/*! \tparam cur_tpp Number of threads per particle (assumed to be a power of two) */
template<int cur_tpp>
inline void launcher(unsigned int *d_nlist,
                     unsigned int *d_n_neigh,
                     Scalar4 *d_last_updated_pos,
                     unsigned int *d_conditions,
                     const Index2D& nli,
                     const Scalar4 *d_pos,
                     const unsigned int *d_body,
                     const Scalar *d_diameter,
                     const unsigned int N,
                     const unsigned int *d_cell_size,
                     const Scalar4 *d_cell_xyzf,
                     const Scalar4 *d_cell_tdb,
                     const unsigned int *d_cell_adj,
                     const Index3D& ci,
                     const Index2D& cli,
                     const Index2D& cadji,
                     const BoxDim& box,
                     const Scalar r_maxsq,
                     const Scalar r_max,
                     const Scalar3 ghost_width,
                     const unsigned int compute_capability,
                     bool filter_body,
                     bool filter_diameter,
                     unsigned int tpp,
                     unsigned int block_size)
    {
    unsigned int shared_size = 0;
    if (tpp == cur_tpp && cur_tpp != 0)
        if (!filter_diameter && !filter_body)
            static unsigned int max_block_size = UINT_MAX;
            if (max_block_size == UINT_MAX)
                max_block_size = get_max_block_size(gpu_compute_nlist_binned_shared_kernel<0,cur_tpp>);
            if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements());

            block_size = block_size < max_block_size ? block_size : max_block_size;
            dim3 grid(N / (block_size/tpp) + 1);
            if (compute_capability < 30 && grid.x > 65535)
                grid.y = grid.x/65535 + 1;

            if (compute_capability < 30) shared_size = warp_scan<cur_tpp>::capacity*sizeof(unsigned char)*(block_size/cur_tpp);

            gpu_compute_nlist_binned_shared_kernel<0,cur_tpp><<<grid, block_size,shared_size>>>(d_nlist,
        else if (!filter_diameter && filter_body)
            static unsigned int max_block_size = UINT_MAX;
            if (max_block_size == UINT_MAX)
                max_block_size = get_max_block_size(gpu_compute_nlist_binned_shared_kernel<1,cur_tpp>);
            if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements());

            block_size = block_size < max_block_size ? block_size : max_block_size;
            dim3 grid(N / (block_size/tpp) + 1);
            if (compute_capability < 30 && grid.x > 65535)
                grid.y = grid.x/65535 + 1;

            if (compute_capability < 30) shared_size = warp_scan<cur_tpp>::capacity*sizeof(unsigned char)*(block_size/cur_tpp);

            gpu_compute_nlist_binned_shared_kernel<1,cur_tpp><<<grid, block_size,shared_size>>>(d_nlist,
        else if (filter_diameter && !filter_body)
            static unsigned int max_block_size = UINT_MAX;
            if (max_block_size == UINT_MAX)
                max_block_size = get_max_block_size(gpu_compute_nlist_binned_shared_kernel<2,cur_tpp>);
            if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements());

            block_size = block_size < max_block_size ? block_size : max_block_size;
            dim3 grid(N / (block_size/tpp) + 1);
            if (compute_capability < 30 && grid.x > 65535)
                grid.y = grid.x/65535 + 1;

            if (compute_capability < 30) shared_size = warp_scan<cur_tpp>::capacity*sizeof(unsigned char)*(block_size/cur_tpp);

            gpu_compute_nlist_binned_shared_kernel<2,cur_tpp><<<grid, block_size,shared_size>>>(d_nlist,
        else if (filter_diameter && filter_body)
            static unsigned int max_block_size = UINT_MAX;
            if (max_block_size == UINT_MAX)
                max_block_size = get_max_block_size(gpu_compute_nlist_binned_shared_kernel<3,cur_tpp>);
            if (compute_capability < 35) gpu_nlist_binned_bind_texture(d_cell_xyzf, cli.getNumElements());

            block_size = block_size < max_block_size ? block_size : max_block_size;
            dim3 grid(N / (block_size/tpp) + 1);
            if (compute_capability < 30 && grid.x > 65535)
                grid.y = grid.x/65535 + 1;

            if (compute_capability < 30) shared_size = warp_scan<cur_tpp>::capacity*sizeof(unsigned char)*(block_size/cur_tpp);

            gpu_compute_nlist_binned_shared_kernel<3,cur_tpp><<<grid, block_size,shared_size>>>(d_nlist,
        launcher<cur_tpp/2>(d_nlist,
//! template specialization to terminate recursion
template<>
inline void launcher<min_threads_per_particle/2>(unsigned int *d_nlist,
                                                 unsigned int *d_n_neigh,
                                                 Scalar4 *d_last_updated_pos,
                                                 unsigned int *d_conditions,
                                                 const Index2D& nli,
                                                 const Scalar4 *d_pos,
                                                 const unsigned int *d_body,
                                                 const Scalar *d_diameter,
                                                 const unsigned int N,
                                                 const unsigned int *d_cell_size,
                                                 const Scalar4 *d_cell_xyzf,
                                                 const Scalar4 *d_cell_tdb,
                                                 const unsigned int *d_cell_adj,
                                                 const Index3D& ci,
                                                 const Index2D& cli,
                                                 const Index2D& cadji,
                                                 const BoxDim& box,
                                                 const Scalar r_maxsq,
                                                 const Scalar r_max,
                                                 const Scalar3 ghost_width,
                                                 const unsigned int compute_capability,
                                                 bool filter_body,
                                                 bool filter_diameter,
                                                 unsigned int tpp,
                                                 unsigned int block_size)
    {
    }
cudaError_t gpu_compute_nlist_binned_shared(unsigned int *d_nlist,
                                            unsigned int *d_n_neigh,
                                            Scalar4 *d_last_updated_pos,
                                            unsigned int *d_conditions,
                                            const Index2D& nli,
                                            const Scalar4 *d_pos,
                                            const unsigned int *d_body,
                                            const Scalar *d_diameter,
                                            const unsigned int N,
                                            const unsigned int *d_cell_size,
                                            const Scalar4 *d_cell_xyzf,
                                            const Scalar4 *d_cell_tdb,
                                            const unsigned int *d_cell_adj,
                                            const Index3D& ci,
                                            const Index2D& cli,
                                            const Index2D& cadji,
                                            const BoxDim& box,
                                            const Scalar r_maxsq,
                                            const unsigned int threads_per_particle,
                                            const unsigned int block_size,
                                            bool filter_body,
                                            bool filter_diameter,
                                            const Scalar3& ghost_width,
                                            const unsigned int compute_capability)
    {
    launcher<max_threads_per_particle>(d_nlist,
                                       threads_per_particle,
// don't compile the 1x nlist kernel in double precision builds
#ifdef SINGLE_PRECISION
//! Texture for reading d_cell_adj
texture<unsigned int, 2, cudaReadModeElementType> cell_adj_tex;
//! Texture for reading d_cell_size
texture<unsigned int, 1, cudaReadModeElementType> cell_size_tex;
//! Texture for reading d_cell_xyzf
texture<Scalar4, 2, cudaReadModeElementType> cell_xyzf_tex;
//! Texture for reading d_cell_tdb
texture<Scalar4, 2, cudaReadModeElementType> cell_tdb_tex;
//! Kernel call for generating the neighbor list on the GPU
/*! \tparam filter_flags Set bit 1 to enable body filtering. Set bit 2 to enable diameter filtering.
    \param d_nlist Neighbor list data structure to write
    \param d_n_neigh Number of neighbors to write
    \param d_last_updated_pos Particle positions at this update are written to this array
    \param d_conditions Conditions array for writing overflow condition
    \param nli Indexer to access \a d_nlist
    \param d_pos Particle positions
    \param d_body Particle body indices
    \param d_diameter Particle diameters
    \param N Number of particles
    \param ci Cell indexer for indexing cells
    \param box Simulation box dimensions
    \param r_maxsq The maximum radius for which to include particles as neighbors, squared
    \param r_max The maximum radius for which to include particles as neighbors
    \param ghost_width Width of ghost cell layer

    \note optimized for compute 1.x devices
*/
template<unsigned char filter_flags>
__global__ void gpu_compute_nlist_binned_1x_kernel(unsigned int *d_nlist,
                                                   unsigned int *d_n_neigh,
                                                   Scalar4 *d_last_updated_pos,
                                                   unsigned int *d_conditions,
                                                   const Index2D nli,
                                                   const Scalar4 *d_pos,
                                                   const unsigned int *d_body,
                                                   const Scalar *d_diameter,
                                                   const unsigned int N,
                                                   const Index3D ci,
                                                   const BoxDim box,
                                                   const Scalar r_maxsq,
                                                   const Scalar r_max,
                                                   const Scalar3 ghost_width)
    {
    bool filter_body = filter_flags & 1;
    bool filter_diameter = filter_flags & 2;

    // each thread is going to compute the neighbor list for a single particle
    int my_pidx = blockDim.x * blockIdx.x + threadIdx.x;

    // count the number of neighbors needed
    unsigned int n_neigh_needed = 0;

    // quit early if we are past the end of the array
    if (my_pidx >= N) return;

    // first, determine which bin this particle belongs to
    Scalar4 my_postype = d_pos[my_pidx];
    Scalar3 my_pos = make_scalar3(my_postype.x, my_postype.y, my_postype.z);

    unsigned int my_body = d_body[my_pidx];
    Scalar my_diameter = d_diameter[my_pidx];
    // get periodic flags
    uchar3 periodic = box.getPeriodic();

    // find the bin each particle belongs in
    Scalar3 f = box.makeFraction(my_pos,ghost_width);
    unsigned int ib = (unsigned int)(f.x * ci.getW());
    unsigned int jb = (unsigned int)(f.y * ci.getH());
    unsigned int kb = (unsigned int)(f.z * ci.getD());

    // need to handle the case where the particle is exactly at the box hi
    if (ib == ci.getW() && periodic.x)
        ib = 0;
    if (jb == ci.getH() && periodic.y)
        jb = 0;
    if (kb == ci.getD() && periodic.z)
        kb = 0;

    int my_cell = ci(ib,jb,kb);

    // each thread will determine the neighborlist of a single particle
    // count number of neighbors found so far in n_neigh
    unsigned int n_neigh = 0;

    // loop over all adjacent bins
    for (unsigned int cur_adj = 0; cur_adj < 27; cur_adj++)
        {
        int neigh_cell = tex2D(cell_adj_tex, cur_adj, my_cell);
        unsigned int size = tex1Dfetch(cell_size_tex, neigh_cell);

        // prefetch the first element of the cell
        Scalar4 next_xyzf = tex2D(cell_xyzf_tex, 0, neigh_cell);

        // now, we are set to loop through the array
        for (int cur_offset = 0; cur_offset < size; cur_offset++)
            {
            Scalar4 cur_xyzf = next_xyzf;
            next_xyzf = tex2D(cell_xyzf_tex, cur_offset+1, neigh_cell);
            Scalar4 cur_tdb = make_scalar4(Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0));
            if (filter_diameter || filter_body)
                cur_tdb = tex2D(cell_tdb_tex, cur_offset, neigh_cell);
            unsigned int neigh_body = __scalar_as_int(cur_tdb.z);
            Scalar neigh_diameter = cur_tdb.y;

            Scalar3 neigh_pos = make_scalar3(cur_xyzf.x,
                                             cur_xyzf.y,
                                             cur_xyzf.z);
            int cur_neigh = __scalar_as_int(cur_xyzf.w);

            // compute the distance between the two particles
            Scalar3 dx = my_pos - neigh_pos;

            // wrap the periodic boundary conditions
            dx = box.minImage(dx);
            // compute dr squared
            Scalar drsq = dot(dx,dx);
            bool excluded = (my_pidx == cur_neigh);

            if (filter_body && my_body != 0xffffffff)
                excluded = excluded | (my_body == neigh_body);

            Scalar sqshift = Scalar(0.0);
            if (filter_diameter)
                {
                // compute the shift in radius to accept neighbors based on their diameters
                Scalar delta = (my_diameter + neigh_diameter) * Scalar(0.5) - Scalar(1.0);
                // r^2 < (r_max + delta)^2
                // r^2 < r_maxsq + delta^2 + 2*r_max*delta
                sqshift = (delta + Scalar(2.0) * r_max) * delta;
                }

            if (drsq <= (r_maxsq + sqshift) && !excluded)
                {
                if (n_neigh < nli.getH())
                    d_nlist[nli(my_pidx, n_neigh)] = cur_neigh;
                else
                    n_neigh_needed = n_neigh+1;

                n_neigh++;
                }
            }
        }

    d_n_neigh[my_pidx] = n_neigh;
    d_last_updated_pos[my_pidx] = my_postype;

    if (n_neigh_needed > 0)
        atomicMax(&d_conditions[0], n_neigh_needed);
    }
#endif // #ifdef SINGLE_PRECISION
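
/*! \par Illustration (editor's sketch, not part of the original code)
    Both kernels in this file record the required list height in d_conditions[0] with atomicMax()
    whenever a particle finds more neighbors than nli.getH() allows. A minimal host-side sketch of
    consuming that overflow flag; the variable names are hypothetical:

    \code
    unsigned int conditions = 0;
    cudaMemcpy(&conditions, d_conditions, sizeof(unsigned int), cudaMemcpyDeviceToHost);
    if (conditions > 0)
        {
        // the neighbor list overflowed: reallocate with at least 'conditions' entries per
        // particle and rebuild the list on the next pass
        }
    \endcode
*/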
cudaError_t gpu_compute_nlist_binned_1x(unsigned int *d_nlist,
                                        unsigned int *d_n_neigh,
                                        Scalar4 *d_last_updated_pos,
                                        unsigned int *d_conditions,
                                        const Index2D& nli,
                                        const Scalar4 *d_pos,
                                        const unsigned int *d_body,
                                        const Scalar *d_diameter,
                                        const unsigned int N,
                                        const unsigned int *d_cell_size,
                                        const cudaArray *dca_cell_xyzf,
                                        const cudaArray *dca_cell_tdb,
                                        const cudaArray *dca_cell_adj,
                                        const Index3D& ci,
                                        const BoxDim& box,
                                        const Scalar r_maxsq,
                                        const unsigned int block_size,
                                        bool filter_body,
                                        bool filter_diameter,
                                        const Scalar3& ghost_width)
    {
    // don't compile the 1x nlist kernel in double precision builds
#ifdef SINGLE_PRECISION
    int n_blocks = (int)ceil(double(N)/double(block_size));

    cudaError_t err = cudaBindTextureToArray(cell_adj_tex, dca_cell_adj);
    if (err != cudaSuccess)
        return err;

    err = cudaBindTextureToArray(cell_xyzf_tex, dca_cell_xyzf);
    if (err != cudaSuccess)
        return err;

    err = cudaBindTextureToArray(cell_tdb_tex, dca_cell_tdb);
    if (err != cudaSuccess)
        return err;

    err = cudaBindTexture(0, cell_size_tex, d_cell_size, sizeof(unsigned int)*ci.getNumElements());
    if (err != cudaSuccess)
        return err;
    if (!filter_diameter && !filter_body)
        gpu_compute_nlist_binned_1x_kernel<0><<<n_blocks, block_size>>>(d_nlist,
    if (!filter_diameter && filter_body)
        gpu_compute_nlist_binned_1x_kernel<1><<<n_blocks, block_size>>>(d_nlist,
    if (filter_diameter && !filter_body)
        gpu_compute_nlist_binned_1x_kernel<2><<<n_blocks, block_size>>>(d_nlist,
    if (filter_diameter && filter_body)
        gpu_compute_nlist_binned_1x_kernel<3><<<n_blocks, block_size>>>(d_nlist,
#endif // #ifdef SINGLE_PRECISION