Moved domdec structures out of commrec.h.
[gromacs.git] / src / gromacs / mdlib / nbnxn_grid.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "nbnxn_grid.h"
40 #include "config.h"
42 #include <assert.h>
43 #include <string.h>
45 #include <cmath>
47 #include <algorithm>
49 #include "gromacs/domdec/domdec_struct.h"
50 #include "gromacs/gmxlib/gmx_omp_nthreads.h"
51 #include "gromacs/math/utilities.h"
52 #include "gromacs/math/vec.h"
53 #include "gromacs/mdlib/nb_verlet.h"
54 #include "gromacs/mdlib/nbnxn_atomdata.h"
55 #include "gromacs/mdlib/nbnxn_consts.h"
56 #include "gromacs/mdlib/nbnxn_internal.h"
57 #include "gromacs/mdlib/nbnxn_search.h"
58 #include "gromacs/mdlib/nbnxn_simd.h"
59 #include "gromacs/mdlib/nbnxn_util.h"
60 #include "gromacs/simd/simd.h"
61 #include "gromacs/simd/vector_operations.h"
62 #include "gromacs/utility/exceptions.h"
63 #include "gromacs/utility/smalloc.h"
65 struct gmx_domdec_zones_t;
67 static void nbnxn_grid_init(nbnxn_grid_t * grid)
69 grid->cxy_na = NULL;
70 grid->cxy_ind = NULL;
71 grid->cxy_nalloc = 0;
72 grid->bb = NULL;
73 grid->bbj = NULL;
74 grid->nc_nalloc = 0;
77 void nbnxn_grids_init(nbnxn_search_t nbs, int ngrid)
79 int g;
81 nbs->ngrid = ngrid;
83 snew(nbs->grid, nbs->ngrid);
84 for (g = 0; g < nbs->ngrid; g++)
86 nbnxn_grid_init(&nbs->grid[g]);
90 static real grid_atom_density(int n, rvec corner0, rvec corner1)
92 rvec size;
94 if (n == 0)
96 /* To avoid zero density we use a minimum of 1 atom */
97 n = 1;
100 rvec_sub(corner1, corner0, size);
102 return n/(size[XX]*size[YY]*size[ZZ]);
105 static int set_grid_size_xy(const nbnxn_search_t nbs,
106 nbnxn_grid_t *grid,
107 int dd_zone,
108 int n, rvec corner0, rvec corner1,
109 real atom_density)
111 rvec size;
112 int na_c;
113 int nc_max;
114 real tlen, tlen_x, tlen_y;
116 rvec_sub(corner1, corner0, size);
118 if (n > grid->na_sc)
120 assert(atom_density > 0);
122 /* target cell length */
123 if (grid->bSimple)
125 /* To minimize the number of zero interactions, we should make
126 * the largest of the i/j cells cubic.
128 na_c = std::max(grid->na_c, grid->na_cj);
130 /* Approximately cubic cells */
131 tlen = std::pow(static_cast<real>(na_c/atom_density), static_cast<real>(1.0/3.0));
132 tlen_x = tlen;
133 tlen_y = tlen;
135 else
137 /* Approximately cubic sub cells */
138 tlen = std::pow(static_cast<real>(grid->na_c/atom_density), static_cast<real>(1.0/3.0));
139 tlen_x = tlen*GPU_NSUBCELL_X;
140 tlen_y = tlen*GPU_NSUBCELL_Y;
142 /* We round ncx and ncy down, because we get fewer cell pairs
143 * in the pair list when the fixed cell dimensions (x,y) are
144 * larger than the variable one (z) than the other way around.
146 grid->ncx = std::max(1, static_cast<int>(size[XX]/tlen_x));
147 grid->ncy = std::max(1, static_cast<int>(size[YY]/tlen_y));
149 else
151 grid->ncx = 1;
152 grid->ncy = 1;
155 grid->sx = size[XX]/grid->ncx;
156 grid->sy = size[YY]/grid->ncy;
157 grid->inv_sx = 1/grid->sx;
158 grid->inv_sy = 1/grid->sy;
160 if (dd_zone > 0)
162 /* This is a non-home zone, add an extra row of cells
163 * for particles communicated for bonded interactions.
164 * These can be beyond the cut-off. It doesn't matter where
165 * they end up on the grid, but for performance it's better
166 * if they don't end up in cells that can be within cut-off range.
168 grid->ncx++;
169 grid->ncy++;
172 /* We need one additional cell entry for particles moved by DD */
173 if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
175 grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
176 srenew(grid->cxy_na, grid->cxy_nalloc);
177 srenew(grid->cxy_ind, grid->cxy_nalloc+1);
179 for (int t = 0; t < nbs->nthread_max; t++)
181 if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
183 nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
184 srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
188 /* Worst case scenario of 1 atom in each last cell */
189 if (grid->na_cj <= grid->na_c)
191 nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
193 else
195 nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
198 if (nc_max > grid->nc_nalloc)
200 grid->nc_nalloc = over_alloc_large(nc_max);
201 srenew(grid->nsubc, grid->nc_nalloc);
202 srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
204 sfree_aligned(grid->bb);
205 /* This snew also zeros the contents, which avoids possible
206 * floating-point exceptions in SIMD with the unused bb elements.
208 if (grid->bSimple)
210 snew_aligned(grid->bb, grid->nc_nalloc, 16);
212 else
214 #ifdef NBNXN_BBXXXX
215 int pbb_nalloc;
217 pbb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
218 snew_aligned(grid->pbb, pbb_nalloc, 16);
219 #else
220 snew_aligned(grid->bb, grid->nc_nalloc*GPU_NSUBCELL, 16);
221 #endif
224 if (grid->bSimple)
226 if (grid->na_cj == grid->na_c)
228 grid->bbj = grid->bb;
230 else
232 sfree_aligned(grid->bbj);
233 snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);
237 srenew(grid->flags, grid->nc_nalloc);
238 if (nbs->bFEP)
240 srenew(grid->fep, grid->nc_nalloc*grid->na_sc/grid->na_c);
244 copy_rvec(corner0, grid->c0);
245 copy_rvec(corner1, grid->c1);
246 copy_rvec(size, grid->size);
248 return nc_max;
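/* A minimal, hedged sketch (not part of the original file) of how the cell
 * counts above follow from the atom density: aim for roughly cubic cells of
 * na_c atoms each and round the x,y counts down, as set_grid_size_xy() does.
 * The function name and the flat parameter list are hypothetical.
 */
static int example_cell_counts(int na_c, real atom_density,
                               real size_x, real size_y,
                               int *ncx, int *ncy)
{
    real tlen = std::pow(static_cast<real>(na_c/atom_density),
                         static_cast<real>(1.0/3.0));

    *ncx = std::max(1, static_cast<int>(size_x/tlen));
    *ncy = std::max(1, static_cast<int>(size_y/tlen));

    /* Number of x,y columns on this grid */
    return (*ncx)*(*ncy);
}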
251 /* We need to sort particles in grid columns on z-coordinate.
252 * As particles are very often distributed homogeneously, we use a sorting
253 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
254 * by a factor, cast to an int and try to store in that hole. If the hole
255 * is full, we move this or another particle. A second pass is needed to make
256 * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
257 * 4 is the optimal value for homogeneous particle distribution and allows
258 * for an O(#particles) sort up to distributions where all particles are
259 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
260 * as it can be expensive to detect inhomogeneous particle distributions.
261 * SGSF is the maximum ratio of holes used, in the worst case all particles
262 * end up in the last hole and we need #particles extra holes at the end.
264 #define SORT_GRID_OVERSIZE 4
265 #define SGSF (SORT_GRID_OVERSIZE + 1)
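/* A small hedged sketch (not part of the original file) of the hole-index
 * mapping described above. After sort_atoms() rescales invh, a coordinate z
 * in the sorting range lands in hole (z - h0)*invh*n_per_h*SORT_GRID_OVERSIZE,
 * i.e. on average one particle per SORT_GRID_OVERSIZE holes. For example,
 * n_per_h = 8 particles over a 2 nm range gives 32 holes of 0.0625 nm each.
 * Collisions fall through to the next free hole; a second pass compacts the
 * occupied holes back into the index array.
 */
static int example_hole_index(real z, real h0, real invh, int n_per_h)
{
    return static_cast<int>((z - h0)*invh*n_per_h*SORT_GRID_OVERSIZE);
}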
267 /* Sort particle index a on coordinates x along dim.
268 * Backwards tells if we want decreasing instead of increasing coordinates.
269 * h0 is the minimum of the coordinate range.
270 * invh is the 1/length of the sorting range.
271 * n_per_h (>=n) is the expected average number of particles per 1/invh
272 * sort is the sorting work array.
273 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
274 * or easier, allocate at least n*SGSF elements.
276 static void sort_atoms(int dim, gmx_bool Backwards,
277 int gmx_unused dd_zone,
278 int *a, int n, rvec *x,
279 real h0, real invh, int n_per_h,
280 int *sort)
282 if (n <= 1)
284 /* Nothing to do */
285 return;
288 #ifndef NDEBUG
289 if (n > n_per_h)
291 gmx_incons("n > n_per_h");
293 #endif
295 /* Transform the inverse range height into the inverse hole height */
296 invh *= n_per_h*SORT_GRID_OVERSIZE;
298 /* Set nsort to the maximum possible number of holes used.
299 * In worst case all n elements end up in the last bin.
301 int nsort = n_per_h*SORT_GRID_OVERSIZE + n;
303 /* Determine the index range used, so we can limit it for the second pass */
304 int zi_min = INT_MAX;
305 int zi_max = -1;
307 /* Sort the particles using a simple index sort */
308 for (int i = 0; i < n; i++)
310 /* The cast takes care of floating-point rounding effects below zero.
311 * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
312 * times the box height out of the box.
314 int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
316 #ifndef NDEBUG
317 /* As we can have rounding effects, we use > instead of >= here */
318 if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
320 gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
321 a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
322 n_per_h, SORT_GRID_OVERSIZE);
324 #endif
326 /* In a non-local domain, particles communicated for bonded interactions
327 * can be far beyond the grid size, which is set by the non-bonded
328 * cut-off distance. We sort such particles into the last cell.
330 if (zi > n_per_h*SORT_GRID_OVERSIZE)
332 zi = n_per_h*SORT_GRID_OVERSIZE;
335 /* Ideally this particle should go in sort cell zi,
336 * but that might already be in use,
337 * in that case find the first empty cell higher up
339 if (sort[zi] < 0)
341 sort[zi] = a[i];
342 zi_min = std::min(zi_min, zi);
343 zi_max = std::max(zi_max, zi);
345 else
347 /* We have multiple atoms in the same sorting slot.
348 * Sort on real z for minimal bounding box size.
349 * There is an extra check for identical z to ensure
350 * a well-defined output order, independent of input order,
351 * for binary reproducibility after restarts.
353 while (sort[zi] >= 0 && ( x[a[i]][dim] > x[sort[zi]][dim] ||
354 (x[a[i]][dim] == x[sort[zi]][dim] &&
355 a[i] > sort[zi])))
357 zi++;
360 if (sort[zi] >= 0)
362 /* Shift all elements by one slot until we find an empty slot */
363 int cp = sort[zi];
364 int zim = zi + 1;
365 while (sort[zim] >= 0)
367 int tmp = sort[zim];
368 sort[zim] = cp;
369 cp = tmp;
370 zim++;
372 sort[zim] = cp;
373 zi_max = std::max(zi_max, zim);
375 sort[zi] = a[i];
376 zi_max = std::max(zi_max, zi);
380 int c = 0;
381 if (!Backwards)
383 for (int zi = 0; zi < nsort; zi++)
385 if (sort[zi] >= 0)
387 a[c++] = sort[zi];
388 sort[zi] = -1;
392 else
394 for (int zi = zi_max; zi >= zi_min; zi--)
396 if (sort[zi] >= 0)
398 a[c++] = sort[zi];
399 sort[zi] = -1;
403 if (c < n)
405 gmx_incons("Lost particles while sorting");
409 #ifdef GMX_DOUBLE
410 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
411 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
412 #else
413 #define R2F_D(x) (x)
414 #define R2F_U(x) (x)
415 #endif
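/* Hedged illustration (not part of the original file): in double precision
 * the bounds are nudged outwards before the float conversion, so the float
 * box is guaranteed to still enclose the double-precision coordinates and
 * bounding-box pruning can never miss a pair. Scalar equivalent of R2F_D:
 */
static float example_round_bound_down(double x)
{
    return static_cast<float>(x >= 0 ? (1 - GMX_FLOAT_EPS)*x : (1 + GMX_FLOAT_EPS)*x);
}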
417 /* Coordinate order x,y,z, bb order xyz0 */
418 static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
420 int i;
421 real xl, xh, yl, yh, zl, zh;
423 i = 0;
424 xl = x[i+XX];
425 xh = x[i+XX];
426 yl = x[i+YY];
427 yh = x[i+YY];
428 zl = x[i+ZZ];
429 zh = x[i+ZZ];
430 i += stride;
431 for (int j = 1; j < na; j++)
433 xl = std::min(xl, x[i+XX]);
434 xh = std::max(xh, x[i+XX]);
435 yl = std::min(yl, x[i+YY]);
436 yh = std::max(yh, x[i+YY]);
437 zl = std::min(zl, x[i+ZZ]);
438 zh = std::max(zh, x[i+ZZ]);
439 i += stride;
441 /* Note: possible double to float conversion here */
442 bb->lower[BB_X] = R2F_D(xl);
443 bb->lower[BB_Y] = R2F_D(yl);
444 bb->lower[BB_Z] = R2F_D(zl);
445 bb->upper[BB_X] = R2F_U(xh);
446 bb->upper[BB_Y] = R2F_U(yh);
447 bb->upper[BB_Z] = R2F_U(zh);
450 /* Packed coordinates, bb order xyz0 */
451 static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
453 real xl, xh, yl, yh, zl, zh;
455 xl = x[XX*PACK_X4];
456 xh = x[XX*PACK_X4];
457 yl = x[YY*PACK_X4];
458 yh = x[YY*PACK_X4];
459 zl = x[ZZ*PACK_X4];
460 zh = x[ZZ*PACK_X4];
461 for (int j = 1; j < na; j++)
463 xl = std::min(xl, x[j+XX*PACK_X4]);
464 xh = std::max(xh, x[j+XX*PACK_X4]);
465 yl = std::min(yl, x[j+YY*PACK_X4]);
466 yh = std::max(yh, x[j+YY*PACK_X4]);
467 zl = std::min(zl, x[j+ZZ*PACK_X4]);
468 zh = std::max(zh, x[j+ZZ*PACK_X4]);
470 /* Note: possible double to float conversion here */
471 bb->lower[BB_X] = R2F_D(xl);
472 bb->lower[BB_Y] = R2F_D(yl);
473 bb->lower[BB_Z] = R2F_D(zl);
474 bb->upper[BB_X] = R2F_U(xh);
475 bb->upper[BB_Y] = R2F_U(yh);
476 bb->upper[BB_Z] = R2F_U(zh);
479 /* Packed coordinates, bb order xyz0 */
480 static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
482 real xl, xh, yl, yh, zl, zh;
484 xl = x[XX*PACK_X8];
485 xh = x[XX*PACK_X8];
486 yl = x[YY*PACK_X8];
487 yh = x[YY*PACK_X8];
488 zl = x[ZZ*PACK_X8];
489 zh = x[ZZ*PACK_X8];
490 for (int j = 1; j < na; j++)
492 xl = std::min(xl, x[j+XX*PACK_X8]);
493 xh = std::max(xh, x[j+XX*PACK_X8]);
494 yl = std::min(yl, x[j+YY*PACK_X8]);
495 yh = std::max(yh, x[j+YY*PACK_X8]);
496 zl = std::min(zl, x[j+ZZ*PACK_X8]);
497 zh = std::max(zh, x[j+ZZ*PACK_X8]);
499 /* Note: possible double to float conversion here */
500 bb->lower[BB_X] = R2F_D(xl);
501 bb->lower[BB_Y] = R2F_D(yl);
502 bb->lower[BB_Z] = R2F_D(zl);
503 bb->upper[BB_X] = R2F_U(xh);
504 bb->upper[BB_Y] = R2F_U(yh);
505 bb->upper[BB_Z] = R2F_U(zh);
508 /* Packed coordinates, bb order xyz0 */
509 static void calc_bounding_box_x_x4_halves(int na, const real *x,
510 nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
512 calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
514 if (na > 2)
516 calc_bounding_box_x_x4(std::min(na-2, 2), x+(PACK_X4>>1), bbj+1);
518 else
520 /* Set the "empty" bounding box to the same as the first one,
521 * so we don't need to treat special cases in the rest of the code.
523 #ifdef NBNXN_SEARCH_BB_SIMD4
524 gmx_simd4_store_f(&bbj[1].lower[0], gmx_simd4_load_f(&bbj[0].lower[0]));
525 gmx_simd4_store_f(&bbj[1].upper[0], gmx_simd4_load_f(&bbj[0].upper[0]));
526 #else
527 bbj[1] = bbj[0];
528 #endif
531 #ifdef NBNXN_SEARCH_BB_SIMD4
532 gmx_simd4_store_f(&bb->lower[0],
533 gmx_simd4_min_f(gmx_simd4_load_f(&bbj[0].lower[0]),
534 gmx_simd4_load_f(&bbj[1].lower[0])));
535 gmx_simd4_store_f(&bb->upper[0],
536 gmx_simd4_max_f(gmx_simd4_load_f(&bbj[0].upper[0]),
537 gmx_simd4_load_f(&bbj[1].upper[0])));
538 #else
540 int i;
542 for (i = 0; i < NNBSBB_C; i++)
544 bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
545 bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
548 #endif
551 #ifdef NBNXN_SEARCH_BB_SIMD4
553 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
554 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
556 int i;
557 real xl, xh, yl, yh, zl, zh;
559 i = 0;
560 xl = x[i+XX];
561 xh = x[i+XX];
562 yl = x[i+YY];
563 yh = x[i+YY];
564 zl = x[i+ZZ];
565 zh = x[i+ZZ];
566 i += stride;
567 for (int j = 1; j < na; j++)
569 xl = std::min(xl, x[i+XX]);
570 xh = std::max(xh, x[i+XX]);
571 yl = std::min(yl, x[i+YY]);
572 yh = std::max(yh, x[i+YY]);
573 zl = std::min(zl, x[i+ZZ]);
574 zh = std::max(zh, x[i+ZZ]);
575 i += stride;
577 /* Note: possible double to float conversion here */
578 bb[0*STRIDE_PBB] = R2F_D(xl);
579 bb[1*STRIDE_PBB] = R2F_D(yl);
580 bb[2*STRIDE_PBB] = R2F_D(zl);
581 bb[3*STRIDE_PBB] = R2F_U(xh);
582 bb[4*STRIDE_PBB] = R2F_U(yh);
583 bb[5*STRIDE_PBB] = R2F_U(zh);
586 #endif /* NBNXN_SEARCH_BB_SIMD4 */
588 #ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
590 /* Coordinate order xyz?, bb order xyz0 */
591 static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
593 gmx_simd4_float_t bb_0_S, bb_1_S;
594 gmx_simd4_float_t x_S;
596 bb_0_S = gmx_simd4_load_f(x);
597 bb_1_S = bb_0_S;
599 for (int i = 1; i < na; i++)
601 x_S = gmx_simd4_load_f(x+i*NNBSBB_C);
602 bb_0_S = gmx_simd4_min_f(bb_0_S, x_S);
603 bb_1_S = gmx_simd4_max_f(bb_1_S, x_S);
606 gmx_simd4_store_f(&bb->lower[0], bb_0_S);
607 gmx_simd4_store_f(&bb->upper[0], bb_1_S);
610 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
611 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
612 nbnxn_bb_t *bb_work_aligned,
613 real *bb)
615 calc_bounding_box_simd4(na, x, bb_work_aligned);
617 bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
618 bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
619 bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
620 bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
621 bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
622 bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
625 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
628 /* Combines pairs of consecutive bounding boxes */
629 static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
631 for (int i = 0; i < grid->ncx*grid->ncy; i++)
633 /* Starting bb in a column is expected to be 2-aligned */
634 int sc2 = grid->cxy_ind[i]>>1;
635 /* For odd numbers skip the last bb here */
636 int nc2 = (grid->cxy_na[i]+3)>>(2+1);
637 int c2;
638 for (c2 = sc2; c2 < sc2+nc2; c2++)
640 #ifdef NBNXN_SEARCH_BB_SIMD4
641 gmx_simd4_float_t min_S, max_S;
643 min_S = gmx_simd4_min_f(gmx_simd4_load_f(&bb[c2*2+0].lower[0]),
644 gmx_simd4_load_f(&bb[c2*2+1].lower[0]));
645 max_S = gmx_simd4_max_f(gmx_simd4_load_f(&bb[c2*2+0].upper[0]),
646 gmx_simd4_load_f(&bb[c2*2+1].upper[0]));
647 gmx_simd4_store_f(&grid->bbj[c2].lower[0], min_S);
648 gmx_simd4_store_f(&grid->bbj[c2].upper[0], max_S);
649 #else
650 for (int j = 0; j < NNBSBB_C; j++)
652 grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
653 bb[c2*2+1].lower[j]);
654 grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
655 bb[c2*2+1].upper[j]);
657 #endif
659 if (((grid->cxy_na[i]+3)>>2) & 1)
661 /* The bb count in this column is odd: duplicate the last bb */
662 for (int j = 0; j < NNBSBB_C; j++)
664 grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
665 grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
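/* Hedged helper sketch (not part of the original file): the shift arithmetic
 * used above, spelled out for the simple CPU layout with 4-atom i-cells. A
 * column with cxy_na atoms has ceil(cxy_na/4) i-cell bounding boxes; they are
 * merged in pairs into j-cell boxes, and an odd last box is duplicated.
 */
static void example_bbj_counts(int cxy_na, int *n_pairs, int *have_odd_bb)
{
    int n_bb     = (cxy_na + 3) >> 2;   /* i-cell bounding boxes in the column */

    *n_pairs     = n_bb >> 1;           /* complete pairs, nc2 above           */
    *have_odd_bb = n_bb & 1;            /* 1 when the last box is duplicated   */
}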
672 /* Prints the average bb size, used for debug output */
673 static void print_bbsizes_simple(FILE *fp,
674 const nbnxn_grid_t *grid)
676 dvec ba;
678 clear_dvec(ba);
679 for (int c = 0; c < grid->nc; c++)
681 for (int d = 0; d < DIM; d++)
683 ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
686 dsvmul(1.0/grid->nc, ba, ba);
688 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
689 grid->sx,
690 grid->sy,
691 grid->atom_density > 0 ?
692 grid->na_c/(grid->atom_density*grid->sx*grid->sy) : 0.0,
693 ba[XX], ba[YY], ba[ZZ],
694 ba[XX]/grid->sx,
695 ba[YY]/grid->sy,
696 grid->atom_density > 0 ?
697 ba[ZZ]/(grid->na_c/(grid->atom_density*grid->sx*grid->sy)) : 0.0);
700 /* Prints the average bb size, used for debug output */
701 static void print_bbsizes_supersub(FILE *fp,
702 const nbnxn_grid_t *grid)
704 int ns;
705 dvec ba;
707 clear_dvec(ba);
708 ns = 0;
709 for (int c = 0; c < grid->nc; c++)
711 #ifdef NBNXN_BBXXXX
712 for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
714 int cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
715 for (int i = 0; i < STRIDE_PBB; i++)
717 for (int d = 0; d < DIM; d++)
719 ba[d] +=
720 grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
721 grid->pbb[cs_w*NNBSBB_XXXX+ d *STRIDE_PBB+i];
725 #else
726 for (int s = 0; s < grid->nsubc[c]; s++)
728 int cs = c*GPU_NSUBCELL + s;
729 for (int d = 0; d < DIM; d++)
731 ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
734 #endif
735 ns += grid->nsubc[c];
737 dsvmul(1.0/ns, ba, ba);
739 fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
740 grid->sx/GPU_NSUBCELL_X,
741 grid->sy/GPU_NSUBCELL_Y,
742 grid->atom_density > 0 ?
743 grid->na_sc/(grid->atom_density*grid->sx*grid->sy*GPU_NSUBCELL_Z) : 0.0,
744 ba[XX], ba[YY], ba[ZZ],
745 ba[XX]*GPU_NSUBCELL_X/grid->sx,
746 ba[YY]*GPU_NSUBCELL_Y/grid->sy,
747 grid->atom_density > 0 ?
748 ba[ZZ]/(grid->na_sc/(grid->atom_density*grid->sx*grid->sy*GPU_NSUBCELL_Z)) : 0.0);
751 /* Set non-bonded interaction flags for the current cluster.
752 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
754 static void sort_cluster_on_flag(int na_c,
755 int a0, int a1, const int *atinfo,
756 int *order,
757 int *flags)
759 int subc;
760 int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
761 int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
763 *flags = 0;
765 subc = 0;
766 for (int s = a0; s < a1; s += na_c)
768 /* Make lists for this (sub-)cell of atoms with and without LJ */
769 int n1 = 0;
770 int n2 = 0;
771 gmx_bool haveQ = FALSE;
772 int a_lj_max = -1;
773 for (int a = s; a < std::min(s+na_c, a1); a++)
775 haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
777 if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
779 sort1[n1++] = order[a];
780 a_lj_max = a;
782 else
784 sort2[n2++] = order[a];
788 /* If we don't have atoms with LJ, there's nothing to sort */
789 if (n1 > 0)
791 *flags |= NBNXN_CI_DO_LJ(subc);
793 if (2*n1 <= na_c)
795 /* Only sort when strictly necessary. Ordering particles
796 * can lead to less accurate summation
797 * due to rounding, both for LJ and Coulomb interactions.
799 if (2*(a_lj_max - s) >= na_c)
801 for (int i = 0; i < n1; i++)
803 order[a0+i] = sort1[i];
805 for (int j = 0; j < n2; j++)
807 order[a0+n1+j] = sort2[j];
811 *flags |= NBNXN_CI_HALF_LJ(subc);
814 if (haveQ)
816 *flags |= NBNXN_CI_DO_COUL(subc);
818 subc++;
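/* Hedged sketch (not part of the original file): the essential effect of the
 * sort above, when at most half of a cluster's atoms have LJ parameters, is a
 * stable partition with LJ atoms first, so a kernel flagged with HALF_LJ can
 * skip LJ work for the second half of the cluster. The names and the scratch
 * argument here are hypothetical.
 */
static void example_partition_on_lj(int *order, int n, const bool *atomHasLJ,
                                    int *scratch)
{
    int nLJ = 0, nNoLJ = 0;

    /* First the atoms with LJ, keeping their original relative order */
    for (int a = 0; a < n; a++)
    {
        if (atomHasLJ[order[a]])
        {
            scratch[nLJ++] = order[a];
        }
    }
    /* Then the atoms without LJ */
    for (int a = 0; a < n; a++)
    {
        if (!atomHasLJ[order[a]])
        {
            scratch[nLJ + nNoLJ++] = order[a];
        }
    }
    for (int a = 0; a < n; a++)
    {
        order[a] = scratch[a];
    }
}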
822 /* Fill a pair search cell with atoms.
823 * Potentially sorts atoms and sets the interaction flags.
825 static void fill_cell(const nbnxn_search_t nbs,
826 nbnxn_grid_t *grid,
827 nbnxn_atomdata_t *nbat,
828 int a0, int a1,
829 const int *atinfo,
830 rvec *x,
831 int sx, int sy, int sz,
832 nbnxn_bb_t gmx_unused *bb_work_aligned)
834 int na, a;
835 size_t offset;
836 nbnxn_bb_t *bb_ptr;
837 #ifdef NBNXN_BBXXXX
838 float *pbb_ptr;
839 #endif
841 na = a1 - a0;
843 if (grid->bSimple)
845 /* Note that non-local grids are already sorted.
846 * Then sort_cluster_on_flag will only set the flags and the sorting
847 * will not affect the atom order.
849 sort_cluster_on_flag(grid->na_c, a0, a1, atinfo, nbs->a,
850 grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
853 if (nbs->bFEP)
855 /* Set the fep flag for perturbed atoms in this (sub-)cell */
856 int c;
858 /* The grid-local cluster/(sub-)cell index */
859 c = (a0 >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : GPU_NSUBCELL);
860 grid->fep[c] = 0;
861 for (int at = a0; at < a1; at++)
863 if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
865 grid->fep[c] |= (1 << (at - a0));
870 /* Now we have sorted the atoms, set the cell indices */
871 for (a = a0; a < a1; a++)
873 nbs->cell[nbs->a[a]] = a;
876 copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
877 nbat->XFormat, nbat->x, a0,
878 sx, sy, sz);
880 if (nbat->XFormat == nbatX4)
882 /* Store the bounding boxes as xyz.xyz. */
883 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
884 bb_ptr = grid->bb + offset;
886 #if defined GMX_NBNXN_SIMD && GMX_SIMD_REAL_WIDTH == 2
887 if (2*grid->na_cj == grid->na_c)
889 calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
890 grid->bbj+offset*2);
892 else
893 #endif
895 calc_bounding_box_x_x4(na, nbat->x+X4_IND_A(a0), bb_ptr);
898 else if (nbat->XFormat == nbatX8)
900 /* Store the bounding boxes as xyz.xyz. */
901 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
902 bb_ptr = grid->bb + offset;
904 calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);
906 #ifdef NBNXN_BBXXXX
907 else if (!grid->bSimple)
909 /* Store the bounding boxes in a format convenient
910 * for SIMD4 calculations: xxxxyyyyzzzz...
912 pbb_ptr =
913 grid->pbb +
914 ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
915 (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
917 #ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
918 if (nbat->XFormat == nbatXYZQ)
920 calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride,
921 bb_work_aligned, pbb_ptr);
923 else
924 #endif
926 calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
927 pbb_ptr);
929 if (gmx_debug_at)
931 fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
932 sx, sy, sz,
933 pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
934 pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
935 pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
938 #endif
939 else
941 /* Store the bounding boxes as xyz.xyz. */
942 bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log);
944 calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
945 bb_ptr);
947 if (gmx_debug_at)
949 int bbo;
950 bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
951 fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
952 sx, sy, sz,
953 grid->bb[bbo].lower[BB_X],
954 grid->bb[bbo].lower[BB_Y],
955 grid->bb[bbo].lower[BB_Z],
956 grid->bb[bbo].upper[BB_X],
957 grid->bb[bbo].upper[BB_Y],
958 grid->bb[bbo].upper[BB_Z]);
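/* Hedged sketch (not part of the original file) of the packed-layout index
 * arithmetic used for pbb_ptr above. For the n-th sub-cell of this grid,
 * bounds are stored in groups of STRIDE_PBB sub-cells, NNBSBB_XXXX floats per
 * group (xxxx yyyy zzzz for the lower and then the upper corners), with the
 * position inside the group as the inner offset:
 */
static int example_packed_bb_offset(int subCell)
{
    int group = subCell >> STRIDE_PBB_2LOG;  /* which group of STRIDE_PBB sub-cells */
    int inner = subCell & (STRIDE_PBB - 1);  /* position within that group          */

    return group*NNBSBB_XXXX + inner;        /* float index of this sub-cell's x lower bound */
}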
963 /* Spatially sort the atoms within one grid column */
964 static void sort_columns_simple(const nbnxn_search_t nbs,
965 int dd_zone,
966 nbnxn_grid_t *grid,
967 int a0, int a1,
968 const int *atinfo,
969 rvec *x,
970 nbnxn_atomdata_t *nbat,
971 int cxy_start, int cxy_end,
972 int *sort_work)
974 int cfilled, c;
976 if (debug)
978 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
979 grid->cell0, cxy_start, cxy_end, a0, a1);
982 /* Sort the atoms within each x,y column in 3 dimensions */
983 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
985 int cx = cxy/grid->ncy;
986 int cy = cxy - cx*grid->ncy;
988 int na = grid->cxy_na[cxy];
989 int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
990 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
992 /* Sort the atoms within each x,y column on z coordinate */
993 sort_atoms(ZZ, FALSE, dd_zone,
994 nbs->a+ash, na, x,
995 grid->c0[ZZ],
996 1.0/grid->size[ZZ], ncz*grid->na_sc,
997 sort_work);
999 /* Fill the ncz cells in this column */
1000 cfilled = grid->cxy_ind[cxy];
1001 for (int cz = 0; cz < ncz; cz++)
1003 c = grid->cxy_ind[cxy] + cz;
1005 int ash_c = ash + cz*grid->na_sc;
1006 int na_c = std::min(grid->na_sc, na-(ash_c-ash));
1008 fill_cell(nbs, grid, nbat,
1009 ash_c, ash_c+na_c, atinfo, x,
1010 grid->na_sc*cx + (dd_zone >> 2),
1011 grid->na_sc*cy + (dd_zone & 3),
1012 grid->na_sc*cz,
1013 NULL);
1015 /* This copy to bbcz is not really necessary.
1016 * But it allows us to use the same grid search code
1017 * for the simple and supersub cell setups.
1019 if (na_c > 0)
1021 cfilled = c;
1023 grid->bbcz[c*NNBSBB_D ] = grid->bb[cfilled].lower[BB_Z];
1024 grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
1027 /* Set the unused atom indices to -1 */
1028 for (int ind = na; ind < ncz*grid->na_sc; ind++)
1030 nbs->a[ash+ind] = -1;
1035 /* Spatially sort the atoms within one grid column */
1036 static void sort_columns_supersub(const nbnxn_search_t nbs,
1037 int dd_zone,
1038 nbnxn_grid_t *grid,
1039 int a0, int a1,
1040 const int *atinfo,
1041 rvec *x,
1042 nbnxn_atomdata_t *nbat,
1043 int cxy_start, int cxy_end,
1044 int *sort_work)
1046 nbnxn_bb_t bb_work_array[2], *bb_work_aligned;
1048 bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array+1)) & (~(static_cast<std::size_t>(15))));
1050 if (debug)
1052 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1053 grid->cell0, cxy_start, cxy_end, a0, a1);
1056 int subdiv_x = grid->na_c;
1057 int subdiv_y = GPU_NSUBCELL_X*subdiv_x;
1058 int subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
1060 /* Sort the atoms within each x,y column in 3 dimensions */
1061 for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1063 int cx = cxy/grid->ncy;
1064 int cy = cxy - cx*grid->ncy;
1066 int na = grid->cxy_na[cxy];
1067 int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1068 int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1070 /* Sort the atoms within each x,y column on z coordinate */
1071 sort_atoms(ZZ, FALSE, dd_zone,
1072 nbs->a+ash, na, x,
1073 grid->c0[ZZ],
1074 1.0/grid->size[ZZ], ncz*grid->na_sc,
1075 sort_work);
1077 /* This loop goes over the supercells and subcells along z at once */
1078 for (int sub_z = 0; sub_z < ncz*GPU_NSUBCELL_Z; sub_z++)
1080 int ash_z = ash + sub_z*subdiv_z;
1081 int na_z = std::min(subdiv_z, na-(ash_z-ash));
1082 int cz = -1;
1083 /* We have already sorted on z */
1085 if (sub_z % GPU_NSUBCELL_Z == 0)
1087 cz = sub_z/GPU_NSUBCELL_Z;
1088 int c = grid->cxy_ind[cxy] + cz;
1090 /* The number of atoms in this supercell */
1091 int na_c = std::min(grid->na_sc, na-(ash_z-ash));
1093 grid->nsubc[c] = std::min(GPU_NSUBCELL, (na_c+grid->na_c-1)/grid->na_c);
1095 /* Store the z-boundaries of the super cell */
1096 grid->bbcz[c*NNBSBB_D ] = x[nbs->a[ash_z]][ZZ];
1097 grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
1100 #if GPU_NSUBCELL_Y > 1
1101 /* Sort the atoms along y */
1102 sort_atoms(YY, (sub_z & 1), dd_zone,
1103 nbs->a+ash_z, na_z, x,
1104 grid->c0[YY]+cy*grid->sy,
1105 grid->inv_sy, subdiv_z,
1106 sort_work);
1107 #endif
1109 for (int sub_y = 0; sub_y < GPU_NSUBCELL_Y; sub_y++)
1111 int ash_y = ash_z + sub_y*subdiv_y;
1112 int na_y = std::min(subdiv_y, na-(ash_y-ash));
1114 #if GPU_NSUBCELL_X > 1
1115 /* Sort the atoms along x */
1116 sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1), dd_zone,
1117 nbs->a+ash_y, na_y, x,
1118 grid->c0[XX]+cx*grid->sx,
1119 grid->inv_sx, subdiv_y,
1120 sort_work);
1121 #endif
1123 for (int sub_x = 0; sub_x < GPU_NSUBCELL_X; sub_x++)
1125 int ash_x = ash_y + sub_x*subdiv_x;
1126 int na_x = std::min(subdiv_x, na-(ash_x-ash));
1128 fill_cell(nbs, grid, nbat,
1129 ash_x, ash_x+na_x, atinfo, x,
1130 grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
1131 grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
1132 grid->na_c*sub_z,
1133 bb_work_aligned);
1138 /* Set the unused atom indices to -1 */
1139 for (int ind = na; ind < ncz*grid->na_sc; ind++)
1141 nbs->a[ash+ind] = -1;
1146 /* Determine in which grid column atoms should go */
1147 static void calc_column_indices(nbnxn_grid_t *grid,
1148 int a0, int a1,
1149 rvec *x,
1150 int dd_zone, const int *move,
1151 int thread, int nthread,
1152 int *cell,
1153 int *cxy_na)
1155 /* We add one extra cell for particles which moved during DD */
1156 for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
1158 cxy_na[i] = 0;
1161 int n0 = a0 + static_cast<int>((thread+0)*(a1 - a0))/nthread;
1162 int n1 = a0 + static_cast<int>((thread+1)*(a1 - a0))/nthread;
1163 if (dd_zone == 0)
1165 /* Home zone */
1166 for (int i = n0; i < n1; i++)
1168 if (move == NULL || move[i] >= 0)
1170 /* We need to be careful with rounding,
1171 * particles might be a few bits outside the local zone.
1172 * The int cast takes care of the lower bound,
1173 * we will explicitly take care of the upper bound.
1175 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1176 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1178 #ifndef NDEBUG
1179 if (cx < 0 || cx > grid->ncx ||
1180 cy < 0 || cy > grid->ncy)
1182 gmx_fatal(FARGS,
1183 "grid cell cx %d cy %d out of range (max %d %d)\n"
1184 "atom %f %f %f, grid->c0 %f %f",
1185 cx, cy, grid->ncx, grid->ncy,
1186 x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
1188 #endif
1189 /* Take care of potential rounding issues */
1190 cx = std::min(cx, grid->ncx - 1);
1191 cy = std::min(cy, grid->ncy - 1);
1193 /* For the moment cell will contain only the grid-local
1194 * x and y indices, not z.
1196 cell[i] = cx*grid->ncy + cy;
1198 else
1200 /* Put this moved particle after the end of the grid,
1201 * so we can process it later without using conditionals.
1203 cell[i] = grid->ncx*grid->ncy;
1206 cxy_na[cell[i]]++;
1209 else
1211 /* Non-home zone */
1212 for (int i = n0; i < n1; i++)
1214 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1215 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1217 /* For non-home zones there could be particles outside
1218 * the non-bonded cut-off range, which have been communicated
1219 * for bonded interactions only. For the result it doesn't
1220 * matter where these end up on the grid. For performance
1221 * we put them in an extra row at the border.
1223 cx = std::max(cx, 0);
1224 cx = std::min(cx, grid->ncx - 1);
1225 cy = std::max(cy, 0);
1226 cy = std::min(cy, grid->ncy - 1);
1228 /* For the moment cell will contain only the grid-local
1229 * x and y indices, not z.
1231 cell[i] = cx*grid->ncy + cy;
1233 cxy_na[cell[i]]++;
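/* Hedged sketch (not part of the original file): the column assignment for a
 * single atom as done in calc_column_indices() above, with clamping to stay
 * on the grid when rounding puts an atom just past an edge. The name and the
 * flat parameter list are hypothetical.
 */
static int example_column_index(const rvec xi, const rvec c0,
                                real inv_sx, real inv_sy, int ncx, int ncy)
{
    int cx = std::min(std::max(static_cast<int>((xi[XX] - c0[XX])*inv_sx), 0), ncx - 1);
    int cy = std::min(std::max(static_cast<int>((xi[YY] - c0[YY])*inv_sy), 0), ncy - 1);

    /* Columns are numbered with y running fastest */
    return cx*ncy + cy;
}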
1238 /* Determine in which grid cells the atoms should go */
1239 static void calc_cell_indices(const nbnxn_search_t nbs,
1240 int dd_zone,
1241 nbnxn_grid_t *grid,
1242 int a0, int a1,
1243 const int *atinfo,
1244 rvec *x,
1245 const int *move,
1246 nbnxn_atomdata_t *nbat)
1248 int n0, n1;
1249 int cx, cy, cxy, ncz_max, ncz;
1250 int nthread;
1251 int cxy_na_i;
1253 nthread = gmx_omp_nthreads_get(emntPairsearch);
1255 #pragma omp parallel for num_threads(nthread) schedule(static)
1256 for (int thread = 0; thread < nthread; thread++)
1260 calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
1261 nbs->cell, nbs->work[thread].cxy_na);
1263 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1266 /* Make the cell index as a function of x and y */
1267 ncz_max = 0;
1268 ncz = 0;
1269 grid->cxy_ind[0] = 0;
1270 for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
1272 /* We set ncz_max at the beginning of the loop instead of at the end
1273 * to skip i=grid->ncx*grid->ncy, which holds moved particles
1274 * that do not need to be ordered on the grid.
1276 if (ncz > ncz_max)
1278 ncz_max = ncz;
1280 cxy_na_i = nbs->work[0].cxy_na[i];
1281 for (int thread = 1; thread < nthread; thread++)
1283 cxy_na_i += nbs->work[thread].cxy_na[i];
1285 ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1286 if (nbat->XFormat == nbatX8)
1288 /* Make the number of cells a multiple of 2 */
1289 ncz = (ncz + 1) & ~1;
1291 grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1292 /* Clear cxy_na, so we can reuse the array below */
1293 grid->cxy_na[i] = 0;
1295 grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
1297 nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
1299 if (debug)
1301 fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1302 grid->na_sc, grid->na_c, grid->nc,
1303 grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
1304 ncz_max);
1305 if (gmx_debug_at)
1307 int i = 0;
1308 for (cy = 0; cy < grid->ncy; cy++)
1310 for (cx = 0; cx < grid->ncx; cx++)
1312 fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1313 i++;
1315 fprintf(debug, "\n");
1320 /* Make sure the work array for sorting is large enough */
1321 if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
1323 for (int thread = 0; thread < nbs->nthread_max; thread++)
1325 nbs->work[thread].sort_work_nalloc =
1326 over_alloc_large(ncz_max*grid->na_sc*SGSF);
1327 srenew(nbs->work[thread].sort_work,
1328 nbs->work[thread].sort_work_nalloc);
1329 /* When not in use, all elements should be -1 */
1330 for (int i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
1332 nbs->work[thread].sort_work[i] = -1;
1337 /* Now that we know the dimensions, we can fill the grid.
1338 * This is the first, unsorted fill. We sort the columns after this.
1340 for (int i = a0; i < a1; i++)
1342 /* At this point nbs->cell contains the local grid x,y indices */
1343 cxy = nbs->cell[i];
1344 nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1347 if (dd_zone == 0)
1349 /* Set the cell indices for the moved particles */
1350 n0 = grid->nc*grid->na_sc;
1351 n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
1352 if (dd_zone == 0)
1354 for (int i = n0; i < n1; i++)
1356 nbs->cell[nbs->a[i]] = i;
1361 /* Sort the super-cell columns along z into the sub-cells. */
1362 #pragma omp parallel for num_threads(nthread) schedule(static)
1363 for (int thread = 0; thread < nthread; thread++)
1367 if (grid->bSimple)
1369 sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1370 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1371 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1372 nbs->work[thread].sort_work);
1374 else
1376 sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1377 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1378 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1379 nbs->work[thread].sort_work);
1382 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1385 if (grid->bSimple && nbat->XFormat == nbatX8)
1387 combine_bounding_box_pairs(grid, grid->bb);
1390 if (!grid->bSimple)
1392 grid->nsubc_tot = 0;
1393 for (int i = 0; i < grid->nc; i++)
1395 grid->nsubc_tot += grid->nsubc[i];
1399 if (debug)
1401 if (grid->bSimple)
1403 print_bbsizes_simple(debug, grid);
1405 else
1407 fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1408 grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);
1410 print_bbsizes_supersub(debug, grid);
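/* Hedged sketch (not part of the original file): the core of the cell-index
 * setup above is an exclusive prefix sum over the per-column atom counts,
 * rounding each column up to whole cells of na_sc atoms. This sketch omits
 * the nbatX8 even-rounding and the extra slot for moved particles.
 */
static void example_column_cell_offsets(const int *cxy_na, int n_columns,
                                        int na_sc, int *cxy_ind)
{
    cxy_ind[0] = 0;
    for (int i = 0; i < n_columns; i++)
    {
        int ncz = (cxy_na[i] + na_sc - 1)/na_sc;  /* cells needed for this column */

        cxy_ind[i+1] = cxy_ind[i] + ncz;
    }
}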
1415 static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
1416 int natoms)
1418 flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
1419 if (flags->nflag > flags->flag_nalloc)
1421 flags->flag_nalloc = over_alloc_large(flags->nflag);
1422 srenew(flags->flag, flags->flag_nalloc);
1424 for (int b = 0; b < flags->nflag; b++)
1426 bitmask_clear(&(flags->flag[b]));
1430 /* Sets up a grid and puts the atoms on the grid.
1431 * This function only operates on one domain of the domain decomposition.
1432 * Note that without domain decomposition there is only one domain.
1434 void nbnxn_put_on_grid(nbnxn_search_t nbs,
1435 int ePBC, matrix box,
1436 int dd_zone,
1437 rvec corner0, rvec corner1,
1438 int a0, int a1,
1439 real atom_density,
1440 const int *atinfo,
1441 rvec *x,
1442 int nmoved, int *move,
1443 int nb_kernel_type,
1444 nbnxn_atomdata_t *nbat)
1446 nbnxn_grid_t *grid;
1447 int n;
1448 int nc_max_grid, nc_max;
1450 grid = &nbs->grid[dd_zone];
1452 nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1454 grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1456 grid->na_c = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
1457 grid->na_cj = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
1458 grid->na_sc = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
1459 grid->na_c_2log = get_2log(grid->na_c);
1461 nbat->na_c = grid->na_c;
1463 if (dd_zone == 0)
1465 grid->cell0 = 0;
1467 else
1469 grid->cell0 =
1470 (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
1471 nbs->grid[dd_zone-1].na_sc/grid->na_sc;
1474 n = a1 - a0;
1476 if (dd_zone == 0)
1478 nbs->ePBC = ePBC;
1479 copy_mat(box, nbs->box);
1481 /* Avoid zero density */
1482 if (atom_density > 0)
1484 grid->atom_density = atom_density;
1486 else
1488 grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);
1491 grid->cell0 = 0;
1493 nbs->natoms_local = a1 - nmoved;
1494 /* We assume that nbnxn_put_on_grid is called first
1495 * for the local atoms (dd_zone=0).
1497 nbs->natoms_nonlocal = a1 - nmoved;
1499 if (debug)
1501 fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
1502 nbs->natoms_local, grid->atom_density);
1505 else
1507 nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, a1);
1510 /* We always use the home zone (grid[0]) for setting the cell size,
1511 * since determining densities for non-local zones is difficult.
1513 nc_max_grid = set_grid_size_xy(nbs, grid,
1514 dd_zone, n-nmoved, corner0, corner1,
1515 nbs->grid[0].atom_density);
1517 nc_max = grid->cell0 + nc_max_grid;
1519 if (a1 > nbs->cell_nalloc)
1521 nbs->cell_nalloc = over_alloc_large(a1);
1522 srenew(nbs->cell, nbs->cell_nalloc);
1525 /* To avoid conditionals we store the moved particles at the end of a;
1526 * make sure we have enough space.
1528 if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
1530 nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
1531 srenew(nbs->a, nbs->a_nalloc);
1534 /* We need padding up to a multiple of the buffer flag size: simply add */
1535 if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
1537 nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
1540 calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);
1542 if (dd_zone == 0)
1544 nbat->natoms_local = nbat->natoms;
1547 nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1550 /* Calls nbnxn_put_on_grid for all non-local domains */
1551 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
1552 const struct gmx_domdec_zones_t *zones,
1553 const int *atinfo,
1554 rvec *x,
1555 int nb_kernel_type,
1556 nbnxn_atomdata_t *nbat)
1558 rvec c0, c1;
1560 for (int zone = 1; zone < zones->n; zone++)
1562 for (int d = 0; d < DIM; d++)
1564 c0[d] = zones->size[zone].bb_x0[d];
1565 c1[d] = zones->size[zone].bb_x1[d];
1568 nbnxn_put_on_grid(nbs, nbs->ePBC, NULL,
1569 zone, c0, c1,
1570 zones->cg_range[zone],
1571 zones->cg_range[zone+1],
1573 atinfo,
1575 0, NULL,
1576 nb_kernel_type,
1577 nbat);
1581 /* Add simple grid type information to the local super/sub grid */
1582 void nbnxn_grid_add_simple(nbnxn_search_t nbs,
1583 nbnxn_atomdata_t *nbat)
1585 nbnxn_grid_t *grid;
1586 float *bbcz;
1587 nbnxn_bb_t *bb;
1588 int ncd;
1590 grid = &nbs->grid[0];
1592 if (grid->bSimple)
1594 gmx_incons("nbnxn_grid_simple called with a simple grid");
1597 ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
1599 if (grid->nc*ncd > grid->nc_nalloc_simple)
1601 grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
1602 srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
1603 srenew(grid->bb_simple, grid->nc_nalloc_simple);
1604 srenew(grid->flags_simple, grid->nc_nalloc_simple);
1605 if (nbat->XFormat)
1607 sfree_aligned(grid->bbj);
1608 snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);
1612 bbcz = grid->bbcz_simple;
1613 bb = grid->bb_simple;
1615 #if (defined GMX_OPENMP) && !(defined __clang_analyzer__)
1616 // cppcheck-suppress unreadVariable
1617 int nthreads = gmx_omp_nthreads_get(emntPairsearch);
1618 #endif
1620 #pragma omp parallel for num_threads(nthreads) schedule(static)
1621 for (int sc = 0; sc < grid->nc; sc++)
1625 for (int c = 0; c < ncd; c++)
1627 int tx = sc*ncd + c;
1628 int na = NBNXN_CPU_CLUSTER_I_SIZE;
1629 while (na > 0 &&
1630 nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
1632 na--;
1635 if (na > 0)
1637 switch (nbat->XFormat)
1639 case nbatX4:
1640 /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
1641 calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
1642 bb+tx);
1643 break;
1644 case nbatX8:
1645 /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
1646 calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
1647 bb+tx);
1648 break;
1649 default:
1650 calc_bounding_box(na, nbat->xstride,
1651 nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
1652 bb+tx);
1653 break;
1655 bbcz[tx*NNBSBB_D+0] = bb[tx].lower[BB_Z];
1656 bbcz[tx*NNBSBB_D+1] = bb[tx].upper[BB_Z];
1658 /* No interaction optimization yet here */
1659 grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
1661 else
1663 grid->flags_simple[tx] = 0;
1667 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1670 if (grid->bSimple && nbat->XFormat == nbatX8)
1672 combine_bounding_box_pairs(grid, grid->bb_simple);
1676 void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
1678 *ncx = nbs->grid[0].ncx;
1679 *ncy = nbs->grid[0].ncy;
1682 void nbnxn_get_atomorder(const nbnxn_search_t nbs, const int **a, int *n)
1684 const nbnxn_grid_t *grid;
1686 grid = &nbs->grid[0];
1688 /* Return the atom order for the home cell (index 0) */
1689 *a = nbs->a;
1691 *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
1694 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1696 /* Set the atom order for the home cell (index 0) */
1697 nbnxn_grid_t *grid = &nbs->grid[0];
1699 int ao = 0;
1700 for (int cx = 0; cx < grid->ncx; cx++)
1702 for (int cy = 0; cy < grid->ncy; cy++)
1704 int cxy = cx*grid->ncy + cy;
1705 int j = grid->cxy_ind[cxy]*grid->na_sc;
1706 for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)
1708 nbs->a[j] = ao;
1709 nbs->cell[ao] = j;
1710 ao++;
1711 j++;