/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "nbnxn_grid.h"

#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/gmxlib/gmx_omp_nthreads.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_internal.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/nbnxn_simd.h"
#include "gromacs/mdlib/nbnxn_util.h"
#include "gromacs/simd/simd.h"
#include "gromacs/simd/vector_operations.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/smalloc.h"
struct gmx_domdec_zones_t;

static void nbnxn_grid_init(nbnxn_grid_t *grid)
void nbnxn_grids_init(nbnxn_search_t nbs, int ngrid)
    snew(nbs->grid, nbs->ngrid);
    for (g = 0; g < nbs->ngrid; g++)
        nbnxn_grid_init(&nbs->grid[g]);
static real grid_atom_density(int n, rvec corner0, rvec corner1)
    /* To avoid zero density we use a minimum of 1 atom */
    rvec_sub(corner1, corner0, size);

    return n/(size[XX]*size[YY]*size[ZZ]);
static int set_grid_size_xy(const nbnxn_search_t nbs,
                            int n, rvec corner0, rvec corner1,
    real tlen, tlen_x, tlen_y;

    rvec_sub(corner1, corner0, size);

    assert(atom_density > 0);
    /* target cell length */
    /* To minimize the zero interactions, we should make
     * the largest of the i/j cell cubic.
     */
    na_c = std::max(grid->na_c, grid->na_cj);

    /* Approximately cubic cells */
    tlen   = std::pow(static_cast<real>(na_c/atom_density), static_cast<real>(1.0/3.0));

    /* Approximately cubic sub cells */
    tlen   = std::pow(static_cast<real>(grid->na_c/atom_density), static_cast<real>(1.0/3.0));
    tlen_x = tlen*GPU_NSUBCELL_X;
    tlen_y = tlen*GPU_NSUBCELL_Y;
    /* We round ncx and ncy down, because we get fewer cell pairs
     * in the pair list when the fixed cell dimensions (x,y) are
     * larger than the variable one (z) than the other way around.
     */
    grid->ncx = std::max(1, static_cast<int>(size[XX]/tlen_x));
    grid->ncy = std::max(1, static_cast<int>(size[YY]/tlen_y));

    grid->sx     = size[XX]/grid->ncx;
    grid->sy     = size[YY]/grid->ncy;
    grid->inv_sx = 1/grid->sx;
    grid->inv_sy = 1/grid->sy;
        /* This is a non-home zone, add an extra row of cells
         * for particles communicated for bonded interactions.
         * These can be beyond the cut-off. It doesn't matter where
         * they end up on the grid, but for performance it's better
         * if they don't end up in cells that can be within cut-off range.
         */

    /* We need one additional cell entry for particles moved by DD */
    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
        srenew(grid->cxy_na, grid->cxy_nalloc);
        srenew(grid->cxy_ind, grid->cxy_nalloc+1);

    for (int t = 0; t < nbs->nthread_max; t++)
        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
            srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
    /* Worst case scenario of 1 atom in each last cell */
    if (grid->na_cj <= grid->na_c)
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;

    if (nc_max > grid->nc_nalloc)
        grid->nc_nalloc = over_alloc_large(nc_max);
        srenew(grid->nsubc, grid->nc_nalloc);
        srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
        sfree_aligned(grid->bb);
        /* This snew also zeros the contents, which avoids possible
         * floating exceptions in SIMD with the unused bb elements.
         */
        snew_aligned(grid->bb, grid->nc_nalloc, 16);

            pbb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
            snew_aligned(grid->pbb, pbb_nalloc, 16);

            snew_aligned(grid->bb, grid->nc_nalloc*GPU_NSUBCELL, 16);

        if (grid->na_cj == grid->na_c)
            grid->bbj = grid->bb;
            sfree_aligned(grid->bbj);
            snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);

        srenew(grid->flags, grid->nc_nalloc);
        srenew(grid->fep, grid->nc_nalloc*grid->na_sc/grid->na_c);

    copy_rvec(corner0, grid->c0);
    copy_rvec(corner1, grid->c1);
    copy_rvec(size, grid->size);
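/* Editorial note (illustrative, not part of the original file): the target
 * cell length used above is just the edge of a cube that holds one cluster
 * of na_c atoms at the given density, tlen = (na_c/atom_density)^(1/3).
 * A minimal numeric sketch, assuming a cluster of 4 atoms and a water-like
 * density of 100 atoms/nm^3 in a box with a 5 nm edge:
 *
 *   double na_c         = 4.0;
 *   double atom_density = 100.0;                        // atoms per nm^3
 *   double tlen         = std::cbrt(na_c/atom_density); // ~0.34 nm
 *   int    ncx          = std::max(1, static_cast<int>(5.0/tlen)); // ~14 cells
 */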
/* We need to sort particles in grid columns on the z-coordinate.
 * As particles are very often distributed homogeneously, we use a sorting
 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 * by a factor, cast to an int and try to store in that hole. If the hole
 * is full, we move this or another particle. A second pass is needed to make
 * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 * 4 is the optimal value for homogeneous particle distribution and allows
 * for an O(#particles) sort up to distributions where all particles
 * are concentrated in 1/4 of the space. No NlogN fallback is implemented,
 * as it can be expensive to detect inhomogeneous particle distributions.
 * SGSF is the maximum ratio of holes used, in the worst case all particles
 * end up in the last hole and we need #particles extra holes at the end.
 */
#define SORT_GRID_OVERSIZE 4
#define SGSF (SORT_GRID_OVERSIZE + 1)
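/* Illustrative sketch (editorial addition, not part of the original file):
 * the hole-based sort described above, reduced to a plain std::vector of
 * indices sorted on a z array. The helper name and signature are
 * hypothetical; the real sort_atoms() below additionally handles backwards
 * ordering, DD zones and out-of-range particles. Assumes <vector> is
 * available and h0 <= z < h0 + 1/invh for all entries.
 */
#if 0
static void pigeonhole_sort_sketch(std::vector<int> &a, const std::vector<float> &z,
                                   float h0, float invh, int n_per_h)
{
    /* One hole per 1/(n_per_h*SORT_GRID_OVERSIZE) of the range, plus n spill slots */
    std::vector<int> holes(n_per_h*SORT_GRID_OVERSIZE + a.size(), -1);
    float            scale = invh*n_per_h*SORT_GRID_OVERSIZE;

    for (int ai : a)
    {
        int hole = static_cast<int>((z[ai] - h0)*scale);
        /* Skip occupants with smaller or equal z so each run of holes stays ordered */
        while (holes[hole] >= 0 && z[ai] >= z[holes[hole]])
        {
            hole++;
        }
        if (holes[hole] >= 0)
        {
            /* Shift the rest of the run up one slot to make room */
            int end = hole;
            while (holes[end] >= 0)
            {
                end++;
            }
            for (int k = end; k > hole; k--)
            {
                holes[k] = holes[k-1];
            }
        }
        holes[hole] = ai;
    }

    /* Second pass: compact the occupied holes back into a, now ordered in z */
    a.clear();
    for (int ai : holes)
    {
        if (ai >= 0)
        {
            a.push_back(ai);
        }
    }
}
#endif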
/* Sort particle index a on coordinates x along dim.
 * Backwards tells if we want decreasing instead of increasing coordinates.
 * h0 is the minimum of the coordinate range.
 * invh is the 1/length of the sorting range.
 * n_per_h (>=n) is the expected average number of particles per 1/invh
 * sort is the sorting work array.
 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
 * or easier, allocate at least n*SGSF elements.
 */
static void sort_atoms(int dim, gmx_bool Backwards,
                       int gmx_unused dd_zone,
                       int *a, int n, rvec *x,
                       real h0, real invh, int n_per_h,
        gmx_incons("n > n_per_h");
    /* Transform the inverse range height into the inverse hole height */
    invh *= n_per_h*SORT_GRID_OVERSIZE;

    /* Set nsort to the maximum possible number of holes used.
     * In worst case all n elements end up in the last bin.
     */
    int nsort = n_per_h*SORT_GRID_OVERSIZE + n;

    /* Determine the index range used, so we can limit it for the second pass */
    int zi_min = INT_MAX;

    /* Sort the particles using a simple index sort */
    for (int i = 0; i < n; i++)
        /* The cast takes care of floating-point rounding effects below zero.
         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
         * times the box height out of the box.
         */
        int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);

        /* As we can have rounding effects, we use > instead of >= here */
        if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
                      n_per_h, SORT_GRID_OVERSIZE);

            /* In a non-local domain, particles communicated for bonded interactions
             * can be far beyond the grid size, which is set by the non-bonded
             * cut-off distance. We sort such particles into the last cell.
             */
            if (zi > n_per_h*SORT_GRID_OVERSIZE)
                zi = n_per_h*SORT_GRID_OVERSIZE;
        /* Ideally this particle should go in sort cell zi,
         * but that might already be in use,
         * in that case find the first empty cell higher up
         */
            zi_min = std::min(zi_min, zi);
            zi_max = std::max(zi_max, zi);

            /* We have multiple atoms in the same sorting slot.
             * Sort on real z for minimal bounding box size.
             * There is an extra check for identical z to ensure
             * well-defined output order, independent of input order
             * to ensure binary reproducibility after restarts.
             */
            while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
                                     (x[a[i]][dim] == x[sort[zi]][dim] &&

                /* Shift all elements by one slot until we find an empty slot */
                while (sort[zim] >= 0)
                zi_max = std::max(zi_max, zim);
            zi_max = std::max(zi_max, zi);

    for (int zi = 0; zi < nsort; zi++)

        for (int zi = zi_max; zi >= zi_min; zi--)

        gmx_incons("Lost particles while sorting");
#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
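/* Editorial note (illustrative, not part of the original file): R2F_D/R2F_U
 * convert a real to float while nudging the value down/up by one relative
 * epsilon, so bounding-box lower/upper corners built from them always
 * enclose the original (possibly double precision) coordinates. A minimal
 * sketch of the same idea with plain <cfloat> constants:
 *
 *   float lo = (xl >= 0 ? (1 - FLT_EPSILON)*xl : (1 + FLT_EPSILON)*xl);  // lo <= xl
 *   float hi = (xh >= 0 ? (1 + FLT_EPSILON)*xh : (1 - FLT_EPSILON)*xh);  // hi >= xh
 */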
/* Coordinate order x,y,z, bb order xyz0 */
static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
    real xl, xh, yl, yh, zl, zh;

    for (int j = 1; j < na; j++)
        xl = std::min(xl, x[i+XX]);
        xh = std::max(xh, x[i+XX]);
        yl = std::min(yl, x[i+YY]);
        yh = std::max(yh, x[i+YY]);
        zl = std::min(zl, x[i+ZZ]);
        zh = std::max(zh, x[i+ZZ]);

    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
    real xl, xh, yl, yh, zl, zh;

    for (int j = 1; j < na; j++)
        xl = std::min(xl, x[j+XX*PACK_X4]);
        xh = std::max(xh, x[j+XX*PACK_X4]);
        yl = std::min(yl, x[j+YY*PACK_X4]);
        yh = std::max(yh, x[j+YY*PACK_X4]);
        zl = std::min(zl, x[j+ZZ*PACK_X4]);
        zh = std::max(zh, x[j+ZZ*PACK_X4]);

    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
    real xl, xh, yl, yh, zl, zh;

    for (int j = 1; j < na; j++)
        xl = std::min(xl, x[j+XX*PACK_X8]);
        xh = std::max(xh, x[j+XX*PACK_X8]);
        yl = std::min(yl, x[j+YY*PACK_X8]);
        yh = std::max(yh, x[j+YY*PACK_X8]);
        zl = std::min(zl, x[j+ZZ*PACK_X8]);
        zh = std::max(zh, x[j+ZZ*PACK_X8]);

    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na, const real *x,
                                          nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
    calc_bounding_box_x_x4(std::min(na, 2), x, bbj);

        calc_bounding_box_x_x4(std::min(na-2, 2), x+(PACK_X4>>1), bbj+1);

        /* Set the "empty" bounding box to the same as the first one,
         * so we don't need to treat special cases in the rest of the code.
         */
#ifdef NBNXN_SEARCH_BB_SIMD4
        gmx_simd4_store_f(&bbj[1].lower[0], gmx_simd4_load_f(&bbj[0].lower[0]));
        gmx_simd4_store_f(&bbj[1].upper[0], gmx_simd4_load_f(&bbj[0].upper[0]));

#ifdef NBNXN_SEARCH_BB_SIMD4
    gmx_simd4_store_f(&bb->lower[0],
                      gmx_simd4_min_f(gmx_simd4_load_f(&bbj[0].lower[0]),
                                      gmx_simd4_load_f(&bbj[1].lower[0])));
    gmx_simd4_store_f(&bb->upper[0],
                      gmx_simd4_max_f(gmx_simd4_load_f(&bbj[0].upper[0]),
                                      gmx_simd4_load_f(&bbj[1].upper[0])));

    for (i = 0; i < NNBSBB_C; i++)
        bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
        bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
#ifdef NBNXN_SEARCH_BB_SIMD4

/* Coordinate order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
    real xl, xh, yl, yh, zl, zh;

    for (int j = 1; j < na; j++)
        xl = std::min(xl, x[i+XX]);
        xh = std::max(xh, x[i+XX]);
        yl = std::min(yl, x[i+YY]);
        yh = std::max(yh, x[i+YY]);
        zl = std::min(zl, x[i+ZZ]);
        zh = std::max(zh, x[i+ZZ]);

    /* Note: possible double to float conversion here */
    bb[0*STRIDE_PBB] = R2F_D(xl);
    bb[1*STRIDE_PBB] = R2F_D(yl);
    bb[2*STRIDE_PBB] = R2F_D(zl);
    bb[3*STRIDE_PBB] = R2F_U(xh);
    bb[4*STRIDE_PBB] = R2F_U(yh);
    bb[5*STRIDE_PBB] = R2F_U(zh);

#endif /* NBNXN_SEARCH_BB_SIMD4 */
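/* Editorial note (illustrative, not part of the original file): in the packed
 * "xxxxyyyyzzzz" layout produced above, STRIDE_PBB bounding boxes are stored
 * interleaved so a SIMD4 load reads the same component of consecutive boxes.
 * For box i within a packed group starting at float *p, component d
 * (0..2 = lower x,y,z; 3..5 = upper x,y,z) is found as:
 *
 *   float component = p[d*STRIDE_PBB + i];
 *
 * which matches the indexing in calc_bounding_box_xxxx() above and in
 * print_bbsizes_supersub() below.
 */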
#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB

/* Coordinate order xyz?, bb order xyz0 */
static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
    gmx_simd4_float_t bb_0_S, bb_1_S;
    gmx_simd4_float_t x_S;

    bb_0_S = gmx_simd4_load_f(x);

    for (int i = 1; i < na; i++)
        x_S    = gmx_simd4_load_f(x+i*NNBSBB_C);
        bb_0_S = gmx_simd4_min_f(bb_0_S, x_S);
        bb_1_S = gmx_simd4_max_f(bb_1_S, x_S);

    gmx_simd4_store_f(&bb->lower[0], bb_0_S);
    gmx_simd4_store_f(&bb->upper[0], bb_1_S);

/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_simd4(int na, const float *x,
                                         nbnxn_bb_t *bb_work_aligned,
    calc_bounding_box_simd4(na, x, bb_work_aligned);

    bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
    bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
    bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
    bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
    bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
    bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];

#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
    for (int i = 0; i < grid->ncx*grid->ncy; i++)
        /* Starting bb in a column is expected to be 2-aligned */
        int sc2 = grid->cxy_ind[i]>>1;
        /* For odd numbers skip the last bb here */
        int nc2 = (grid->cxy_na[i]+3)>>(2+1);

        for (c2 = sc2; c2 < sc2+nc2; c2++)
#ifdef NBNXN_SEARCH_BB_SIMD4
            gmx_simd4_float_t min_S, max_S;

            min_S = gmx_simd4_min_f(gmx_simd4_load_f(&bb[c2*2+0].lower[0]),
                                    gmx_simd4_load_f(&bb[c2*2+1].lower[0]));
            max_S = gmx_simd4_max_f(gmx_simd4_load_f(&bb[c2*2+0].upper[0]),
                                    gmx_simd4_load_f(&bb[c2*2+1].upper[0]));
            gmx_simd4_store_f(&grid->bbj[c2].lower[0], min_S);
            gmx_simd4_store_f(&grid->bbj[c2].upper[0], max_S);

            for (int j = 0; j < NNBSBB_C; j++)
                grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
                                                  bb[c2*2+1].lower[j]);
                grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
                                                  bb[c2*2+1].upper[j]);

        if (((grid->cxy_na[i]+3)>>2) & 1)
            /* The bb count in this column is odd: duplicate the last bb */
            for (int j = 0; j < NNBSBB_C; j++)
                grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
                grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
/* Prints the average bb size, used for debug output */
static void print_bbsizes_simple(FILE               *fp,
                                 const nbnxn_grid_t *grid)
    for (int c = 0; c < grid->nc; c++)
        for (int d = 0; d < DIM; d++)
            ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];

    dsvmul(1.0/grid->nc, ba, ba);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            grid->atom_density > 0 ?
            grid->na_c/(grid->atom_density*grid->sx*grid->sy) : 0.0,
            ba[XX], ba[YY], ba[ZZ],
            grid->atom_density > 0 ?
            ba[ZZ]/(grid->na_c/(grid->atom_density*grid->sx*grid->sy)) : 0.0);
/* Prints the average bb size, used for debug output */
static void print_bbsizes_supersub(FILE               *fp,
                                   const nbnxn_grid_t *grid)
    for (int c = 0; c < grid->nc; c++)
        for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
            int cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
            for (int i = 0; i < STRIDE_PBB; i++)
                for (int d = 0; d < DIM; d++)
                        grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
                        grid->pbb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];

        for (int s = 0; s < grid->nsubc[c]; s++)
            int cs = c*GPU_NSUBCELL + s;
            for (int d = 0; d < DIM; d++)
                ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];

        ns += grid->nsubc[c];

    dsvmul(1.0/ns, ba, ba);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            grid->sx/GPU_NSUBCELL_X,
            grid->sy/GPU_NSUBCELL_Y,
            grid->atom_density > 0 ?
            grid->na_sc/(grid->atom_density*grid->sx*grid->sy*GPU_NSUBCELL_Z) : 0.0,
            ba[XX], ba[YY], ba[ZZ],
            ba[XX]*GPU_NSUBCELL_X/grid->sx,
            ba[YY]*GPU_NSUBCELL_Y/grid->sy,
            grid->atom_density > 0 ?
            ba[ZZ]/(grid->na_sc/(grid->atom_density*grid->sx*grid->sy*GPU_NSUBCELL_Z)) : 0.0);
/* Set non-bonded interaction flags for the current cluster.
 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 */
static void sort_cluster_on_flag(int na_c,
                                 int a0, int a1, const int *atinfo,
    int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
    int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];

    for (int s = a0; s < a1; s += na_c)
        /* Make lists for this (sub-)cell on atoms with and without LJ */
        gmx_bool haveQ = FALSE;

        for (int a = s; a < std::min(s+na_c, a1); a++)
            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);

            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
                sort1[n1++] = order[a];
                sort2[n2++] = order[a];

        /* If we don't have atoms with LJ, there's nothing to sort */
            *flags |= NBNXN_CI_DO_LJ(subc);

                /* Only sort when strictly necessary. Ordering particles
                 * can lead to less accurate summation due to rounding,
                 * both for LJ and Coulomb interactions.
                 */
                if (2*(a_lj_max - s) >= na_c)
                    for (int i = 0; i < n1; i++)
                        order[a0+i] = sort1[i];
                    for (int j = 0; j < n2; j++)
                        order[a0+n1+j] = sort2[j];

                    *flags |= NBNXN_CI_HALF_LJ(subc);
            *flags |= NBNXN_CI_DO_COUL(subc);
/* Fill a pair search cell with atoms.
 * Potentially sorts atoms and sets the interaction flags.
 */
static void fill_cell(const nbnxn_search_t nbs,
                      nbnxn_atomdata_t *nbat,
                      int sx, int sy, int sz,
                      nbnxn_bb_t gmx_unused *bb_work_aligned)
        /* Note that non-local grids are already sorted.
         * Then sort_cluster_on_flag will only set the flags and the sorting
         * will not affect the atom order.
         */
        sort_cluster_on_flag(grid->na_c, a0, a1, atinfo, nbs->a,
                             grid->flags+(a0>>grid->na_c_2log)-grid->cell0);

        /* Set the fep flag for perturbed atoms in this (sub-)cell */

        /* The grid-local cluster/(sub-)cell index */
        c = (a0 >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : GPU_NSUBCELL);
        for (int at = a0; at < a1; at++)
            if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
                grid->fep[c] |= (1 << (at - a0));

    /* Now we have sorted the atoms, set the cell indices */
    for (a = a0; a < a1; a++)
        nbs->cell[nbs->a[a]] = a;

    copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
                           nbat->XFormat, nbat->x, a0,

    if (nbat->XFormat == nbatX4)
        /* Store the bounding boxes as xyz.xyz. */
        offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        bb_ptr = grid->bb + offset;

#if defined GMX_NBNXN_SIMD && GMX_SIMD_REAL_WIDTH == 2
        if (2*grid->na_cj == grid->na_c)
            calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
            calc_bounding_box_x_x4(na, nbat->x+X4_IND_A(a0), bb_ptr);
    else if (nbat->XFormat == nbatX8)
        /* Store the bounding boxes as xyz.xyz. */
        offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        bb_ptr = grid->bb + offset;

        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);
    else if (!grid->bSimple)
        /* Store the bounding boxes in a format convenient
         * for SIMD4 calculations: xxxxyyyyzzzz...
         */
            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));

#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
        if (nbat->XFormat == nbatXYZQ)
            calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride,
                                         bb_work_aligned, pbb_ptr);
            calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,

            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
                    pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
                    pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);

        /* Store the bounding boxes as xyz.xyz. */
        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log);

        calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,

            bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    grid->bb[bbo].lower[BB_X],
                    grid->bb[bbo].lower[BB_Y],
                    grid->bb[bbo].lower[BB_Z],
                    grid->bb[bbo].upper[BB_X],
                    grid->bb[bbo].upper[BB_Y],
                    grid->bb[bbo].upper[BB_Z]);
/* Spatially sort the atoms within one grid column */
static void sort_columns_simple(const nbnxn_search_t nbs,
                                nbnxn_atomdata_t *nbat,
                                int cxy_start, int cxy_end,
        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, a0, a1);

    /* Sort the atoms within each x,y column in 3 dimensions */
    for (int cxy = cxy_start; cxy < cxy_end; cxy++)
        int cx = cxy/grid->ncy;
        int cy = cxy - cx*grid->ncy;

        int na  = grid->cxy_na[cxy];
        int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   1.0/grid->size[ZZ], ncz*grid->na_sc,

        /* Fill the ncz cells in this column */
        cfilled = grid->cxy_ind[cxy];
        for (int cz = 0; cz < ncz; cz++)
            c = grid->cxy_ind[cxy] + cz;

            int ash_c = ash + cz*grid->na_sc;
            int na_c  = std::min(grid->na_sc, na-(ash_c-ash));

            fill_cell(nbs, grid, nbat,
                      ash_c, ash_c+na_c, atinfo, x,
                      grid->na_sc*cx + (dd_zone >> 2),
                      grid->na_sc*cy + (dd_zone & 3),

            /* This copy to bbcz is not really necessary.
             * But it allows us to use the same grid search code
             * for the simple and supersub cell setups.
             */
            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];

        /* Set the unused atom indices to -1 */
        for (int ind = na; ind < ncz*grid->na_sc; ind++)
            nbs->a[ash+ind] = -1;
/* Spatially sort the atoms within one grid column */
static void sort_columns_supersub(const nbnxn_search_t nbs,
                                  nbnxn_atomdata_t *nbat,
                                  int cxy_start, int cxy_end,
    nbnxn_bb_t bb_work_array[2], *bb_work_aligned;

    bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array+1)) & (~(static_cast<std::size_t>(15))));

        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, a0, a1);

    int subdiv_x = grid->na_c;
    int subdiv_y = GPU_NSUBCELL_X*subdiv_x;
    int subdiv_z = GPU_NSUBCELL_Y*subdiv_y;

    /* Sort the atoms within each x,y column in 3 dimensions */
    for (int cxy = cxy_start; cxy < cxy_end; cxy++)
        int cx = cxy/grid->ncy;
        int cy = cxy - cx*grid->ncy;

        int na  = grid->cxy_na[cxy];
        int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   1.0/grid->size[ZZ], ncz*grid->na_sc,

        /* This loop goes over the supercells and subcells along z at once */
        for (int sub_z = 0; sub_z < ncz*GPU_NSUBCELL_Z; sub_z++)
            int ash_z = ash + sub_z*subdiv_z;
            int na_z  = std::min(subdiv_z, na-(ash_z-ash));

            /* We have already sorted on z */

            if (sub_z % GPU_NSUBCELL_Z == 0)
                cz = sub_z/GPU_NSUBCELL_Z;
                int c  = grid->cxy_ind[cxy] + cz;

                /* The number of atoms in this supercell */
                int na_c = std::min(grid->na_sc, na-(ash_z-ash));

                grid->nsubc[c] = std::min(GPU_NSUBCELL, (na_c+grid->na_c-1)/grid->na_c);

                /* Store the z-boundaries of the super cell */
                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];

#if GPU_NSUBCELL_Y > 1
            /* Sort the atoms along y */
            sort_atoms(YY, (sub_z & 1), dd_zone,
                       nbs->a+ash_z, na_z, x,
                       grid->c0[YY]+cy*grid->sy,
                       grid->inv_sy, subdiv_z,

            for (int sub_y = 0; sub_y < GPU_NSUBCELL_Y; sub_y++)
                int ash_y = ash_z + sub_y*subdiv_y;
                int na_y  = std::min(subdiv_y, na-(ash_y-ash));

#if GPU_NSUBCELL_X > 1
                /* Sort the atoms along x */
                sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1), dd_zone,
                           nbs->a+ash_y, na_y, x,
                           grid->c0[XX]+cx*grid->sx,
                           grid->inv_sx, subdiv_y,

                for (int sub_x = 0; sub_x < GPU_NSUBCELL_X; sub_x++)
                    int ash_x = ash_y + sub_x*subdiv_x;
                    int na_x  = std::min(subdiv_x, na-(ash_x-ash));

                    fill_cell(nbs, grid, nbat,
                              ash_x, ash_x+na_x, atinfo, x,
                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),

        /* Set the unused atom indices to -1 */
        for (int ind = na; ind < ncz*grid->na_sc; ind++)
            nbs->a[ash+ind] = -1;
/* Determine in which grid column atoms should go */
static void calc_column_indices(nbnxn_grid_t *grid,
                                int dd_zone, const int *move,
                                int thread, int nthread,
    /* We add one extra cell for particles which moved during DD */
    for (int i = 0; i < grid->ncx*grid->ncy+1; i++)

    int n0 = a0 + static_cast<int>((thread+0)*(a1 - a0))/nthread;
    int n1 = a0 + static_cast<int>((thread+1)*(a1 - a0))/nthread;

        for (int i = n0; i < n1; i++)
            if (move == NULL || move[i] >= 0)
                /* We need to be careful with rounding,
                 * particles might be a few bits outside the local zone.
                 * The int cast takes care of the lower bound,
                 * we will explicitly take care of the upper bound.
                 */
                int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
                int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);

                if (cx < 0 || cx > grid->ncx ||
                    cy < 0 || cy > grid->ncy)
                              "grid cell cx %d cy %d out of range (max %d %d)\n"
                              "atom %f %f %f, grid->c0 %f %f",
                              cx, cy, grid->ncx, grid->ncy,
                              x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);

                /* Take care of potential rounding issues */
                cx = std::min(cx, grid->ncx - 1);
                cy = std::min(cy, grid->ncy - 1);

                /* For the moment cell will contain only the, grid local,
                 * x and y indices, not z.
                 */
                cell[i] = cx*grid->ncy + cy;

                /* Put this moved particle after the end of the grid,
                 * so we can process it later without using conditionals.
                 */
                cell[i] = grid->ncx*grid->ncy;

        for (int i = n0; i < n1; i++)
            int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
            int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);

            /* For non-home zones there could be particles outside
             * the non-bonded cut-off range, which have been communicated
             * for bonded interactions only. For the result it doesn't
             * matter where these end up on the grid. For performance
             * we put them in an extra row at the border.
             */
            cx = std::max(cx, 0);
            cx = std::min(cx, grid->ncx - 1);
            cy = std::max(cy, 0);
            cy = std::min(cy, grid->ncy - 1);

            /* For the moment cell will contain only the, grid local,
             * x and y indices, not z.
             */
            cell[i] = cx*grid->ncy + cy;
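/* Illustrative sketch (editorial addition, not part of the original file):
 * the x,y -> grid-column mapping used above, in isolation. The helper name
 * and parameters are hypothetical; the real code additionally routes moved
 * particles to an extra cell and treats non-home zones separately.
 */
#if 0
static int column_index_sketch(real x, real y,
                               real c0x, real c0y,       /* grid corner  */
                               real inv_sx, real inv_sy, /* 1/cell size  */
                               int ncx, int ncy)
{
    int cx = static_cast<int>((x - c0x)*inv_sx);
    int cy = static_cast<int>((y - c0y)*inv_sy);

    /* Clamp against rounding just outside the grid, as done above */
    cx = std::max(0, std::min(cx, ncx - 1));
    cy = std::max(0, std::min(cy, ncy - 1));

    /* Columns are laid out with y running fastest */
    return cx*ncy + cy;
}
#endif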
/* Determine in which grid cells the atoms should go */
static void calc_cell_indices(const nbnxn_search_t nbs,
                              nbnxn_atomdata_t *nbat)
    int cx, cy, cxy, ncz_max, ncz;

    nthread = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
            calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
                                nbs->cell, nbs->work[thread].cxy_na);
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;

    /* Make the cell index as a function of x and y */
    grid->cxy_ind[0] = 0;
    for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
        /* We set ncz_max at the beginning of the loop instead of at the end
         * to skip i=grid->ncx*grid->ncy which are moved particles
         * that do not need to be ordered on the grid.
         */
        cxy_na_i = nbs->work[0].cxy_na[i];
        for (int thread = 1; thread < nthread; thread++)
            cxy_na_i += nbs->work[thread].cxy_na[i];

        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
        if (nbat->XFormat == nbatX8)
            /* Make the number of cells a multiple of 2 */
            ncz = (ncz + 1) & ~1;
        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
        /* Clear cxy_na, so we can reuse the array below */
        grid->cxy_na[i] = 0;

    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];

    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;

        fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
                grid->na_sc, grid->na_c, grid->nc,
                grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),

        for (cy = 0; cy < grid->ncy; cy++)
            for (cx = 0; cx < grid->ncx; cx++)
                fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
            fprintf(debug, "\n");
    /* Make sure the work array for sorting is large enough */
    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
        for (int thread = 0; thread < nbs->nthread_max; thread++)
            nbs->work[thread].sort_work_nalloc =
                over_alloc_large(ncz_max*grid->na_sc*SGSF);
            srenew(nbs->work[thread].sort_work,
                   nbs->work[thread].sort_work_nalloc);
            /* When not in use, all elements should be -1 */
            for (int i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
                nbs->work[thread].sort_work[i] = -1;

    /* Now we know the dimensions we can fill the grid.
     * This is the first, unsorted fill. We sort the columns after this.
     */
    for (int i = a0; i < a1; i++)
        /* At this point nbs->cell contains the local grid x,y indices */
        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;

        /* Set the cell indices for the moved particles */
        n0 = grid->nc*grid->na_sc;
        n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
            for (int i = n0; i < n1; i++)
                nbs->cell[nbs->a[i]] = i;

    /* Sort the super-cell columns along z into the sub-cells. */
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
                sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
                                    ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                    ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                    nbs->work[thread].sort_work);
                sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
                                      ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                      ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                      nbs->work[thread].sort_work);
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;

    if (grid->bSimple && nbat->XFormat == nbatX8)
        combine_bounding_box_pairs(grid, grid->bb);

        grid->nsubc_tot = 0;
        for (int i = 0; i < grid->nc; i++)
            grid->nsubc_tot += grid->nsubc[i];

            print_bbsizes_simple(debug, grid);

            fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
                    grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);

            print_bbsizes_supersub(debug, grid);
static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
    if (flags->nflag > flags->flag_nalloc)
        flags->flag_nalloc = over_alloc_large(flags->nflag);
        srenew(flags->flag, flags->flag_nalloc);
    for (int b = 0; b < flags->nflag; b++)
        bitmask_clear(&(flags->flag[b]));
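/* Editorial note (illustrative, not part of the original file): the flag
 * count above is a ceiling division, one flag per block of
 * NBNXN_BUFFERFLAG_SIZE atoms. For example, assuming a block size of 64:
 *
 *   int natoms = 1000;
 *   int nflag  = (natoms + 64 - 1)/64;   // = 16, the last block only partly used
 */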
/* Sets up a grid and puts the atoms on the grid.
 * This function only operates on one domain of the domain decomposition.
 * Note that without domain decomposition there is only one domain.
 */
void nbnxn_put_on_grid(nbnxn_search_t nbs,
                       int ePBC, matrix box,
                       rvec corner0, rvec corner1,
                       int nmoved, int *move,
                       nbnxn_atomdata_t *nbat)
    int nc_max_grid, nc_max;

    grid = &nbs->grid[dd_zone];

    nbs_cycle_start(&nbs->cc[enbsCCgrid]);

    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);

    grid->na_c      = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
    grid->na_cj     = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
    grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
    grid->na_c_2log = get_2log(grid->na_c);

    nbat->na_c = grid->na_c;

            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
            nbs->grid[dd_zone-1].na_sc/grid->na_sc;

        copy_mat(box, nbs->box);

        /* Avoid zero density */
        if (atom_density > 0)
            grid->atom_density = atom_density;
            grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);

        nbs->natoms_local    = a1 - nmoved;
        /* We assume that nbnxn_put_on_grid is called first
         * for the local atoms (dd_zone=0).
         */
        nbs->natoms_nonlocal = a1 - nmoved;

            fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
                    nbs->natoms_local, grid->atom_density);

        nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, a1);

    /* We always use the home zone (grid[0]) for setting the cell size,
     * since determining densities for non-local zones is difficult.
     */
    nc_max_grid = set_grid_size_xy(nbs, grid,
                                   dd_zone, n-nmoved, corner0, corner1,
                                   nbs->grid[0].atom_density);

    nc_max = grid->cell0 + nc_max_grid;

    if (a1 > nbs->cell_nalloc)
        nbs->cell_nalloc = over_alloc_large(a1);
        srenew(nbs->cell, nbs->cell_nalloc);

    /* To avoid conditionals we store the moved particles at the end of a,
     * make sure we have enough space.
     */
    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
        srenew(nbs->a, nbs->a_nalloc);

    /* We need padding up to a multiple of the buffer flag size: simply add */
    if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
        nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);

    calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);

        nbat->natoms_local = nbat->natoms;

    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
/* Calls nbnxn_put_on_grid for all non-local domains */
void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
                                const struct gmx_domdec_zones_t *zones,
                                nbnxn_atomdata_t *nbat)
    for (int zone = 1; zone < zones->n; zone++)
        for (int d = 0; d < DIM; d++)
            c0[d] = zones->size[zone].bb_x0[d];
            c1[d] = zones->size[zone].bb_x1[d];

        nbnxn_put_on_grid(nbs, nbs->ePBC, NULL,
                          zones->cg_range[zone],
                          zones->cg_range[zone+1],
/* Add simple grid type information to the local super/sub grid */
void nbnxn_grid_add_simple(nbnxn_search_t nbs,
                           nbnxn_atomdata_t *nbat)
    grid = &nbs->grid[0];

        gmx_incons("nbnxn_grid_simple called with a simple grid");

    ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;

    if (grid->nc*ncd > grid->nc_nalloc_simple)
        grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
        srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
        srenew(grid->bb_simple, grid->nc_nalloc_simple);
        srenew(grid->flags_simple, grid->nc_nalloc_simple);

            sfree_aligned(grid->bbj);
            snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);

    bbcz = grid->bbcz_simple;
    bb   = grid->bb_simple;

#if (defined GMX_OPENMP) && !(defined __clang_analyzer__)
    // cppcheck-suppress unreadVariable
    int nthreads = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nthreads) schedule(static)
    for (int sc = 0; sc < grid->nc; sc++)
            for (int c = 0; c < ncd; c++)
                int tx = sc*ncd + c;
                int na = NBNXN_CPU_CLUSTER_I_SIZE;
                       nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)

                    switch (nbat->XFormat)
                            /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
                            calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
                            /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
                            calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
                            calc_bounding_box(na, nbat->xstride,
                                              nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,

                    bbcz[tx*NNBSBB_D+0] = bb[tx].lower[BB_Z];
                    bbcz[tx*NNBSBB_D+1] = bb[tx].upper[BB_Z];

                    /* No interaction optimization yet here */
                    grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
                    grid->flags_simple[tx] = 0;
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;

    if (grid->bSimple && nbat->XFormat == nbatX8)
        combine_bounding_box_pairs(grid, grid->bb_simple);
void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
    *ncx = nbs->grid[0].ncx;
    *ncy = nbs->grid[0].ncy;

void nbnxn_get_atomorder(const nbnxn_search_t nbs, const int **a, int *n)
    const nbnxn_grid_t *grid;

    grid = &nbs->grid[0];

    /* Return the atom order for the home cell (index 0) */
    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;

void nbnxn_set_atomorder(nbnxn_search_t nbs)
    /* Set the atom order for the home cell (index 0) */
    nbnxn_grid_t *grid = &nbs->grid[0];

    for (int cx = 0; cx < grid->ncx; cx++)
        for (int cy = 0; cy < grid->ncy; cy++)
            int cxy = cx*grid->ncy + cy;
            int j   = grid->cxy_ind[cxy]*grid->na_sc;
            for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)