/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/group.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/simd/simd.h"
#include "gromacs/simd/vector_operations.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxomp.h"
#include "gromacs/utility/listoflists.h"
#include "gromacs/utility/smalloc.h"

#include "boundingboxes.h"
#include "clusterdistancekerneltype.h"
#include "nbnxm_geometry.h"
#include "nbnxm_simd.h"
#include "pairlistset.h"
#include "pairlistsets.h"
#include "pairlistwork.h"
#include "pairsearch.h"
using namespace gmx; // TODO: Remove when this file is moved into gmx namespace

using BoundingBox   = Nbnxm::BoundingBox;   // TODO: Remove when refactoring this file
using BoundingBox1D = Nbnxm::BoundingBox1D; // TODO: Remove when refactoring this file

using Grid = Nbnxm::Grid; // TODO: Remove when refactoring this file

// Convenience alias for partial Nbnxn namespace usage
using InteractionLocality = gmx::InteractionLocality;

/* We shift the i-particles backward for PBC.
 * This leads to more conditionals than shifting forward.
 * We do this to get more balanced pair lists.
 */
constexpr bool c_pbcShiftBackward = true;
/* Layout for the nonbonded NxN pair lists */
enum class NbnxnLayout
{
    NoSimd4x4, // i-cluster size 4, j-cluster size 4
    Simd4xN,   // i-cluster size 4, j-cluster size SIMD width
    Simd2xNN,  // i-cluster size 4, j-cluster size half SIMD width
    Gpu8x8x8   // i-cluster size 8, j-cluster size 8 + super-clustering
};

/* Returns the j-cluster size */
template<NbnxnLayout layout>
static constexpr int jClusterSize()
{
    static_assert(layout == NbnxnLayout::NoSimd4x4 || layout == NbnxnLayout::Simd4xN
                          || layout == NbnxnLayout::Simd2xNN,
                  "Currently jClusterSize only supports CPU layouts");

    return layout == NbnxnLayout::Simd4xN
                   ? GMX_SIMD_REAL_WIDTH
                   : (layout == NbnxnLayout::Simd2xNN ? GMX_SIMD_REAL_WIDTH / 2 : c_nbnxnCpuIClusterSize);
}
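
/* Illustrative note (example values only, assuming GMX_SIMD_REAL_WIDTH == 8 and
 * c_nbnxnCpuIClusterSize == 4): Simd4xN then yields a j-cluster of 8 atoms,
 * Simd2xNN a j-cluster of 4 atoms, and NoSimd4x4 falls back to the plain
 * i-cluster size of 4. With a narrower SIMD width these numbers scale accordingly.
 */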
/*! \brief Returns the j-cluster index given the i-cluster index.
 *
 * \tparam    jClusterSize      The number of atoms in a j-cluster
 * \tparam    jSubClusterIndex  The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
 * \param[in] ci                The i-cluster index
 */
template<int jClusterSize, int jSubClusterIndex>
static inline int cjFromCi(int ci)
{
    static_assert(jClusterSize == c_nbnxnCpuIClusterSize / 2 || jClusterSize == c_nbnxnCpuIClusterSize
                          || jClusterSize == c_nbnxnCpuIClusterSize * 2,
                  "Only j-cluster sizes 2, 4 and 8 are currently implemented");

    static_assert(jSubClusterIndex == 0 || jSubClusterIndex == 1,
                  "Only sub-cluster indices 0 and 1 are supported");

    if (jClusterSize == c_nbnxnCpuIClusterSize / 2)
    {
        if (jSubClusterIndex == 0)
        {
            return ci << 1;
        }
        else
        {
            return ((ci + 1) << 1) - 1;
        }
    }
    else if (jClusterSize == c_nbnxnCpuIClusterSize)
    {
        return ci;
    }
    else
    {
        return ci >> 1;
    }
}
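
/* Illustrative note: with a j-cluster of half the i-cluster size, i-cluster ci
 * overlaps two j-clusters, 2*ci (sub-cluster index 0) and 2*ci + 1 (index 1);
 * e.g. ci = 3 maps to j-clusters 6 and 7. With equal sizes cj == ci, and with a
 * double-size j-cluster cj == ci/2.
 */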
/*! \brief Returns the j-cluster index given the i-cluster index.
 *
 * \tparam    layout            The pair-list layout
 * \tparam    jSubClusterIndex  The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
 * \param[in] ci                The i-cluster index
 */
template<NbnxnLayout layout, int jSubClusterIndex>
static inline int cjFromCi(int ci)
{
    constexpr int clusterSize = jClusterSize<layout>();

    return cjFromCi<clusterSize, jSubClusterIndex>(ci);
}
/* Returns the nbnxn coordinate data index given the i-cluster index */
template<NbnxnLayout layout>
static inline int xIndexFromCi(int ci)
{
    constexpr int clusterSize = jClusterSize<layout>();

    static_assert(clusterSize == c_nbnxnCpuIClusterSize / 2 || clusterSize == c_nbnxnCpuIClusterSize
                          || clusterSize == c_nbnxnCpuIClusterSize * 2,
                  "Only j-cluster sizes 2, 4 and 8 are currently implemented");

    if (clusterSize <= c_nbnxnCpuIClusterSize)
    {
        /* Coordinates are stored packed in groups of 4 */
        return ci * STRIDE_P4;
    }
    else
    {
        /* Coordinates packed in 8, i-cluster size is half the packing width */
        return (ci >> 1) * STRIDE_P8 + (ci & 1) * (c_packX8 >> 1);
    }
}
/* Returns the nbnxn coordinate data index given the j-cluster index */
template<NbnxnLayout layout>
static inline int xIndexFromCj(int cj)
{
    constexpr int clusterSize = jClusterSize<layout>();

    static_assert(clusterSize == c_nbnxnCpuIClusterSize / 2 || clusterSize == c_nbnxnCpuIClusterSize
                          || clusterSize == c_nbnxnCpuIClusterSize * 2,
                  "Only j-cluster sizes 2, 4 and 8 are currently implemented");

    if (clusterSize == c_nbnxnCpuIClusterSize / 2)
    {
        /* Coordinates are stored packed in groups of 4 */
        return (cj >> 1) * STRIDE_P4 + (cj & 1) * (c_packX4 >> 1);
    }
    else if (clusterSize == c_nbnxnCpuIClusterSize)
    {
        /* Coordinates are stored packed in groups of 4 */
        return cj * STRIDE_P4;
    }
    else
    {
        /* Coordinates are stored packed in groups of 8 */
        return cj * STRIDE_P8;
    }
}
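
/* Illustrative note (example values only, assuming c_packX4 == 4): coordinates are
 * laid out as packs of 4 (or 8) atoms per dimension, so a 2-atom j-cluster starts
 * either at the beginning or halfway into a pack; e.g. j-cluster 5 starts at
 * (5 >> 1) * STRIDE_P4 + (5 & 1) * 2, i.e. at the second half of the third pack.
 */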
void nbnxn_init_pairlist_fep(t_nblist* nl)
{
    nl->type      = GMX_NBLIST_INTERACTION_FREE_ENERGY;
    nl->igeometry = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
    /* The interaction functions are set in the free energy kernel function */
    nl->jindex   = nullptr;
    nl->excl_fep = nullptr;
}
static void init_buffer_flags(nbnxn_buffer_flags_t* flags, int natoms)
{
    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1) / NBNXN_BUFFERFLAG_SIZE;
    if (flags->nflag > flags->flag_nalloc)
    {
        flags->flag_nalloc = over_alloc_large(flags->nflag);
        srenew(flags->flag, flags->flag_nalloc);
    }
    for (int b = 0; b < flags->nflag; b++)
    {
        bitmask_clear(&(flags->flag[b]));
    }
}
/* Returns the pair-list cutoff between a bounding box and a grid cell given an atom-to-atom pair-list cutoff
 *
 * Given a cutoff distance between atoms, this function returns the cutoff
 * distance between a bounding box of a group of atoms and a grid cell.
 * Since atoms can be geometrically outside of the cell they have been
 * assigned to (when atom groups instead of individual atoms are assigned
 * to cells), the distance returned can be larger than the input.
 */
static real listRangeForBoundingBoxToGridCell(real rlist, const Grid::Dimensions& gridDims)
{
    return rlist + gridDims.maxAtomGroupRadius;
}

/* Returns the pair-list cutoff between two grid cells given an atom-to-atom pair-list cutoff
 *
 * Given a cutoff distance between atoms, this function returns the cutoff
 * distance between two grid cells.
 * Since atoms can be geometrically outside of the cell they have been
 * assigned to (when atom groups instead of individual atoms are assigned
 * to cells), the distance returned can be larger than the input.
 */
static real listRangeForGridCellToGridCell(real                    rlist,
                                           const Grid::Dimensions& iGridDims,
                                           const Grid::Dimensions& jGridDims)
{
    return rlist + iGridDims.maxAtomGroupRadius + jGridDims.maxAtomGroupRadius;
}
/* Determines the cell range along one dimension that
 * the bounding box b0 - b1 sees.
 */
template<int dim>
static void get_cell_range(real b0, real b1, const Grid::Dimensions& jGridDims, real d2, real rlist, int* cf, int* cl)
{
    real listRangeBBToCell2 = gmx::square(listRangeForBoundingBoxToGridCell(rlist, jGridDims));
    real distanceInCells    = (b0 - jGridDims.lowerCorner[dim]) * jGridDims.invCellSize[dim];
    *cf                     = std::max(static_cast<int>(distanceInCells), 0);

    while (*cf > 0
           && d2 + gmx::square((b0 - jGridDims.lowerCorner[dim]) - (*cf - 1 + 1) * jGridDims.cellSize[dim])
                      < listRangeBBToCell2)
    {
        (*cf)--;
    }

    *cl = std::min(static_cast<int>((b1 - jGridDims.lowerCorner[dim]) * jGridDims.invCellSize[dim]),
                   jGridDims.numCells[dim] - 1);
    while (*cl < jGridDims.numCells[dim] - 1
           && d2 + gmx::square((*cl + 1) * jGridDims.cellSize[dim] - (b1 - jGridDims.lowerCorner[dim]))
                      < listRangeBBToCell2)
    {
        (*cl)++;
    }
}
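
/* Illustrative note: *cf and *cl bracket the cell range along dimension dim that
 * can contain atoms within the list range of the box [b0, b1]. For example, with a
 * lower corner at 0, a cell size of 1 and b0 = 2.3, the first guess is cell 2; the
 * range is then extended downwards while the remaining gap, added to d2, is still
 * within the squared list range.
 */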
/* Reference code calculating the distance^2 between two bounding boxes */
static float box_dist2(float bx0, float bx1, float by0, float by1, float bz0, float bz1, const BoundingBox* bb)
{
    float d2 = 0;
    float dl, dh, dm, dm0;

    dl  = bx0 - bb->upper.x;
    dh  = bb->lower.x - bx1;
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0f);
    d2 += dm0 * dm0;

    dl  = by0 - bb->upper.y;
    dh  = bb->lower.y - by1;
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0f);
    d2 += dm0 * dm0;

    dl  = bz0 - bb->upper.z;
    dh  = bb->lower.z - bz1;
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0f);
    d2 += dm0 * dm0;

    return d2;
}
#if !NBNXN_SEARCH_BB_SIMD4

/*! \brief Plain C code calculating the distance^2 between two bounding boxes in xyz0 format
 *
 * \param[in] bb_i  First bounding box
 * \param[in] bb_j  Second bounding box
 */
static float clusterBoundingBoxDistance2(const BoundingBox& bb_i, const BoundingBox& bb_j)
{
    float dl  = bb_i.lower.x - bb_j.upper.x;
    float dh  = bb_j.lower.x - bb_i.upper.x;
    float dm  = std::max(dl, dh);
    float dm0 = std::max(dm, 0.0F);
    float d2  = dm0 * dm0;

    dl  = bb_i.lower.y - bb_j.upper.y;
    dh  = bb_j.lower.y - bb_i.upper.y;
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0F);
    d2 += dm0 * dm0;

    dl  = bb_i.lower.z - bb_j.upper.z;
    dh  = bb_j.lower.z - bb_i.upper.z;
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0F);
    d2 += dm0 * dm0;

    return d2;
}
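
/* Illustrative note: per dimension the box separation is max(lower_i - upper_j,
 * lower_j - upper_i, 0), which is squared and summed; overlapping boxes contribute
 * zero. For example, x-ranges [0,1] and [3,5] give max(0-5, 3-1, 0) = 2, i.e. a
 * contribution of 4 to the squared distance.
 */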
#else /* NBNXN_SEARCH_BB_SIMD4 */

/*! \brief 4-wide SIMD code calculating the distance^2 between two bounding boxes in xyz0 format
 *
 * \param[in] bb_i  First bounding box, should be aligned for 4-wide SIMD
 * \param[in] bb_j  Second bounding box, should be aligned for 4-wide SIMD
 */
static float clusterBoundingBoxDistance2(const BoundingBox& bb_i, const BoundingBox& bb_j)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    const Simd4Float bb_i_S0 = load4(bb_i.lower.ptr());
    const Simd4Float bb_i_S1 = load4(bb_i.upper.ptr());
    const Simd4Float bb_j_S0 = load4(bb_j.lower.ptr());
    const Simd4Float bb_j_S1 = load4(bb_j.upper.ptr());

    const Simd4Float dl_S = bb_i_S0 - bb_j_S1;
    const Simd4Float dh_S = bb_j_S0 - bb_i_S1;

    const Simd4Float dm_S  = max(dl_S, dh_S);
    const Simd4Float dm0_S = max(dm_S, simd4SetZeroF());

    return dotProduct(dm0_S, dm0_S);
}
/* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
template<int boundingBoxStart>
static inline void gmx_simdcall clusterBoundingBoxDistance2_xxxx_simd4_inner(const float*     bb_i,
                                                                             float*           d2,
                                                                             const Simd4Float xj_l,
                                                                             const Simd4Float yj_l,
                                                                             const Simd4Float zj_l,
                                                                             const Simd4Float xj_h,
                                                                             const Simd4Float yj_h,
                                                                             const Simd4Float zj_h)
{
    constexpr int stride = c_packedBoundingBoxesDimSize;

    const int shi = boundingBoxStart * Nbnxm::c_numBoundingBoxBounds1D * DIM;

    const Simd4Float zero = setZero();

    const Simd4Float xi_l = load4(bb_i + shi + 0 * stride);
    const Simd4Float yi_l = load4(bb_i + shi + 1 * stride);
    const Simd4Float zi_l = load4(bb_i + shi + 2 * stride);
    const Simd4Float xi_h = load4(bb_i + shi + 3 * stride);
    const Simd4Float yi_h = load4(bb_i + shi + 4 * stride);
    const Simd4Float zi_h = load4(bb_i + shi + 5 * stride);

    const Simd4Float dx_0 = xi_l - xj_h;
    const Simd4Float dy_0 = yi_l - yj_h;
    const Simd4Float dz_0 = zi_l - zj_h;

    const Simd4Float dx_1 = xj_l - xi_h;
    const Simd4Float dy_1 = yj_l - yi_h;
    const Simd4Float dz_1 = zj_l - zi_h;

    const Simd4Float mx = max(dx_0, dx_1);
    const Simd4Float my = max(dy_0, dy_1);
    const Simd4Float mz = max(dz_0, dz_1);

    const Simd4Float m0x = max(mx, zero);
    const Simd4Float m0y = max(my, zero);
    const Simd4Float m0z = max(mz, zero);

    const Simd4Float d2x = m0x * m0x;
    const Simd4Float d2y = m0y * m0y;
    const Simd4Float d2z = m0z * m0z;

    const Simd4Float d2s = d2x + d2y;
    const Simd4Float d2t = d2s + d2z;

    store4(d2 + boundingBoxStart, d2t);
}
/* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
static void clusterBoundingBoxDistance2_xxxx_simd4(const float* bb_j, const int nsi, const float* bb_i, float* d2)
{
    constexpr int stride = c_packedBoundingBoxesDimSize;

    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    const Simd4Float xj_l = Simd4Float(bb_j[0 * stride]);
    const Simd4Float yj_l = Simd4Float(bb_j[1 * stride]);
    const Simd4Float zj_l = Simd4Float(bb_j[2 * stride]);
    const Simd4Float xj_h = Simd4Float(bb_j[3 * stride]);
    const Simd4Float yj_h = Simd4Float(bb_j[4 * stride]);
    const Simd4Float zj_h = Simd4Float(bb_j[5 * stride]);

    /* Here we "loop" over si (0,stride) from 0 to nsi with step stride.
     * But as we know the number of iterations is 1 or 2, we unroll manually.
     */
    clusterBoundingBoxDistance2_xxxx_simd4_inner<0>(bb_i, d2, xj_l, yj_l, zj_l, xj_h, yj_h, zj_h);
    if (nsi > 1)
    {
        clusterBoundingBoxDistance2_xxxx_simd4_inner<stride>(bb_i, d2, xj_l, yj_l, zj_l, xj_h, yj_h, zj_h);
    }
}

#endif /* NBNXN_SEARCH_BB_SIMD4 */
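
/* Illustrative note: in the packed xxxxyyyyzzzz layout the lower and upper bounds of
 * c_packedBoundingBoxesDimSize (typically 4) bounding boxes are stored per component,
 * which is what lets the SIMD4 kernel above evaluate four box-box distances at once.
 */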
/* Returns if any atom pair from two clusters is within distance sqrt(rlist2) */
static inline gmx_bool
clusterpair_in_range(const NbnxnPairlistGpuWork& work, int si, int csj, int stride, const real* x_j, real rlist2)
{
#if !GMX_SIMD4_HAVE_REAL

    /* Plain C version.
     * All coordinates are stored as xyzxyz...
     */

    const real* x_i = work.iSuperClusterData.x.data();

    for (int i = 0; i < c_nbnxnGpuClusterSize; i++)
    {
        int i0 = (si * c_nbnxnGpuClusterSize + i) * DIM;
        for (int j = 0; j < c_nbnxnGpuClusterSize; j++)
        {
            int j0 = (csj * c_nbnxnGpuClusterSize + j) * stride;

            real d2 = gmx::square(x_i[i0] - x_j[j0]) + gmx::square(x_i[i0 + 1] - x_j[j0 + 1])
                      + gmx::square(x_i[i0 + 2] - x_j[j0 + 2]);

            if (d2 < rlist2)
            {
                return TRUE;
            }
        }
    }

    return FALSE;
#else /* !GMX_SIMD4_HAVE_REAL */

    /* 4-wide SIMD version.
     * The coordinates x_i are stored as xxxxyyyy..., x_j is stored xyzxyz...
     * Using 8-wide AVX(2) is not faster on Intel Sandy Bridge and Haswell.
     */
    static_assert(c_nbnxnGpuClusterSize == 8 || c_nbnxnGpuClusterSize == 4,
                  "A cluster is hard-coded to 4/8 atoms.");

    Simd4Real rc2_S = Simd4Real(rlist2);

    const real* x_i = work.iSuperClusterData.xSimd.data();

    int       dim_stride = c_nbnxnGpuClusterSize * DIM;
    Simd4Real ix_S0      = load4(x_i + si * dim_stride + 0 * GMX_SIMD4_WIDTH);
    Simd4Real iy_S0      = load4(x_i + si * dim_stride + 1 * GMX_SIMD4_WIDTH);
    Simd4Real iz_S0      = load4(x_i + si * dim_stride + 2 * GMX_SIMD4_WIDTH);

    Simd4Real ix_S1, iy_S1, iz_S1;
    if (c_nbnxnGpuClusterSize == 8)
    {
        ix_S1 = load4(x_i + si * dim_stride + 3 * GMX_SIMD4_WIDTH);
        iy_S1 = load4(x_i + si * dim_stride + 4 * GMX_SIMD4_WIDTH);
        iz_S1 = load4(x_i + si * dim_stride + 5 * GMX_SIMD4_WIDTH);
    }
    /* We loop from the outer to the inner particles to maximize
     * the chance that we find a pair in range quickly and return.
     */
    int j0 = csj * c_nbnxnGpuClusterSize;
    int j1 = j0 + c_nbnxnGpuClusterSize - 1;
    while (j0 < j1)
    {
        Simd4Real jx0_S, jy0_S, jz0_S;
        Simd4Real jx1_S, jy1_S, jz1_S;

        Simd4Real dx_S0, dy_S0, dz_S0;
        Simd4Real dx_S1, dy_S1, dz_S1;
        Simd4Real dx_S2, dy_S2, dz_S2;
        Simd4Real dx_S3, dy_S3, dz_S3;

        Simd4Real rsq_S0, rsq_S1, rsq_S2, rsq_S3;

        Simd4Bool wco_S0, wco_S1, wco_S2, wco_S3;
        Simd4Bool wco_any_S01, wco_any_S23, wco_any_S;

        jx0_S = Simd4Real(x_j[j0 * stride + 0]);
        jy0_S = Simd4Real(x_j[j0 * stride + 1]);
        jz0_S = Simd4Real(x_j[j0 * stride + 2]);

        jx1_S = Simd4Real(x_j[j1 * stride + 0]);
        jy1_S = Simd4Real(x_j[j1 * stride + 1]);
        jz1_S = Simd4Real(x_j[j1 * stride + 2]);

        /* Calculate distance */
        dx_S0 = ix_S0 - jx0_S;
        dy_S0 = iy_S0 - jy0_S;
        dz_S0 = iz_S0 - jz0_S;
        dx_S2 = ix_S0 - jx1_S;
        dy_S2 = iy_S0 - jy1_S;
        dz_S2 = iz_S0 - jz1_S;
        if (c_nbnxnGpuClusterSize == 8)
        {
            dx_S1 = ix_S1 - jx0_S;
            dy_S1 = iy_S1 - jy0_S;
            dz_S1 = iz_S1 - jz0_S;
            dx_S3 = ix_S1 - jx1_S;
            dy_S3 = iy_S1 - jy1_S;
            dz_S3 = iz_S1 - jz1_S;
        }

        /* rsq = dx*dx+dy*dy+dz*dz */
        rsq_S0 = norm2(dx_S0, dy_S0, dz_S0);
        rsq_S2 = norm2(dx_S2, dy_S2, dz_S2);
        if (c_nbnxnGpuClusterSize == 8)
        {
            rsq_S1 = norm2(dx_S1, dy_S1, dz_S1);
            rsq_S3 = norm2(dx_S3, dy_S3, dz_S3);
        }

        wco_S0 = (rsq_S0 < rc2_S);
        wco_S2 = (rsq_S2 < rc2_S);
        if (c_nbnxnGpuClusterSize == 8)
        {
            wco_S1 = (rsq_S1 < rc2_S);
            wco_S3 = (rsq_S3 < rc2_S);
        }
        if (c_nbnxnGpuClusterSize == 8)
        {
            wco_any_S01 = wco_S0 || wco_S1;
            wco_any_S23 = wco_S2 || wco_S3;
            wco_any_S   = wco_any_S01 || wco_any_S23;
        }
        else
        {
            wco_any_S = wco_S0 || wco_S2;
        }

        if (anyTrue(wco_any_S))
        {
            return TRUE;
        }

        j0++;
        j1--;
    }

    return FALSE;

#endif /* !GMX_SIMD4_HAVE_REAL */
}
/* Returns the j-cluster index for index cjIndex in a cj list */
static inline int nblCj(gmx::ArrayRef<const nbnxn_cj_t> cjList, int cjIndex)
{
    return cjList[cjIndex].cj;
}

/* Returns the j-cluster index for index cjIndex in a cj4 list */
static inline int nblCj(gmx::ArrayRef<const nbnxn_cj4_t> cj4List, int cjIndex)
{
    return cj4List[cjIndex / c_nbnxnGpuJgroupSize].cj[cjIndex & (c_nbnxnGpuJgroupSize - 1)];
}

/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
static unsigned int nbl_imask0(const NbnxnPairlistGpu* nbl, int cj_ind)
{
    return nbl->cj4[cj_ind / c_nbnxnGpuJgroupSize].imei[0].imask;
}
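
/* Illustrative note (example values only, assuming c_nbnxnGpuJgroupSize == 4):
 * j-clusters are stored in groups of c_nbnxnGpuJgroupSize, so cjIndex 9 lives in
 * cj4List[2].cj[1], and its i-interaction mask is taken from the same cj4 entry.
 */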
NbnxnPairlistCpu::NbnxnPairlistCpu() :
    na_ci(c_nbnxnCpuIClusterSize),
    work(std::make_unique<NbnxnPairlistCpuWork>())
{
}

NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
    na_ci(c_nbnxnGpuClusterSize),
    na_cj(c_nbnxnGpuClusterSize),
    na_sc(c_gpuNumClusterPerCell * c_nbnxnGpuClusterSize),
    sci({}, { pinningPolicy }),
    cj4({}, { pinningPolicy }),
    excl({}, { pinningPolicy }),
    work(std::make_unique<NbnxnPairlistGpuWork>())
{
    static_assert(c_nbnxnGpuNumClusterPerSupercluster == c_gpuNumClusterPerCell,
                  "The search code assumes that a super-cluster matches a search grid cell");

    static_assert(sizeof(cj4[0].imei[0].imask) * 8 >= c_nbnxnGpuJgroupSize * c_gpuNumClusterPerCell,
                  "The i super-cluster cluster interaction mask does not contain a sufficient "
                  "number of bits");

    static_assert(sizeof(excl[0]) * 8 >= c_nbnxnGpuJgroupSize * c_gpuNumClusterPerCell,
                  "The GPU exclusion mask does not contain a sufficient number of bits");

    // We always want a first entry without any exclusions
    excl.resize(1);
}
// TODO: Move to pairlistset.cpp
PairlistSet::PairlistSet(const InteractionLocality locality, const PairlistParams& pairlistParams) :
    locality_(locality),
    params_(pairlistParams)
{
    isCpuType_ = (params_.pairlistType == PairlistType::Simple4x2
                  || params_.pairlistType == PairlistType::Simple4x4
                  || params_.pairlistType == PairlistType::Simple4x8);
    // Currently GPU lists are always combined
    combineLists_ = !isCpuType_;

    const int numLists = gmx_omp_nthreads_get(emntNonbonded);

    if (!combineLists_ && numLists > NBNXN_BUFFERFLAG_MAX_THREADS)
    {
        gmx_fatal(FARGS,
                  "%d OpenMP threads were requested. Since the non-bonded force buffer reduction "
                  "is prohibitively slow with more than %d threads, we do not allow this. Use %d "
                  "or less OpenMP threads.",
                  numLists, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
    }

    if (isCpuType_)
    {
        cpuLists_.resize(numLists);
        if (numLists > 1)
        {
            cpuListsWork_.resize(numLists);
        }
    }
    else
    {
        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
        gpuLists_.emplace_back(gmx::PinningPolicy::PinnedIfSupported);
        /* Lists 0 to numLists are used for constructing lists in parallel
         * on the CPU using numLists threads (and then merged into list 0).
         */
        for (int i = 1; i < numLists; i++)
        {
            gpuLists_.emplace_back(gmx::PinningPolicy::CannotBePinned);
        }
    }
    if (params_.haveFep)
    {
        fepLists_.resize(numLists);

        /* Execute in order to avoid memory interleaving between threads */
#pragma omp parallel for num_threads(numLists) schedule(static)
        for (int i = 0; i < numLists; i++)
        {
            try
            {
                /* We used to allocate all normal lists locally on each thread
                 * as well. The question is if allocating the object on the
                 * master thread (but all contained list memory thread local)
                 * impacts performance.
                 */
                fepLists_[i] = std::make_unique<t_nblist>();
                nbnxn_init_pairlist_fep(fepLists_[i].get());
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
        }
    }
}
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics(FILE*                   fp,
                                    const NbnxnPairlistCpu& nbl,
                                    const Nbnxm::GridSet&   gridSet,
                                    const real              rl)
{
    const Grid&             grid = gridSet.grids()[0];
    const Grid::Dimensions& dims = grid.dimensions();

    fprintf(fp, "nbl nci %zu ncj %d\n", nbl.ci.size(), nbl.ncjInUse);
    const int    numAtomsJCluster = grid.geometry().numAtomsJCluster;
    const double numAtomsPerCell = nbl.ncjInUse / static_cast<double>(grid.numCells()) * numAtomsJCluster;
    fprintf(fp, "nbl na_cj %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n", nbl.na_cj, rl,
            nbl.ncjInUse, nbl.ncjInUse / static_cast<double>(grid.numCells()), numAtomsPerCell,
            numAtomsPerCell
                    / (0.5 * 4.0 / 3.0 * M_PI * rl * rl * rl * grid.numCells() * numAtomsJCluster
                       / (dims.gridSize[XX] * dims.gridSize[YY] * dims.gridSize[ZZ])));

    fprintf(fp, "nbl average j cell list length %.1f\n",
            0.25 * nbl.ncjInUse / std::max(static_cast<double>(nbl.ci.size()), 1.0));

    int cs[SHIFTS] = { 0 };
    int npexcl     = 0;
    for (const nbnxn_ci_t& ciEntry : nbl.ci)
    {
        cs[ciEntry.shift & NBNXN_CI_SHIFT] += ciEntry.cj_ind_end - ciEntry.cj_ind_start;

        int j = ciEntry.cj_ind_start;
        while (j < ciEntry.cj_ind_end && nbl.cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
        {
            npexcl++;
            j++;
        }
    }
    fprintf(fp, "nbl cell pairs, total: %zu excl: %d %.1f%%\n", nbl.cj.size(), npexcl,
            100 * npexcl / std::max(static_cast<double>(nbl.cj.size()), 1.0));
    for (int s = 0; s < SHIFTS; s++)
    {
        if (cs[s] > 0)
        {
            fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
        }
    }
}
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics(FILE*                   fp,
                                    const NbnxnPairlistGpu& nbl,
                                    const Nbnxm::GridSet&   gridSet,
                                    const real              rl)
{
    const Grid&             grid = gridSet.grids()[0];
    const Grid::Dimensions& dims = grid.dimensions();

    fprintf(fp, "nbl nsci %zu ncj4 %zu nsi %d excl4 %zu\n", nbl.sci.size(), nbl.cj4.size(),
            nbl.nci_tot, nbl.excl.size());
    const int    numAtomsCluster = grid.geometry().numAtomsICluster;
    const double numAtomsPerCell = nbl.nci_tot / static_cast<double>(grid.numClusters()) * numAtomsCluster;
    fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n", nbl.na_ci, rl,
            nbl.nci_tot, nbl.nci_tot / static_cast<double>(grid.numClusters()), numAtomsPerCell,
            numAtomsPerCell
                    / (0.5 * 4.0 / 3.0 * M_PI * rl * rl * rl * grid.numClusters() * numAtomsCluster
                       / (dims.gridSize[XX] * dims.gridSize[YY] * dims.gridSize[ZZ])));

    double sum_nsp                        = 0;
    double sum_nsp2                       = 0;
    int    nsp_max                        = 0;
    int    c[c_gpuNumClusterPerCell + 1]  = { 0 };
    for (const nbnxn_sci_t& sci : nbl.sci)
    {
        int nsp = 0;
        for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
        {
            for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
            {
                int b = 0;
                for (int si = 0; si < c_gpuNumClusterPerCell; si++)
                {
                    if (nbl.cj4[j4].imei[0].imask & (1U << (j * c_gpuNumClusterPerCell + si)))
                    {
                        b++;
                    }
                }
                nsp += b;
                c[b]++;
            }
        }
        sum_nsp += nsp;
        sum_nsp2 += nsp * nsp;
        nsp_max = std::max(nsp_max, nsp);
    }
    if (!nbl.sci.empty())
    {
        sum_nsp /= nbl.sci.size();
        sum_nsp2 /= nbl.sci.size();
    }
    fprintf(fp, "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n", sum_nsp,
            std::sqrt(sum_nsp2 - sum_nsp * sum_nsp), nsp_max);

    if (!nbl.cj4.empty())
    {
        for (int b = 0; b <= c_gpuNumClusterPerCell; b++)
        {
            fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n", b, c[b],
                    100.0 * c[b] / size_t{ nbl.cj4.size() * c_nbnxnGpuJgroupSize });
        }
    }
}
/* Returns a reference to the exclusion mask for j-cluster-group \p cj4 and warp \p warp
 *
 * Generates a new exclusion entry when the j-cluster-group uses
 * the default all-interaction mask at call time, so the returned mask
 * can be modified when needed.
 */
static nbnxn_excl_t& get_exclusion_mask(NbnxnPairlistGpu* nbl, int cj4, int warp)
{
    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
    {
        /* No exclusions set, make a new list entry */
        const size_t oldSize = nbl->excl.size();
        GMX_ASSERT(oldSize >= 1, "We should always have entry [0]");
        /* Add entry with default values: no exclusions */
        nbl->excl.resize(oldSize + 1);
        nbl->cj4[cj4].imei[warp].excl_ind = oldSize;
    }

    return nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
}
/* Sets self exclusions and excludes half of the double pairs in the self cluster-pair \p nbl->cj4[cj4Index].cj[jOffsetInGroup]
 *
 * \param[in,out] nbl             The cluster pair list
 * \param[in]     cj4Index        The j-cluster group index into \p nbl->cj4
 * \param[in]     jOffsetInGroup  The j-entry offset in \p nbl->cj4[cj4Index]
 * \param[in]     iClusterInCell  The i-cluster index in the cell
 */
static void setSelfAndNewtonExclusionsGpu(NbnxnPairlistGpu* nbl,
                                          const int         cj4Index,
                                          const int         jOffsetInGroup,
                                          const int         iClusterInCell)
{
    constexpr int numJatomsPerPart = c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit;

    /* The exclusions are stored separately for each part of the split */
    for (int part = 0; part < c_nbnxnGpuClusterpairSplit; part++)
    {
        const int jOffset = part * numJatomsPerPart;
        /* Make a new exclusion mask entry for each part, if we don't already have one yet */
        nbnxn_excl_t& excl = get_exclusion_mask(nbl, cj4Index, part);

        /* Set all bits with j-index <= i-index */
        for (int jIndexInPart = 0; jIndexInPart < numJatomsPerPart; jIndexInPart++)
        {
            for (int i = jOffset + jIndexInPart; i < c_nbnxnGpuClusterSize; i++)
            {
                excl.pair[jIndexInPart * c_nbnxnGpuClusterSize + i] &=
                        ~(1U << (jOffsetInGroup * c_gpuNumClusterPerCell + iClusterInCell));
            }
        }
    }
}
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
gmx_unused static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci * 2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0
                                  : (rdiag && ci * 2 + 1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1
                                                               : NBNXN_INTERACTION_MASK_ALL));
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
gmx_unused static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
gmx_unused static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj * 2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0
                                  : (rdiag && ci == cj * 2 + 1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1
                                                               : NBNXN_INTERACTION_MASK_ALL));
}
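
/* Illustrative note: for cj-size 8 an i-cluster of 4 atoms covers half a j-cluster,
 * so the diagonal case splits in two; e.g. ci = 5, cj = 2 satisfies ci == cj*2 + 1
 * and therefore selects the second-half diagonal mask.
 */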
#if GMX_SIMD
#    if GMX_SIMD_REAL_WIDTH == 2
#        define get_imask_simd_4xn get_imask_simd_j2
#    endif
#    if GMX_SIMD_REAL_WIDTH == 4
#        define get_imask_simd_4xn get_imask_simd_j4
#    endif
#    if GMX_SIMD_REAL_WIDTH == 8
#        define get_imask_simd_4xn get_imask_simd_j8
#        define get_imask_simd_2xnn get_imask_simd_j4
#    endif
#    if GMX_SIMD_REAL_WIDTH == 16
#        define get_imask_simd_2xnn get_imask_simd_j8
#    endif
#endif
/* Plain C code for checking and adding cluster-pairs to the list.
 *
 * \param[in]     jGrid               The j-grid
 * \param[in,out] nbl                 The pair-list to store the cluster pairs in
 * \param[in]     icluster            The index of the i-cluster
 * \param[in]     jclusterFirst       The first cluster in the j-range
 * \param[in]     jclusterLast        The last cluster in the j-range
 * \param[in]     excludeSubDiagonal  Exclude atom pairs with i-index > j-index
 * \param[in]     x_j                 Coordinates for the j-atoms, in xyz format
 * \param[in]     rlist2              The squared list cut-off
 * \param[in]     rbb2                The squared cut-off for putting cluster-pairs in the list based on bounding box distance only
 * \param[in,out] numDistanceChecks   The number of distance checks performed
 */
static void makeClusterListSimple(const Grid&              jGrid,
                                  NbnxnPairlistCpu*        nbl,
                                  int                      icluster,
                                  int                      jclusterFirst,
                                  int                      jclusterLast,
                                  bool                     excludeSubDiagonal,
                                  const real* gmx_restrict x_j,
                                  real                     rlist2,
                                  float                    rbb2,
                                  int* gmx_restrict        numDistanceChecks)
{
    const BoundingBox* gmx_restrict bb_ci = nbl->work->iClusterData.bb.data();
    const real* gmx_restrict        x_ci  = nbl->work->iClusterData.x.data();

    gmx_bool InRange;

    InRange = FALSE;
    while (!InRange && jclusterFirst <= jclusterLast)
    {
        real d2 = clusterBoundingBoxDistance2(bb_ci[0], jGrid.jBoundingBoxes()[jclusterFirst]);
        *numDistanceChecks += 2;

        /* Check if the distance is within the distance where
         * we use only the bounding box distance rbb,
         * or within the cut-off and there is at least one atom pair
         * within the cut-off.
         */
        if (d2 < rbb2)
        {
            InRange = TRUE;
        }
        else if (d2 < rlist2)
        {
            int cjf_gl = jGrid.cellOffset() + jclusterFirst;
            for (int i = 0; i < c_nbnxnCpuIClusterSize && !InRange; i++)
            {
                for (int j = 0; j < c_nbnxnCpuIClusterSize; j++)
                {
                    InRange = InRange
                              || (gmx::square(x_ci[i * STRIDE_XYZ + XX]
                                              - x_j[(cjf_gl * c_nbnxnCpuIClusterSize + j) * STRIDE_XYZ + XX])
                                          + gmx::square(x_ci[i * STRIDE_XYZ + YY]
                                                        - x_j[(cjf_gl * c_nbnxnCpuIClusterSize + j) * STRIDE_XYZ + YY])
                                          + gmx::square(x_ci[i * STRIDE_XYZ + ZZ]
                                                        - x_j[(cjf_gl * c_nbnxnCpuIClusterSize + j) * STRIDE_XYZ + ZZ])
                                  < rlist2);
                }
            }
            *numDistanceChecks += c_nbnxnCpuIClusterSize * c_nbnxnCpuIClusterSize;
        }
        if (!InRange)
        {
            jclusterFirst++;
        }
    }
    if (!InRange)
    {
        return;
    }

    InRange = FALSE;
    while (!InRange && jclusterLast > jclusterFirst)
    {
        real d2 = clusterBoundingBoxDistance2(bb_ci[0], jGrid.jBoundingBoxes()[jclusterLast]);
        *numDistanceChecks += 2;

        /* Check if the distance is within the distance where
         * we use only the bounding box distance rbb,
         * or within the cut-off and there is at least one atom pair
         * within the cut-off.
         */
        if (d2 < rbb2)
        {
            InRange = TRUE;
        }
        else if (d2 < rlist2)
        {
            int cjl_gl = jGrid.cellOffset() + jclusterLast;
            for (int i = 0; i < c_nbnxnCpuIClusterSize && !InRange; i++)
            {
                for (int j = 0; j < c_nbnxnCpuIClusterSize; j++)
                {
                    InRange = InRange
                              || (gmx::square(x_ci[i * STRIDE_XYZ + XX]
                                              - x_j[(cjl_gl * c_nbnxnCpuIClusterSize + j) * STRIDE_XYZ + XX])
                                          + gmx::square(x_ci[i * STRIDE_XYZ + YY]
                                                        - x_j[(cjl_gl * c_nbnxnCpuIClusterSize + j) * STRIDE_XYZ + YY])
                                          + gmx::square(x_ci[i * STRIDE_XYZ + ZZ]
                                                        - x_j[(cjl_gl * c_nbnxnCpuIClusterSize + j) * STRIDE_XYZ + ZZ])
                                  < rlist2);
                }
            }
            *numDistanceChecks += c_nbnxnCpuIClusterSize * c_nbnxnCpuIClusterSize;
        }
        if (!InRange)
        {
            jclusterLast--;
        }
    }

    if (jclusterFirst <= jclusterLast)
    {
        for (int jcluster = jclusterFirst; jcluster <= jclusterLast; jcluster++)
        {
            /* Store cj and the interaction mask */
            nbnxn_cj_t cjEntry;
            cjEntry.cj   = jGrid.cellOffset() + jcluster;
            cjEntry.excl = get_imask(excludeSubDiagonal, icluster, jcluster);
            nbl->cj.push_back(cjEntry);
        }
        /* Increase the closing index in the i list */
        nbl->ci.back().cj_ind_end = nbl->cj.size();
    }
}
#ifdef GMX_NBNXN_SIMD_4XN
#    include "pairlist_simd_4xm.h"
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
#    include "pairlist_simd_2xmm.h"
#endif
/* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
 * Checks bounding box distances and possibly atom pair distances.
 */
static void make_cluster_list_supersub(const Grid&       iGrid,
                                       const Grid&       jGrid,
                                       NbnxnPairlistGpu* nbl,
                                       const int         sci,
                                       const int         scj,
                                       const bool        excludeSubDiagonal,
                                       const int         stride,
                                       const real*       x,
                                       const real        rlist2,
                                       const float       rbb2,
                                       int*              numDistanceChecks)
{
    NbnxnPairlistGpuWork& work = *nbl->work;

#if NBNXN_BBXXXX
    const float* pbb_ci = work.iSuperClusterData.bbPacked.data();
#else
    const BoundingBox* bb_ci = work.iSuperClusterData.bb.data();
#endif

    assert(c_nbnxnGpuClusterSize == iGrid.geometry().numAtomsICluster);
    assert(c_nbnxnGpuClusterSize == jGrid.geometry().numAtomsICluster);

    /* We generate the pairlist mainly based on bounding-box distances
     * and do atom pair distance based pruning on the GPU.
     * Only if a j-group contains a single cluster-pair, we try to prune
     * that pair based on atom distances on the CPU to avoid empty j-groups.
     */
#define PRUNE_LIST_CPU_ONE 1
#define PRUNE_LIST_CPU_ALL 0

#if PRUNE_LIST_CPU_ONE
    int ci_last = -1;
#endif

    float* d2l = work.distanceBuffer.data();

    for (int subc = 0; subc < jGrid.numClustersPerCell()[scj]; subc++)
    {
        const int cj4_ind   = work.cj_ind / c_nbnxnGpuJgroupSize;
        const int cj_offset = work.cj_ind - cj4_ind * c_nbnxnGpuJgroupSize;
        const int cj        = scj * c_gpuNumClusterPerCell + subc;

        const int cj_gl = jGrid.cellOffset() * c_gpuNumClusterPerCell + cj;

        int ci1;
        if (excludeSubDiagonal && sci == scj)
        {
            ci1 = subc + 1;
        }
        else
        {
            ci1 = iGrid.numClustersPerCell()[sci];
        }

#if NBNXN_BBXXXX
        /* Determine all ci1 bb distances in one call with SIMD4 */
        const int offset = packedBoundingBoxesIndex(cj) + (cj & (c_packedBoundingBoxesDimSize - 1));
        clusterBoundingBoxDistance2_xxxx_simd4(jGrid.packedBoundingBoxes().data() + offset, ci1,
                                               pbb_ci, d2l);
        *numDistanceChecks += c_nbnxnGpuClusterSize * 2;
#endif

        int          npair = 0;
        unsigned int imask = 0;
        /* We use a fixed upper-bound instead of ci1 to help optimization */
        for (int ci = 0; ci < c_gpuNumClusterPerCell; ci++)
        {
            if (ci == ci1)
            {
                break;
            }

#if !NBNXN_BBXXXX
            /* Determine the bb distance between ci and cj */
            d2l[ci] = clusterBoundingBoxDistance2(bb_ci[ci], jGrid.jBoundingBoxes()[cj]);
            *numDistanceChecks += 2;
#endif
            float d2 = d2l[ci];

#if PRUNE_LIST_CPU_ALL
            /* Check if the distance is within the distance where
             * we use only the bounding box distance rbb,
             * or within the cut-off and there is at least one atom pair
             * within the cut-off. This check is very costly.
             */
            *numDistanceChecks += c_nbnxnGpuClusterSize * c_nbnxnGpuClusterSize;
            if (d2 < rbb2 || (d2 < rlist2 && clusterpair_in_range(work, ci, cj_gl, stride, x, rlist2)))
#else
            /* Check if the distance between the two bounding boxes
             * is within the pair-list cut-off.
             */
            if (d2 < rlist2)
#endif
            {
                /* Flag this i-subcell to be taken into account */
                imask |= (1U << (cj_offset * c_gpuNumClusterPerCell + ci));

#if PRUNE_LIST_CPU_ONE
                ci_last = ci;
#endif

                npair++;
            }
        }

#if PRUNE_LIST_CPU_ONE
        /* If we only found 1 pair, check if any atoms are actually
         * within the cut-off, so we could get rid of it.
         */
        if (npair == 1 && d2l[ci_last] >= rbb2
            && !clusterpair_in_range(work, ci_last, cj_gl, stride, x, rlist2))
        {
            /* Avoid adding a single pair that is not in range */
            imask &= ~(1U << (cj_offset * c_gpuNumClusterPerCell + ci_last));
            npair--;
        }
#endif

        if (npair > 0)
        {
            /* We have at least one cluster pair: add a j-entry */
            if (static_cast<size_t>(cj4_ind) == nbl->cj4.size())
            {
                nbl->cj4.resize(nbl->cj4.size() + 1);
            }
            nbnxn_cj4_t* cj4 = &nbl->cj4[cj4_ind];

            cj4->cj[cj_offset] = cj_gl;

            /* Set the exclusions for the ci==sj entry.
             * Here we don't bother to check if this entry is actually flagged,
             * as it will nearly always be in the list.
             */
            if (excludeSubDiagonal && sci == scj)
            {
                setSelfAndNewtonExclusionsGpu(nbl, cj4_ind, cj_offset, subc);
            }

            /* Copy the cluster interaction mask to the list */
            for (int w = 0; w < c_nbnxnGpuClusterpairSplit; w++)
            {
                cj4->imei[w].imask |= imask;
            }

            nbl->work->cj_ind++;

            /* Keep the count */
            nbl->nci_tot += npair;

            /* Increase the closing index in i super-cell list */
            nbl->sci.back().cj4_ind_end =
                    (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1) / c_nbnxnGpuJgroupSize;
        }
    }
}
/* Returns how many contiguous j-clusters we have starting in the i-list */
template<typename CjListType>
static int numContiguousJClusters(const int                       cjIndexStart,
                                  const int                       cjIndexEnd,
                                  gmx::ArrayRef<const CjListType> cjList)
{
    const int firstJCluster = nblCj(cjList, cjIndexStart);

    int numContiguous = 0;

    while (cjIndexStart + numContiguous < cjIndexEnd
           && nblCj(cjList, cjIndexStart + numContiguous) == firstJCluster + numContiguous)
    {
        numContiguous++;
    }

    return numContiguous;
}
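
/* Illustrative note: if the list stores j-clusters {10, 11, 12, 20, 21} starting at
 * cjIndexStart, only the first three are contiguous, so this returns 3.
 */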
/*! \internal
 * \brief Helper struct for efficient searching for excluded atoms in a j-list
 */
struct JListRanges
{
    /*! \brief Constructs a j-list range from \p cjList with the given index range */
    template<typename CjListType>
    JListRanges(int cjIndexStart, int cjIndexEnd, gmx::ArrayRef<const CjListType> cjList);

    int cjIndexStart; //!< The start index in the j-list
    int cjIndexEnd;   //!< The end index in the j-list
    int cjFirst;      //!< The j-cluster with index cjIndexStart
    int cjLast;       //!< The j-cluster with index cjIndexEnd-1
    int numDirect;    //!< Up to cjIndexStart+numDirect the j-clusters are cjFirst + the index offset
};

template<typename CjListType>
JListRanges::JListRanges(int cjIndexStart, int cjIndexEnd, gmx::ArrayRef<const CjListType> cjList) :
    cjIndexStart(cjIndexStart),
    cjIndexEnd(cjIndexEnd)
{
    GMX_ASSERT(cjIndexEnd > cjIndexStart, "JListRanges should only be called with non-empty lists");

    cjFirst = nblCj(cjList, cjIndexStart);
    cjLast  = nblCj(cjList, cjIndexEnd - 1);

    /* Determine how many contiguous j-cells we have starting
     * from the first i-cell. This number can be used to directly
     * calculate j-cell indices for excluded atoms.
     */
    numDirect = numContiguousJClusters(cjIndexStart, cjIndexEnd, cjList);
}
/* Return the index of \p jCluster in the given range or -1 when not present
 *
 * Note: This code is executed very often and therefore performance is
 *       important. It should be inlined and fully optimized.
 */
template<typename CjListType>
static inline int findJClusterInJList(int                             jCluster,
                                      const JListRanges&              ranges,
                                      gmx::ArrayRef<const CjListType> cjList)
{
    int index;

    if (jCluster < ranges.cjFirst + ranges.numDirect)
    {
        /* We can calculate the index directly using the offset */
        index = ranges.cjIndexStart + jCluster - ranges.cjFirst;
    }
    else
    {
        /* Search for jCluster using bisection */
        index          = -1;
        int rangeStart = ranges.cjIndexStart + ranges.numDirect;
        int rangeEnd   = ranges.cjIndexEnd;
        int rangeMiddle;
        while (index == -1 && rangeStart < rangeEnd)
        {
            rangeMiddle = (rangeStart + rangeEnd) >> 1;

            const int clusterMiddle = nblCj(cjList, rangeMiddle);

            if (jCluster == clusterMiddle)
            {
                index = rangeMiddle;
            }
            else if (jCluster < clusterMiddle)
            {
                rangeEnd = rangeMiddle;
            }
            else
            {
                rangeStart = rangeMiddle + 1;
            }
        }
    }

    return index;
}

// TODO: Get rid of the two functions below by renaming sci to ci (or something better)

/* Return the i-entry in the list we are currently operating on */
static nbnxn_ci_t* getOpenIEntry(NbnxnPairlistCpu* nbl)
{
    return &nbl->ci.back();
}

/* Return the i-entry in the list we are currently operating on */
static nbnxn_sci_t* getOpenIEntry(NbnxnPairlistGpu* nbl)
{
    return &nbl->sci.back();
}
/* Set all atom-pair exclusions for a simple type list i-entry
 *
 * Set all atom-pair exclusions from the topology stored in exclusions
 * as masks in the pair-list for simple list entry iEntry.
 */
static void setExclusionsForIEntry(const Nbnxm::GridSet&   gridSet,
                                   NbnxnPairlistCpu*       nbl,
                                   gmx_bool                diagRemoved,
                                   int                     na_cj_2log,
                                   const nbnxn_ci_t&       iEntry,
                                   const ListOfLists<int>& exclusions)
{
    if (iEntry.cj_ind_end == iEntry.cj_ind_start)
    {
        /* Empty list: no exclusions */
        return;
    }

    const JListRanges ranges(iEntry.cj_ind_start, iEntry.cj_ind_end, gmx::makeConstArrayRef(nbl->cj));

    const int iCluster = iEntry.ci;

    gmx::ArrayRef<const int> cell        = gridSet.cells();
    gmx::ArrayRef<const int> atomIndices = gridSet.atomIndices();

    /* Loop over the atoms in the i-cluster */
    for (int i = 0; i < nbl->na_ci; i++)
    {
        const int iIndex = iCluster * nbl->na_ci + i;
        const int iAtom  = atomIndices[iIndex];
        if (iAtom >= 0)
        {
            /* Loop over the topology-based exclusions for this i-atom */
            for (const int jAtom : exclusions[iAtom])
            {
                if (jAtom == iAtom)
                {
                    /* The self exclusions are already set, save some time */
                    continue;
                }

                /* Get the index of the j-atom in the nbnxn atom data */
                const int jIndex = cell[jAtom];

                /* Without shifts we only calculate interactions j>i
                 * for one-way pair-lists.
                 */
                if (diagRemoved && jIndex <= iIndex)
                {
                    continue;
                }

                const int jCluster = (jIndex >> na_cj_2log);

                /* Could this cluster be in our list? */
                if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
                {
                    const int index =
                            findJClusterInJList(jCluster, ranges, gmx::makeConstArrayRef(nbl->cj));

                    if (index >= 0)
                    {
                        /* We found an exclusion, clear the corresponding
                         * interaction bit.
                         */
                        const int innerJ = jIndex - (jCluster << na_cj_2log);

                        nbl->cj[index].excl &= ~(1U << ((i << na_cj_2log) + innerJ));
                    }
                }
            }
        }
    }
}
/* Add a new i-entry to the FEP list and copy the i-properties */
static inline void fep_list_new_nri_copy(t_nblist* nlist)
{
    /* Add a new i-entry */
    nlist->nri++;

    assert(nlist->nri < nlist->maxnri);

    /* Duplicate the last i-entry, except for jindex, which continues */
    nlist->iinr[nlist->nri]   = nlist->iinr[nlist->nri - 1];
    nlist->shift[nlist->nri]  = nlist->shift[nlist->nri - 1];
    nlist->gid[nlist->nri]    = nlist->gid[nlist->nri - 1];
    nlist->jindex[nlist->nri] = nlist->nrj;
}
/* Reallocate FEP list for size nl->maxnri, TODO: replace by C++ */
static void reallocate_nblist(t_nblist* nl)
{
    if (gmx_debug_at)
    {
        fprintf(debug,
                "reallocating neighborlist (ielec=%d, ivdw=%d, igeometry=%d, type=%d), maxnri=%d\n",
                nl->ielec, nl->ivdw, nl->igeometry, nl->type, nl->maxnri);
    }
    srenew(nl->iinr, nl->maxnri);
    srenew(nl->gid, nl->maxnri);
    srenew(nl->shift, nl->maxnri);
    srenew(nl->jindex, nl->maxnri + 1);
}
/* For load balancing of the free-energy lists over threads, we set
 * the maximum nrj size of an i-entry to 40. This leads to good
 * load balancing in the worst case scenario of a single perturbed
 * particle on 16 threads, while not introducing significant overhead.
 * Note that half of the perturbed pairs will anyhow end up in very small lists,
 * since non-perturbed i-particles will see few perturbed j-particles.
 */
const int max_nrj_fep = 40;
/* Exclude the perturbed pairs from the Verlet list. This is only done to avoid
 * singularities for overlapping particles (0/0), since the charges and
 * LJ parameters have been zeroed in the nbnxn data structure.
 * Simultaneously make a group pair list for the perturbed pairs.
 */
static void make_fep_list(gmx::ArrayRef<const int> atomIndices,
                          const nbnxn_atomdata_t*  nbat,
                          NbnxnPairlistCpu*        nbl,
                          gmx_bool                 bDiagRemoved,
                          nbnxn_ci_t*              nbl_ci,
                          real gmx_unused          shx,
                          real gmx_unused          shy,
                          real gmx_unused          shz,
                          real gmx_unused          rlist_fep2,
                          const Grid&              iGrid,
                          const Grid&              jGrid,
                          t_nblist*                nlist)
{
    int      ci, cj_ind_start, cj_ind_end, cja, cjr;
    int      nri_max;
    int      gid_i = 0, gid_j, gid;
    int      egp_shift, egp_mask;
    int      gid_cj = 0;
    int      ind_i, ind_j, ai, aj;
    int      nri;
    gmx_bool bFEP_i, bFEP_i_all;

    if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
    {
        /* Empty list */
        return;
    }

    ci = nbl_ci->ci;

    cj_ind_start = nbl_ci->cj_ind_start;
    cj_ind_end   = nbl_ci->cj_ind_end;

    /* In worst case we have alternating energy groups
     * and create #atom-pair lists, which means we need the size
     * of a cluster pair (na_ci*na_cj) times the number of cj's.
     */
    nri_max = nbl->na_ci * nbl->na_cj * (cj_ind_end - cj_ind_start);
    if (nlist->nri + nri_max > nlist->maxnri)
    {
        nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
        reallocate_nblist(nlist);
    }

    const int numAtomsJCluster = jGrid.geometry().numAtomsJCluster;

    const nbnxn_atomdata_t::Params& nbatParams = nbat->params();

    const int ngid = nbatParams.nenergrp;

    /* TODO: Consider adding a check in grompp and changing this to an assert */
    const int numBitsInEnergyGroupIdsForAtomsInJCluster = sizeof(gid_cj) * 8;
    if (ngid * numAtomsJCluster > numBitsInEnergyGroupIdsForAtomsInJCluster)
    {
        gmx_fatal(FARGS,
                  "The Verlet scheme with %dx%d kernels and free-energy only supports up to %zu "
                  "energy groups",
                  iGrid.geometry().numAtomsICluster, numAtomsJCluster,
                  (sizeof(gid_cj) * 8) / numAtomsJCluster);
    }

    egp_shift = nbatParams.neg_2log;
    egp_mask  = (1 << egp_shift) - 1;

    /* Loop over the atoms in the i sub-cell */
    bFEP_i_all = TRUE;
    for (int i = 0; i < nbl->na_ci; i++)
    {
        ind_i = ci * nbl->na_ci + i;
        ai    = atomIndices[ind_i];
        if (ai >= 0)
        {
            nri                    = nlist->nri;
            nlist->jindex[nri + 1] = nlist->jindex[nri];
            nlist->iinr[nri]       = ai;
            /* The actual energy group pair index is set later */
            nlist->gid[nri]   = 0;
            nlist->shift[nri] = nbl_ci->shift & NBNXN_CI_SHIFT;

            bFEP_i = iGrid.atomIsPerturbed(ci - iGrid.cellOffset(), i);

            bFEP_i_all = bFEP_i_all && bFEP_i;

            if (nlist->nrj + (cj_ind_end - cj_ind_start) * nbl->na_cj > nlist->maxnrj)
            {
                nlist->maxnrj = over_alloc_small(nlist->nrj + (cj_ind_end - cj_ind_start) * nbl->na_cj);
                srenew(nlist->jjnr, nlist->maxnrj);
                srenew(nlist->excl_fep, nlist->maxnrj);
            }

            if (ngid > 1)
            {
                gid_i = (nbatParams.energrp[ci] >> (egp_shift * i)) & egp_mask;
            }

            for (int cj_ind = cj_ind_start; cj_ind < cj_ind_end; cj_ind++)
            {
                unsigned int fep_cj;

                cja = nbl->cj[cj_ind].cj;

                if (numAtomsJCluster == jGrid.geometry().numAtomsICluster)
                {
                    cjr    = cja - jGrid.cellOffset();
                    fep_cj = jGrid.fepBits(cjr);
                    if (ngid > 1)
                    {
                        gid_cj = nbatParams.energrp[cja];
                    }
                }
                else if (2 * numAtomsJCluster == jGrid.geometry().numAtomsICluster)
                {
                    cjr = cja - jGrid.cellOffset() * 2;
                    /* Extract half of the ci fep/energrp mask */
                    fep_cj = (jGrid.fepBits(cjr >> 1) >> ((cjr & 1) * numAtomsJCluster))
                             & ((1 << numAtomsJCluster) - 1);
                    if (ngid > 1)
                    {
                        gid_cj = nbatParams.energrp[cja >> 1] >> ((cja & 1) * numAtomsJCluster * egp_shift)
                                 & ((1 << (numAtomsJCluster * egp_shift)) - 1);
                    }
                }
                else
                {
                    cjr = cja - (jGrid.cellOffset() >> 1);
                    /* Combine two ci fep masks/energrp */
                    fep_cj = jGrid.fepBits(cjr * 2)
                             + (jGrid.fepBits(cjr * 2 + 1) << jGrid.geometry().numAtomsICluster);
                    if (ngid > 1)
                    {
                        gid_cj = nbatParams.energrp[cja * 2]
                                 + (nbatParams.energrp[cja * 2 + 1]
                                    << (jGrid.geometry().numAtomsICluster * egp_shift));
                    }
                }

                if (bFEP_i || fep_cj != 0)
                {
                    for (int j = 0; j < nbl->na_cj; j++)
                    {
                        /* Is this interaction perturbed and not excluded? */
                        ind_j = cja * nbl->na_cj + j;
                        aj    = atomIndices[ind_j];
                        if (aj >= 0 && (bFEP_i || (fep_cj & (1 << j))) && (!bDiagRemoved || ind_j >= ind_i))
                        {
                            if (ngid > 1)
                            {
                                gid_j = (gid_cj >> (j * egp_shift)) & egp_mask;
                                gid   = GID(gid_i, gid_j, ngid);

                                if (nlist->nrj > nlist->jindex[nri] && nlist->gid[nri] != gid)
                                {
                                    /* Energy group pair changed: new list */
                                    fep_list_new_nri_copy(nlist);
                                    nri = nlist->nri;
                                }
                                nlist->gid[nri] = gid;
                            }

                            if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
                            {
                                fep_list_new_nri_copy(nlist);
                                nri = nlist->nri;
                            }

                            /* Add it to the FEP list */
                            nlist->jjnr[nlist->nrj]     = aj;
                            nlist->excl_fep[nlist->nrj] =
                                    (nbl->cj[cj_ind].excl >> (i * nbl->na_cj + j)) & 1;
                            nlist->nrj++;

                            /* Exclude it from the normal list.
                             * Note that the charge has been set to zero,
                             * but we need to avoid 0/0, as perturbed atoms
                             * can be on top of each other.
                             */
                            nbl->cj[cj_ind].excl &= ~(1U << (i * nbl->na_cj + j));
                        }
                    }
                }
            }

            if (nlist->nrj > nlist->jindex[nri])
            {
                /* Actually add this new, non-empty, list */
                nlist->nri++;
                nlist->jindex[nlist->nri] = nlist->nrj;
            }
        }
    }

    if (bFEP_i_all)
    {
        /* All interactions are perturbed, we can skip this entry */
        nbl_ci->cj_ind_end = cj_ind_start;
        nbl->ncjInUse -= cj_ind_end - cj_ind_start;
    }
}
1681 static inline int cj_mod_cj4(int cj
)
1683 return cj
& (c_nbnxnGpuJgroupSize
- 1);
1686 /* Convert a j-cluster to a cj4 group */
1687 static inline int cj_to_cj4(int cj
)
1689 return cj
/ c_nbnxnGpuJgroupSize
;
1692 /* Return the index of an j-atom within a warp */
1693 static inline int a_mod_wj(int a
)
1695 return a
& (c_nbnxnGpuClusterSize
/ c_nbnxnGpuClusterpairSplit
- 1);
1698 /* As make_fep_list above, but for super/sub lists. */
1699 static void make_fep_list(gmx::ArrayRef
<const int> atomIndices
,
1700 const nbnxn_atomdata_t
* nbat
,
1701 NbnxnPairlistGpu
* nbl
,
1702 gmx_bool bDiagRemoved
,
1703 const nbnxn_sci_t
* nbl_sci
,
1714 int ind_i
, ind_j
, ai
, aj
;
1718 const nbnxn_cj4_t
* cj4
;
1720 const int numJClusterGroups
= nbl_sci
->numJClusterGroups();
1721 if (numJClusterGroups
== 0)
1727 const int sci
= nbl_sci
->sci
;
1729 const int cj4_ind_start
= nbl_sci
->cj4_ind_start
;
1730 const int cj4_ind_end
= nbl_sci
->cj4_ind_end
;
1732 /* Here we process one super-cell, max #atoms na_sc, versus a list
1733 * cj4 entries, each with max c_nbnxnGpuJgroupSize cj's, each
1734 * of size na_cj atoms.
1735 * On the GPU we don't support energy groups (yet).
1736 * So for each of the na_sc i-atoms, we need max one FEP list
1737 * for each max_nrj_fep j-atoms.
1739 nri_max
= nbl
->na_sc
* nbl
->na_cj
* (1 + (numJClusterGroups
* c_nbnxnGpuJgroupSize
) / max_nrj_fep
);
1740 if (nlist
->nri
+ nri_max
> nlist
->maxnri
)
1742 nlist
->maxnri
= over_alloc_large(nlist
->nri
+ nri_max
);
1743 reallocate_nblist(nlist
);
1746 /* Loop over the atoms in the i super-cluster */
1747 for (int c
= 0; c
< c_gpuNumClusterPerCell
; c
++)
1749 c_abs
= sci
* c_gpuNumClusterPerCell
+ c
;
1751 for (int i
= 0; i
< nbl
->na_ci
; i
++)
1753 ind_i
= c_abs
* nbl
->na_ci
+ i
;
1754 ai
= atomIndices
[ind_i
];
1758 nlist
->jindex
[nri
+ 1] = nlist
->jindex
[nri
];
1759 nlist
->iinr
[nri
] = ai
;
1760 /* With GPUs, energy groups are not supported */
1761 nlist
->gid
[nri
] = 0;
1762 nlist
->shift
[nri
] = nbl_sci
->shift
& NBNXN_CI_SHIFT
;
1764 bFEP_i
= iGrid
.atomIsPerturbed(c_abs
- iGrid
.cellOffset() * c_gpuNumClusterPerCell
, i
);
1766 xi
= nbat
->x()[ind_i
* nbat
->xstride
+ XX
] + shx
;
1767 yi
= nbat
->x()[ind_i
* nbat
->xstride
+ YY
] + shy
;
1768 zi
= nbat
->x()[ind_i
* nbat
->xstride
+ ZZ
] + shz
;
1770 const int nrjMax
= nlist
->nrj
+ numJClusterGroups
* c_nbnxnGpuJgroupSize
* nbl
->na_cj
;
1771 if (nrjMax
> nlist
->maxnrj
)
1773 nlist
->maxnrj
= over_alloc_small(nrjMax
);
1774 srenew(nlist
->jjnr
, nlist
->maxnrj
);
1775 srenew(nlist
->excl_fep
, nlist
->maxnrj
);
1778 for (int cj4_ind
= cj4_ind_start
; cj4_ind
< cj4_ind_end
; cj4_ind
++)
1780 cj4
= &nbl
->cj4
[cj4_ind
];
1782 for (int gcj
= 0; gcj
< c_nbnxnGpuJgroupSize
; gcj
++)
1784 if ((cj4
->imei
[0].imask
& (1U << (gcj
* c_gpuNumClusterPerCell
+ c
))) == 0)
1786 /* Skip this ci for this cj */
1790 const int cjr
= cj4
->cj
[gcj
] - jGrid
.cellOffset() * c_gpuNumClusterPerCell
;
1792 if (bFEP_i
|| jGrid
.clusterIsPerturbed(cjr
))
1794 for (int j
= 0; j
< nbl
->na_cj
; j
++)
1796 /* Is this interaction perturbed and not excluded? */
1797 ind_j
= (jGrid
.cellOffset() * c_gpuNumClusterPerCell
+ cjr
) * nbl
->na_cj
+ j
;
1798 aj
= atomIndices
[ind_j
];
1799 if (aj
>= 0 && (bFEP_i
|| jGrid
.atomIsPerturbed(cjr
, j
))
1800 && (!bDiagRemoved
|| ind_j
>= ind_i
))
1803 unsigned int excl_bit
;
1807 j
/ (c_nbnxnGpuClusterSize
/ c_nbnxnGpuClusterpairSplit
);
1808 nbnxn_excl_t
& excl
= get_exclusion_mask(nbl
, cj4_ind
, jHalf
);
1810 excl_pair
= a_mod_wj(j
) * nbl
->na_ci
+ i
;
1811 excl_bit
= (1U << (gcj
* c_gpuNumClusterPerCell
+ c
));
1813 dx
= nbat
->x()[ind_j
* nbat
->xstride
+ XX
] - xi
;
1814 dy
= nbat
->x()[ind_j
* nbat
->xstride
+ YY
] - yi
;
1815 dz
= nbat
->x()[ind_j
* nbat
->xstride
+ ZZ
] - zi
;
1817 /* The unpruned GPU list has more than 2/3
1818 * of the atom pairs beyond rlist. Using
1819 * this list will cause a lot of overhead
1820 * in the CPU FEP kernels, especially
1821 * relative to the fast GPU kernels.
1822 * So we prune the FEP list here.
1824 if (dx
* dx
+ dy
* dy
+ dz
* dz
< rlist_fep2
)
1826 if (nlist
->nrj
- nlist
->jindex
[nri
] >= max_nrj_fep
)
1828 fep_list_new_nri_copy(nlist
);
1832 /* Add it to the FEP list */
1833 nlist
->jjnr
[nlist
->nrj
] = aj
;
1834 nlist
->excl_fep
[nlist
->nrj
] =
1835 (excl
.pair
[excl_pair
] & excl_bit
) ? 1 : 0;
1839 /* Exclude it from the normal list.
1840 * Note that the charge and LJ parameters have
1841 * been set to zero, but we need to avoid 0/0,
1842 * as perturbed atoms can be on top of each other.
1844 excl
.pair
[excl_pair
] &= ~excl_bit
;
1848 /* Note that we could mask out this pair in imask
1849 * if all i- and/or all j-particles are perturbed.
1850 * But since the perturbed pairs on the CPU will
1851 * take an order of magnitude more time, the GPU
1852 * will finish before the CPU and there is no gain.
1858 if (nlist
->nrj
> nlist
->jindex
[nri
])
1860 /* Actually add this new, non-empty, list */
1862 nlist
->jindex
[nlist
->nri
] = nlist
->nrj
;
/* Set all atom-pair exclusions for a GPU type list i-entry
 *
 * Sets all atom-pair exclusions from the topology stored in exclusions
 * as masks in the pair-list for i-super-cluster list entry iEntry.
 */
static void setExclusionsForIEntry(const Nbnxm::GridSet&   gridSet,
                                   NbnxnPairlistGpu*       nbl,
                                   gmx_bool                diagRemoved,
                                   int gmx_unused          na_cj_2log,
                                   const nbnxn_sci_t&      iEntry,
                                   const ListOfLists<int>& exclusions)
{
    if (iEntry.numJClusterGroups() == 0)
    {
        /* Empty list */
        return;
    }

    /* Set the search ranges using start and end j-cluster indices.
     * Note that here we can not use cj4_ind_end, since the last cj4
     * can be only partially filled, so we use cj_ind.
     */
    const JListRanges ranges(iEntry.cj4_ind_start * c_nbnxnGpuJgroupSize, nbl->work->cj_ind,
                             gmx::makeConstArrayRef(nbl->cj4));

    GMX_ASSERT(nbl->na_ci == c_nbnxnGpuClusterSize, "na_ci should match the GPU cluster size");
    constexpr int c_clusterSize      = c_nbnxnGpuClusterSize;
    constexpr int c_superClusterSize = c_nbnxnGpuNumClusterPerSupercluster * c_nbnxnGpuClusterSize;

    const int iSuperCluster = iEntry.sci;

    gmx::ArrayRef<const int> atomIndices = gridSet.atomIndices();
    gmx::ArrayRef<const int> cell        = gridSet.cells();

    /* Loop over the atoms in the i super-cluster */
    for (int i = 0; i < c_superClusterSize; i++)
    {
        const int iIndex = iSuperCluster * c_superClusterSize + i;
        const int iAtom  = atomIndices[iIndex];
        if (iAtom >= 0)
        {
            const int iCluster = i / c_clusterSize;

            /* Loop over the topology-based exclusions for this i-atom */
            for (const int jAtom : exclusions[iAtom])
            {
                if (jAtom == iAtom)
                {
                    /* The self exclusions are already set, save some time */
                    continue;
                }

                /* Get the index of the j-atom in the nbnxn atom data */
                const int jIndex = cell[jAtom];

                /* Without shifts we only calculate interactions j>i
                 * for one-way pair-lists.
                 */
                /* NOTE: We would like to use iIndex on the right hand side,
                 * but that makes this routine 25% slower with gcc6/7.
                 * Even using c_superClusterSize makes it slower.
                 * Either of these changes triggers peeling of the exclIndex
                 * loop, which apparently leads to far less efficient code.
                 */
                if (diagRemoved && jIndex <= iSuperCluster * nbl->na_sc + i)
                {
                    continue;
                }

                const int jCluster = jIndex / c_clusterSize;

                /* Check whether the cluster is in our list */
                if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
                {
                    const int index =
                            findJClusterInJList(jCluster, ranges, gmx::makeConstArrayRef(nbl->cj4));

                    if (index >= 0)
                    {
                        /* We found an exclusion, clear the corresponding
                         * interaction bit.
                         */
                        const unsigned int pairMask =
                                (1U << (cj_mod_cj4(index) * c_gpuNumClusterPerCell + iCluster));
                        /* Check if the i-cluster interacts with the j-cluster */
                        if (nbl_imask0(nbl, index) & pairMask)
                        {
                            const int innerI = (i & (c_clusterSize - 1));
                            const int innerJ = (jIndex & (c_clusterSize - 1));

                            /* Determine which j-half (CUDA warp) we are in */
                            const int jHalf = innerJ / (c_clusterSize / c_nbnxnGpuClusterpairSplit);

                            nbnxn_excl_t& interactionMask =
                                    get_exclusion_mask(nbl, cj_to_cj4(index), jHalf);

                            interactionMask.pair[a_mod_wj(innerJ) * c_clusterSize + innerI] &= ~pairMask;
                        }
                    }
                }
            }
        }
    }
}
/* Make a new ci entry at the back of nbl->ci */
static void addNewIEntry(NbnxnPairlistCpu* nbl, int ci, int shift, int flags)
{
    nbnxn_ci_t ciEntry;
    ciEntry.ci    = ci;
    ciEntry.shift = shift;
    /* Store the interaction flags along with the shift */
    ciEntry.shift |= flags;
    ciEntry.cj_ind_start = nbl->cj.size();
    ciEntry.cj_ind_end   = nbl->cj.size();
    nbl->ci.push_back(ciEntry);
}

/* Make a new sci entry at index nbl->nsci */
static void addNewIEntry(NbnxnPairlistGpu* nbl, int sci, int shift, int gmx_unused flags)
{
    nbnxn_sci_t sciEntry;
    sciEntry.sci           = sci;
    sciEntry.shift         = shift;
    sciEntry.cj4_ind_start = nbl->cj4.size();
    sciEntry.cj4_ind_end   = nbl->cj4.size();

    nbl->sci.push_back(sciEntry);
}
/* Sort the simple j-list cj on exclusions.
 * Entries with exclusions will all be sorted to the beginning of the list.
 */
static void sort_cj_excl(nbnxn_cj_t* cj, int ncj, NbnxnPairlistCpuWork* work)
{
    work->cj.resize(ncj);

    /* Make a list of the j-cells involving exclusions */
    int jnew = 0;
    for (int j = 0; j < ncj; j++)
    {
        if (cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
        {
            work->cj[jnew++] = cj[j];
        }
    }
    /* Check if there are exclusions at all or not just the first entry */
    if (!((jnew == 0) || (jnew == 1 && cj[0].excl != NBNXN_INTERACTION_MASK_ALL)))
    {
        for (int j = 0; j < ncj; j++)
        {
            if (cj[j].excl == NBNXN_INTERACTION_MASK_ALL)
            {
                work->cj[jnew++] = cj[j];
            }
        }
        for (int j = 0; j < ncj; j++)
        {
            cj[j] = work->cj[j];
        }
    }
}
/* Close this simple list i entry */
static void closeIEntry(NbnxnPairlistCpu* nbl,
                        int gmx_unused sp_max_av,
                        gmx_bool gmx_unused progBal,
                        float gmx_unused nsp_tot_est,
                        int gmx_unused thread,
                        int gmx_unused nthread)
{
    nbnxn_ci_t& ciEntry = nbl->ci.back();

    /* All content of the new ci entry has already been filled correctly,
     * we only need to sort and increase counts or remove the entry when empty.
     */
    const int jlen = ciEntry.cj_ind_end - ciEntry.cj_ind_start;
    if (jlen > 0)
    {
        sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work.get());

        /* The counts below are used for non-bonded pair/flop counts
         * and should therefore match the available kernel setups.
         */
        if (!(ciEntry.shift & NBNXN_CI_DO_COUL(0)))
        {
            nbl->work->ncj_noq += jlen;
        }
        else if ((ciEntry.shift & NBNXN_CI_HALF_LJ(0)) || !(ciEntry.shift & NBNXN_CI_DO_LJ(0)))
        {
            nbl->work->ncj_hlj += jlen;
        }
    }
    else
    {
        /* Entry is empty: remove it */
        nbl->ci.pop_back();
    }
}
2069 /* Split sci entry for load balancing on the GPU.
2070 * Splitting ensures we have enough lists to fully utilize the whole GPU.
2071 * With progBal we generate progressively smaller lists, which improves
2072 * load balancing. As we only know the current count on our own thread,
2073 * we will need to estimate the current total amount of i-entries.
2074 * As the lists get concatenated later, this estimate depends
2075 * both on nthread and our own thread index.
2077 static void split_sci_entry(NbnxnPairlistGpu
* nbl
,
2090 /* Estimate the total numbers of ci's of the nblist combined
2091 * over all threads using the target number of ci's.
2093 nsp_est
= (nsp_tot_est
* thread
) / nthread
+ nbl
->nci_tot
;
2095 /* The first ci blocks should be larger, to avoid overhead.
2096 * The last ci blocks should be smaller, to improve load balancing.
2097 * The factor 3/2 makes the first block 3/2 times the target average
2098 * and ensures that the total number of blocks end up equal to
2099 * that of equally sized blocks of size nsp_target_av.
2101 nsp_max
= static_cast<int>(nsp_target_av
* (nsp_tot_est
* 1.5 / (nsp_est
+ nsp_tot_est
)));
2105 nsp_max
= nsp_target_av
;
2108 const int cj4_start
= nbl
->sci
.back().cj4_ind_start
;
2109 const int cj4_end
= nbl
->sci
.back().cj4_ind_end
;
2110 const int j4len
= cj4_end
- cj4_start
;
2112 if (j4len
> 1 && j4len
* c_gpuNumClusterPerCell
* c_nbnxnGpuJgroupSize
> nsp_max
)
2114 /* Modify the last ci entry and process the cj4's again */
2120 for (int cj4
= cj4_start
; cj4
< cj4_end
; cj4
++)
2122 int nsp_cj4_p
= nsp_cj4
;
2123 /* Count the number of cluster pairs in this cj4 group */
2125 for (int p
= 0; p
< c_gpuNumClusterPerCell
* c_nbnxnGpuJgroupSize
; p
++)
2127 nsp_cj4
+= (nbl
->cj4
[cj4
].imei
[0].imask
>> p
) & 1;
2130 /* If adding the current cj4 with nsp_cj4 pairs get us further
2131 * away from our target nsp_max, split the list before this cj4.
2133 if (nsp
> 0 && nsp_max
- nsp
< nsp
+ nsp_cj4
- nsp_max
)
2135 /* Split the list at cj4 */
2136 nbl
->sci
.back().cj4_ind_end
= cj4
;
2137 /* Create a new sci entry */
2139 sciNew
.sci
= nbl
->sci
.back().sci
;
2140 sciNew
.shift
= nbl
->sci
.back().shift
;
2141 sciNew
.cj4_ind_start
= cj4
;
2142 nbl
->sci
.push_back(sciNew
);
2145 nsp_cj4_e
= nsp_cj4_p
;
2151 /* Put the remaining cj4's in the last sci entry */
2152 nbl
->sci
.back().cj4_ind_end
= cj4_end
;
2154 /* Possibly balance out the last two sci's
2155 * by moving the last cj4 of the second last sci.
2157 if (nsp_sci
- nsp_cj4_e
>= nsp
+ nsp_cj4_e
)
2159 GMX_ASSERT(nbl
->sci
.size() >= 2, "We expect at least two elements");
2160 nbl
->sci
[nbl
->sci
.size() - 2].cj4_ind_end
--;
2161 nbl
->sci
[nbl
->sci
.size() - 1].cj4_ind_start
--;
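/* Illustrative sketch only (guarded out): shows how the progressive target used
 * by split_sci_entry() above shrinks the block size as the estimated number of
 * already generated pairs grows. The standalone function and the numbers below
 * are hypothetical and only visualize the 3/2 formula.
 */
#if 0
#include <cstdio>

static int progressiveBlockTarget(int targetAverage, float totalEstimate, float currentEstimate)
{
    /* First blocks get up to 1.5x the average target, later blocks get smaller ones */
    return static_cast<int>(targetAverage * (totalEstimate * 1.5 / (currentEstimate + totalEstimate)));
}

int main()
{
    const int   targetAverage = 36;
    const float totalEstimate = 10000;
    const float currents[]    = { 0, 2500, 5000, 10000 };
    for (float current : currents)
    {
        std::printf("pairs generated so far %6.0f -> block target %d\n", current,
                    progressiveBlockTarget(targetAverage, totalEstimate, current));
    }
    return 0;
}
#endif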
/* Close this super/sub list i entry */
static void closeIEntry(NbnxnPairlistGpu* nbl, int nsp_max_av, gmx_bool progBal, float nsp_tot_est, int thread, int nthread)
{
    nbnxn_sci_t& sciEntry = *getOpenIEntry(nbl);

    /* All content of the new ci entry has already been filled correctly,
     * we only need to, potentially, split or remove the entry when empty.
     */
    int j4len = sciEntry.numJClusterGroups();
    if (j4len > 0)
    {
        /* We can only have complete blocks of 4 j-entries in a list,
         * so round the count up before closing.
         */
        int ncj4          = (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1) / c_nbnxnGpuJgroupSize;
        nbl->work->cj_ind = ncj4 * c_nbnxnGpuJgroupSize;

        if (nsp_max_av > 0)
        {
            /* Measure the size of the new entry and potentially split it */
            split_sci_entry(nbl, nsp_max_av, progBal, nsp_tot_est, thread, nthread);
        }
    }
    else
    {
        /* Entry is empty: remove it */
        nbl->sci.pop_back();
    }
}
/* Syncs the working array before adding another grid pair to the GPU list */
static void sync_work(NbnxnPairlistCpu gmx_unused* nbl) {}

/* Syncs the working array before adding another grid pair to the GPU list */
static void sync_work(NbnxnPairlistGpu* nbl)
{
    nbl->work->cj_ind = nbl->cj4.size() * c_nbnxnGpuJgroupSize;
}

/* Clears an NbnxnPairlistCpu data structure */
static void clear_pairlist(NbnxnPairlistCpu* nbl)
{
    nbl->ci.clear();
    nbl->cj.clear();
    nbl->ncjInUse = 0;
    nbl->nci_tot  = 0;
    nbl->ciOuter.clear();
    nbl->cjOuter.clear();

    nbl->work->ncj_noq = 0;
    nbl->work->ncj_hlj = 0;
}

/* Clears an NbnxnPairlistGpu data structure */
static void clear_pairlist(NbnxnPairlistGpu* nbl)
{
    nbl->sci.clear();
    nbl->cj4.clear();
    nbl->excl.resize(1);
    nbl->nci_tot = 0;
}

/* Clears an atom-atom-style pair list */
static void clear_pairlist_fep(t_nblist* nl)
{
    nl->nri = 0;
    nl->nrj = 0;
    if (nl->jindex == nullptr)
    {
        snew(nl->jindex, 1);
    }
    nl->jindex[0] = 0;
}
/* Sets a simple list i-cell bounding box, including PBC shift */
static inline void
set_icell_bb_simple(gmx::ArrayRef<const BoundingBox> bb, int ci, real shx, real shy, real shz, BoundingBox* bb_ci)
{
    bb_ci->lower.x = bb[ci].lower.x + shx;
    bb_ci->lower.y = bb[ci].lower.y + shy;
    bb_ci->lower.z = bb[ci].lower.z + shz;
    bb_ci->upper.x = bb[ci].upper.x + shx;
    bb_ci->upper.y = bb[ci].upper.y + shy;
    bb_ci->upper.z = bb[ci].upper.z + shz;
}

/* Sets a simple list i-cell bounding box, including PBC shift */
static inline void set_icell_bb(const Grid& iGrid, int ci, real shx, real shy, real shz, NbnxnPairlistCpuWork* work)
{
    set_icell_bb_simple(iGrid.iBoundingBoxes(), ci, shx, shy, shz, &work->iClusterData.bb[0]);
}
2259 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
2260 static void set_icell_bbxxxx_supersub(gmx::ArrayRef
<const float> bb
, int ci
, real shx
, real shy
, real shz
, float* bb_ci
)
2262 constexpr int cellBBStride
= packedBoundingBoxesIndex(c_gpuNumClusterPerCell
);
2263 constexpr int pbbStride
= c_packedBoundingBoxesDimSize
;
2264 const int ia
= ci
* cellBBStride
;
2265 for (int m
= 0; m
< cellBBStride
; m
+= c_packedBoundingBoxesSize
)
2267 for (int i
= 0; i
< pbbStride
; i
++)
2269 bb_ci
[m
+ 0 * pbbStride
+ i
] = bb
[ia
+ m
+ 0 * pbbStride
+ i
] + shx
;
2270 bb_ci
[m
+ 1 * pbbStride
+ i
] = bb
[ia
+ m
+ 1 * pbbStride
+ i
] + shy
;
2271 bb_ci
[m
+ 2 * pbbStride
+ i
] = bb
[ia
+ m
+ 2 * pbbStride
+ i
] + shz
;
2272 bb_ci
[m
+ 3 * pbbStride
+ i
] = bb
[ia
+ m
+ 3 * pbbStride
+ i
] + shx
;
2273 bb_ci
[m
+ 4 * pbbStride
+ i
] = bb
[ia
+ m
+ 4 * pbbStride
+ i
] + shy
;
2274 bb_ci
[m
+ 5 * pbbStride
+ i
] = bb
[ia
+ m
+ 5 * pbbStride
+ i
] + shz
;
2280 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
2281 gmx_unused
static void set_icell_bb_supersub(gmx::ArrayRef
<const BoundingBox
> bb
,
2288 for (int i
= 0; i
< c_gpuNumClusterPerCell
; i
++)
2290 set_icell_bb_simple(bb
, ci
* c_gpuNumClusterPerCell
+ i
, shx
, shy
, shz
, &bb_ci
[i
]);
2294 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
2295 gmx_unused
static void set_icell_bb(const Grid
& iGrid
, int ci
, real shx
, real shy
, real shz
, NbnxnPairlistGpuWork
* work
)
2298 set_icell_bbxxxx_supersub(iGrid
.packedBoundingBoxes(), ci
, shx
, shy
, shz
,
2299 work
->iSuperClusterData
.bbPacked
.data());
2301 set_icell_bb_supersub(iGrid
.iBoundingBoxes(), ci
, shx
, shy
, shz
, work
->iSuperClusterData
.bb
.data());
2305 /* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
2306 static void icell_set_x_simple(int ci
,
2312 NbnxnPairlistCpuWork::IClusterData
* iClusterData
)
2314 const int ia
= ci
* c_nbnxnCpuIClusterSize
;
2316 for (int i
= 0; i
< c_nbnxnCpuIClusterSize
; i
++)
2318 iClusterData
->x
[i
* STRIDE_XYZ
+ XX
] = x
[(ia
+ i
) * stride
+ XX
] + shx
;
2319 iClusterData
->x
[i
* STRIDE_XYZ
+ YY
] = x
[(ia
+ i
) * stride
+ YY
] + shy
;
2320 iClusterData
->x
[i
* STRIDE_XYZ
+ ZZ
] = x
[(ia
+ i
) * stride
+ ZZ
] + shz
;
2324 static void icell_set_x(int ci
,
2330 const ClusterDistanceKernelType kernelType
,
2331 NbnxnPairlistCpuWork
* work
)
2336 # ifdef GMX_NBNXN_SIMD_4XN
2337 case ClusterDistanceKernelType::CpuSimd_4xM
:
2338 icell_set_x_simd_4xn(ci
, shx
, shy
, shz
, stride
, x
, work
);
2341 # ifdef GMX_NBNXN_SIMD_2XNN
2342 case ClusterDistanceKernelType::CpuSimd_2xMM
:
2343 icell_set_x_simd_2xnn(ci
, shx
, shy
, shz
, stride
, x
, work
);
2347 case ClusterDistanceKernelType::CpuPlainC
:
2348 icell_set_x_simple(ci
, shx
, shy
, shz
, stride
, x
, &work
->iClusterData
);
2350 default: GMX_ASSERT(false, "Unhandled case"); break;
2354 /* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
2355 static void icell_set_x(int ci
,
2361 ClusterDistanceKernelType gmx_unused kernelType
,
2362 NbnxnPairlistGpuWork
* work
)
2364 #if !GMX_SIMD4_HAVE_REAL
2366 real
* x_ci
= work
->iSuperClusterData
.x
.data();
2368 int ia
= ci
* c_gpuNumClusterPerCell
* c_nbnxnGpuClusterSize
;
2369 for (int i
= 0; i
< c_gpuNumClusterPerCell
* c_nbnxnGpuClusterSize
; i
++)
2371 x_ci
[i
* DIM
+ XX
] = x
[(ia
+ i
) * stride
+ XX
] + shx
;
2372 x_ci
[i
* DIM
+ YY
] = x
[(ia
+ i
) * stride
+ YY
] + shy
;
2373 x_ci
[i
* DIM
+ ZZ
] = x
[(ia
+ i
) * stride
+ ZZ
] + shz
;
2376 #else /* !GMX_SIMD4_HAVE_REAL */
2378 real
* x_ci
= work
->iSuperClusterData
.xSimd
.data();
2380 for (int si
= 0; si
< c_gpuNumClusterPerCell
; si
++)
2382 for (int i
= 0; i
< c_nbnxnGpuClusterSize
; i
+= GMX_SIMD4_WIDTH
)
2384 int io
= si
* c_nbnxnGpuClusterSize
+ i
;
2385 int ia
= ci
* c_gpuNumClusterPerCell
* c_nbnxnGpuClusterSize
+ io
;
2386 for (int j
= 0; j
< GMX_SIMD4_WIDTH
; j
++)
2388 x_ci
[io
* DIM
+ j
+ XX
* GMX_SIMD4_WIDTH
] = x
[(ia
+ j
) * stride
+ XX
] + shx
;
2389 x_ci
[io
* DIM
+ j
+ YY
* GMX_SIMD4_WIDTH
] = x
[(ia
+ j
) * stride
+ YY
] + shy
;
2390 x_ci
[io
* DIM
+ j
+ ZZ
* GMX_SIMD4_WIDTH
] = x
[(ia
+ j
) * stride
+ ZZ
] + shz
;
2395 #endif /* !GMX_SIMD4_HAVE_REAL */
static real minimum_subgrid_size_xy(const Grid& grid)
{
    const Grid::Dimensions& dims = grid.dimensions();

    if (grid.geometry().isSimple)
    {
        return std::min(dims.cellSize[XX], dims.cellSize[YY]);
    }
    else
    {
        return std::min(dims.cellSize[XX] / c_gpuNumClusterPerCellX,
                        dims.cellSize[YY] / c_gpuNumClusterPerCellY);
    }
}
2413 static real
effective_buffer_1x1_vs_MxN(const Grid
& iGrid
, const Grid
& jGrid
)
2415 const real eff_1x1_buffer_fac_overest
= 0.1;
2417 /* Determine an atom-pair list cut-off buffer size for atom pairs,
2418 * to be added to rlist (including buffer) used for MxN.
2419 * This is for converting an MxN list to a 1x1 list. This means we can't
2420 * use the normal buffer estimate, as we have an MxN list in which
2421 * some atom pairs beyond rlist are missing. We want to capture
2422 * the beneficial effect of buffering by extra pairs just outside rlist,
2423 * while removing the useless pairs that are further away from rlist.
2424 * (Also the buffer could have been set manually not using the estimate.)
2425 * This buffer size is an overestimate.
2426 * We add 10% of the smallest grid sub-cell dimensions.
2427 * Note that the z-size differs per cell and we don't use this,
2428 * so we overestimate.
2429 * With PME, the 10% value gives a buffer that is somewhat larger
2430 * than the effective buffer with a tolerance of 0.005 kJ/mol/ps.
2431 * Smaller tolerances or using RF lead to a smaller effective buffer,
2432 * so 10% gives a safe overestimate.
2434 return eff_1x1_buffer_fac_overest
* (minimum_subgrid_size_xy(iGrid
) + minimum_subgrid_size_xy(jGrid
));
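/* Worked example (hypothetical numbers, guarded out): the extra 1x1 buffer
 * returned above is 10% of the summed minimum sub-cell x/y sizes of the two grids.
 */
#if 0
#include <cstdio>

int main()
{
    const double fac         = 0.1;  /* same 10% overestimate factor as above */
    const double minSubCellI = 0.30; /* assumed i-grid minimum sub-cell x/y size in nm */
    const double minSubCellJ = 0.28; /* assumed j-grid minimum sub-cell x/y size in nm */
    std::printf("extra 1x1 pair-list buffer: %.3f nm\n", fac * (minSubCellI + minSubCellJ));
    return 0;
}
#endif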
2437 /* Estimates the interaction volume^2 for non-local interactions */
2438 static real
nonlocal_vol2(const struct gmx_domdec_zones_t
* zones
, const rvec ls
, real r
)
2446 /* Here we simply add up the volumes of 1, 2 or 3 1D decomposition
2447 * not home interaction volume^2. As these volumes are not additive,
2448 * this is an overestimate, but it would only be significant in the limit
2449 * of small cells, where we anyhow need to split the lists into
2450 * as small parts as possible.
2453 for (int z
= 0; z
< zones
->n
; z
++)
2455 if (zones
->shift
[z
][XX
] + zones
->shift
[z
][YY
] + zones
->shift
[z
][ZZ
] == 1)
2460 for (int d
= 0; d
< DIM
; d
++)
2462 if (zones
->shift
[z
][d
] == 0)
2466 za
*= zones
->size
[z
].x1
[d
] - zones
->size
[z
].x0
[d
];
2470 /* 4 octants of a sphere */
2471 vold_est
= 0.25 * M_PI
* r
* r
* r
* r
;
2472 /* 4 quarter pie slices on the edges */
2473 vold_est
+= 4 * cl
* M_PI
/ 6.0 * r
* r
* r
;
2474 /* One rectangular volume on a face */
2475 vold_est
+= ca
* 0.5 * r
* r
;
2477 vol2_est_tot
+= vold_est
* za
;
2481 return vol2_est_tot
;
2484 /* Estimates the average size of a full j-list for super/sub setup */
2485 static void get_nsubpair_target(const Nbnxm::GridSet
& gridSet
,
2486 const InteractionLocality iloc
,
2488 const int min_ci_balanced
,
2489 int* nsubpair_target
,
2490 float* nsubpair_tot_est
)
2492 /* The target value of 36 seems to be the optimum for Kepler.
2493 * Maxwell is less sensitive to the exact value.
2495 const int nsubpair_target_min
= 36;
2496 real r_eff_sup
, vol_est
, nsp_est
, nsp_est_nl
;
2498 const Grid
& grid
= gridSet
.grids()[0];
2500 /* We don't need to balance list sizes if:
2501 * - We didn't request balancing.
2502 * - The number of grid cells >= the number of lists requested,
2503 * since we will always generate at least #cells lists.
2504 * - We don't have any cells, since then there won't be any lists.
2506 if (min_ci_balanced
<= 0 || grid
.numCells() >= min_ci_balanced
|| grid
.numCells() == 0)
2508 /* nsubpair_target==0 signals no balancing */
2509 *nsubpair_target
= 0;
2510 *nsubpair_tot_est
= 0;
2516 const int numAtomsCluster
= grid
.geometry().numAtomsICluster
;
2517 const Grid::Dimensions
& dims
= grid
.dimensions();
2519 ls
[XX
] = dims
.cellSize
[XX
] / c_gpuNumClusterPerCellX
;
2520 ls
[YY
] = dims
.cellSize
[YY
] / c_gpuNumClusterPerCellY
;
2521 ls
[ZZ
] = numAtomsCluster
/ (dims
.atomDensity
* ls
[XX
] * ls
[YY
]);
/* The formulas below are a heuristic estimate of the average nsj per si */
2524 r_eff_sup
= rlist
+ nbnxn_get_rlist_effective_inc(numAtomsCluster
, ls
);
2526 if (!gridSet
.domainSetup().haveMultipleDomains
|| gridSet
.domainSetup().zones
->n
== 1)
2532 nsp_est_nl
= gmx::square(dims
.atomDensity
/ numAtomsCluster
)
2533 * nonlocal_vol2(gridSet
.domainSetup().zones
, ls
, r_eff_sup
);
2536 if (iloc
== InteractionLocality::Local
)
2538 /* Sub-cell interacts with itself */
2539 vol_est
= ls
[XX
] * ls
[YY
] * ls
[ZZ
];
2540 /* 6/2 rectangular volume on the faces */
2541 vol_est
+= (ls
[XX
] * ls
[YY
] + ls
[XX
] * ls
[ZZ
] + ls
[YY
] * ls
[ZZ
]) * r_eff_sup
;
2542 /* 12/2 quarter pie slices on the edges */
2543 vol_est
+= 2 * (ls
[XX
] + ls
[YY
] + ls
[ZZ
]) * 0.25 * M_PI
* gmx::square(r_eff_sup
);
2544 /* 4 octants of a sphere */
2545 vol_est
+= 0.5 * 4.0 / 3.0 * M_PI
* gmx::power3(r_eff_sup
);
2547 /* Estimate the number of cluster pairs as the local number of
2548 * clusters times the volume they interact with times the density.
2550 nsp_est
= grid
.numClusters() * vol_est
* dims
.atomDensity
/ numAtomsCluster
;
2552 /* Subtract the non-local pair count */
2553 nsp_est
-= nsp_est_nl
;
/* For small cut-offs nsp_est will be an underestimate.
2556 * With DD nsp_est_nl is an overestimate so nsp_est can get negative.
2557 * So to avoid too small or negative nsp_est we set a minimum of
2558 * all cells interacting with all 3^3 direct neighbors (3^3-1)/2+1=14.
2559 * This might be a slight overestimate for small non-periodic groups of
2560 * atoms as will occur for a local domain with DD, but for small
2561 * groups of atoms we'll anyhow be limited by nsubpair_target_min,
2562 * so this overestimation will not matter.
2564 nsp_est
= std::max(nsp_est
, grid
.numClusters() * 14._real
);
2568 fprintf(debug
, "nsp_est local %5.1f non-local %5.1f\n", nsp_est
, nsp_est_nl
);
2573 nsp_est
= nsp_est_nl
;
2576 /* Thus the (average) maximum j-list size should be as follows.
2577 * Since there is overhead, we shouldn't make the lists too small
2578 * (and we can't chop up j-groups) so we use a minimum target size of 36.
2580 *nsubpair_target
= std::max(nsubpair_target_min
, roundToInt(nsp_est
/ min_ci_balanced
));
2581 *nsubpair_tot_est
= nsp_est
;
2585 fprintf(debug
, "nbl nsp estimate %.1f, nsubpair_target %d\n", nsp_est
, *nsubpair_target
);
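/* Illustrative sketch only (guarded out): the volume-based cluster-pair estimate
 * made above for the local list, with made-up grid and density numbers. The
 * volume terms mirror the self/face/edge/corner contributions summed above.
 */
#if 0
#include <algorithm>
#include <cstdio>

int main()
{
    const double pi              = 3.141592653589793;
    const double ls[3]           = { 0.5, 0.5, 0.4 }; /* assumed sub-cell sizes (nm) */
    const double rEff            = 1.1;               /* assumed rlist + effective increase (nm) */
    const double density         = 100.0;             /* assumed atom density (nm^-3) */
    const double atomsPerCluster = 8;
    const int    numClusters     = 20000;
    const int    minListsWanted  = 448;

    double vol = ls[0] * ls[1] * ls[2];                            /* sub-cell with itself */
    vol += (ls[0] * ls[1] + ls[0] * ls[2] + ls[1] * ls[2]) * rEff; /* faces */
    vol += 2 * (ls[0] + ls[1] + ls[2]) * 0.25 * pi * rEff * rEff;  /* edges */
    vol += 0.5 * 4.0 / 3.0 * pi * rEff * rEff * rEff;              /* corners */

    const double nspEst = numClusters * vol * density / atomsPerCluster;
    const int    target = std::max(36, static_cast<int>(nspEst / minListsWanted + 0.5));
    std::printf("nsp estimate %.0f, per-list target %d\n", nspEst, target);
    return 0;
}
#endif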
/* Debug list print function */
static void print_nblist_ci_cj(FILE* fp, const NbnxnPairlistCpu& nbl)
{
    for (const nbnxn_ci_t& ciEntry : nbl.ci)
    {
        fprintf(fp, "ci %4d shift %2d ncj %3d\n", ciEntry.ci, ciEntry.shift,
                ciEntry.cj_ind_end - ciEntry.cj_ind_start);

        for (int j = ciEntry.cj_ind_start; j < ciEntry.cj_ind_end; j++)
        {
            fprintf(fp, " cj %5d imask %x\n", nbl.cj[j].cj, nbl.cj[j].excl);
        }
    }
}
2604 /* Debug list print function */
2605 static void print_nblist_sci_cj(FILE* fp
, const NbnxnPairlistGpu
& nbl
)
2607 for (const nbnxn_sci_t
& sci
: nbl
.sci
)
2609 fprintf(fp
, "ci %4d shift %2d ncj4 %2d\n", sci
.sci
, sci
.shift
, sci
.numJClusterGroups());
2612 for (int j4
= sci
.cj4_ind_start
; j4
< sci
.cj4_ind_end
; j4
++)
2614 for (int j
= 0; j
< c_nbnxnGpuJgroupSize
; j
++)
2616 fprintf(fp
, " sj %5d imask %x\n", nbl
.cj4
[j4
].cj
[j
], nbl
.cj4
[j4
].imei
[0].imask
);
2617 for (int si
= 0; si
< c_gpuNumClusterPerCell
; si
++)
2619 if (nbl
.cj4
[j4
].imei
[0].imask
& (1U << (j
* c_gpuNumClusterPerCell
+ si
)))
2626 fprintf(fp
, "ci %4d shift %2d ncj4 %2d ncp %3d\n", sci
.sci
, sci
.shift
,
2627 sci
.numJClusterGroups(), ncp
);
2631 /* Combine pair lists *nbl generated on multiple threads nblc */
2632 static void combine_nblists(gmx::ArrayRef
<const NbnxnPairlistGpu
> nbls
, NbnxnPairlistGpu
* nblc
)
2634 int nsci
= nblc
->sci
.size();
2635 int ncj4
= nblc
->cj4
.size();
2636 int nexcl
= nblc
->excl
.size();
2637 for (auto& nbl
: nbls
)
2639 nsci
+= nbl
.sci
.size();
2640 ncj4
+= nbl
.cj4
.size();
2641 nexcl
+= nbl
.excl
.size();
2644 /* Resize with the final, combined size, so we can fill in parallel */
2645 /* NOTE: For better performance we should use default initialization */
2646 nblc
->sci
.resize(nsci
);
2647 nblc
->cj4
.resize(ncj4
);
2648 nblc
->excl
.resize(nexcl
);
2650 /* Each thread should copy its own data to the combined arrays,
2651 * as otherwise data will go back and forth between different caches.
2653 const int gmx_unused nthreads
= gmx_omp_nthreads_get(emntPairsearch
);
2655 #pragma omp parallel for num_threads(nthreads) schedule(static)
2656 for (gmx::index n
= 0; n
< nbls
.ssize(); n
++)
2660 /* Determine the offset in the combined data for our thread.
2661 * Note that the original sizes in nblc are lost.
2663 int sci_offset
= nsci
;
2664 int cj4_offset
= ncj4
;
2665 int excl_offset
= nexcl
;
2667 for (gmx::index i
= n
; i
< nbls
.ssize(); i
++)
2669 sci_offset
-= nbls
[i
].sci
.size();
2670 cj4_offset
-= nbls
[i
].cj4
.size();
2671 excl_offset
-= nbls
[i
].excl
.size();
2674 const NbnxnPairlistGpu
& nbli
= nbls
[n
];
2676 for (size_t i
= 0; i
< nbli
.sci
.size(); i
++)
2678 nblc
->sci
[sci_offset
+ i
] = nbli
.sci
[i
];
2679 nblc
->sci
[sci_offset
+ i
].cj4_ind_start
+= cj4_offset
;
2680 nblc
->sci
[sci_offset
+ i
].cj4_ind_end
+= cj4_offset
;
2683 for (size_t j4
= 0; j4
< nbli
.cj4
.size(); j4
++)
2685 nblc
->cj4
[cj4_offset
+ j4
] = nbli
.cj4
[j4
];
2686 nblc
->cj4
[cj4_offset
+ j4
].imei
[0].excl_ind
+= excl_offset
;
2687 nblc
->cj4
[cj4_offset
+ j4
].imei
[1].excl_ind
+= excl_offset
;
2690 for (size_t j4
= 0; j4
< nbli
.excl
.size(); j4
++)
2692 nblc
->excl
[excl_offset
+ j4
] = nbli
.excl
[j4
];
2695 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2698 for (auto& nbl
: nbls
)
2700 nblc
->nci_tot
+= nbl
.nci_tot
;
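/* Illustrative sketch only (guarded out): how each thread in combine_nblists()
 * finds its write offset in the combined arrays by subtracting the sizes of all
 * later lists from the combined total. The list sizes below are made up.
 */
#if 0
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int> sizes = { 40, 25, 60, 10 }; /* per-thread sci counts (hypothetical) */
    int                    total = 0;
    for (int s : sizes)
    {
        total += s;
    }
    for (size_t n = 0; n < sizes.size(); n++)
    {
        int offset = total;
        for (size_t i = n; i < sizes.size(); i++)
        {
            offset -= sizes[i];
        }
        std::printf("thread %zu writes %d entries at offset %d\n", n, sizes[n], offset);
    }
    return 0;
}
#endif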
2704 static void balance_fep_lists(gmx::ArrayRef
<std::unique_ptr
<t_nblist
>> fepLists
,
2705 gmx::ArrayRef
<PairsearchWork
> work
)
2707 const int numLists
= fepLists
.ssize();
2711 /* Nothing to balance */
2715 /* Count the total i-lists and pairs */
2718 for (const auto& list
: fepLists
)
2720 nri_tot
+= list
->nri
;
2721 nrj_tot
+= list
->nrj
;
2724 const int nrj_target
= (nrj_tot
+ numLists
- 1) / numLists
;
2726 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded
) == numLists
,
2727 "We should have as many work objects as FEP lists");
2729 #pragma omp parallel for schedule(static) num_threads(numLists)
2730 for (int th
= 0; th
< numLists
; th
++)
2734 t_nblist
* nbl
= work
[th
].nbl_fep
.get();
2736 /* Note that here we allocate for the total size, instead of
* a per-thread estimate (which is hard to obtain).
2739 if (nri_tot
> nbl
->maxnri
)
2741 nbl
->maxnri
= over_alloc_large(nri_tot
);
2742 reallocate_nblist(nbl
);
2744 if (nri_tot
> nbl
->maxnri
|| nrj_tot
> nbl
->maxnrj
)
2746 nbl
->maxnrj
= over_alloc_small(nrj_tot
);
2747 srenew(nbl
->jjnr
, nbl
->maxnrj
);
2748 srenew(nbl
->excl_fep
, nbl
->maxnrj
);
2751 clear_pairlist_fep(nbl
);
2753 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2756 /* Loop over the source lists and assign and copy i-entries */
2758 t_nblist
* nbld
= work
[th_dest
].nbl_fep
.get();
2759 for (int th
= 0; th
< numLists
; th
++)
2761 const t_nblist
* nbls
= fepLists
[th
].get();
2763 for (int i
= 0; i
< nbls
->nri
; i
++)
2767 /* The number of pairs in this i-entry */
2768 nrj
= nbls
->jindex
[i
+ 1] - nbls
->jindex
[i
];
/* Decide if list th_dest is too large and we should proceed
* to the next destination list.
2773 if (th_dest
+ 1 < numLists
&& nbld
->nrj
> 0
2774 && nbld
->nrj
+ nrj
- nrj_target
> nrj_target
- nbld
->nrj
)
2777 nbld
= work
[th_dest
].nbl_fep
.get();
2780 nbld
->iinr
[nbld
->nri
] = nbls
->iinr
[i
];
2781 nbld
->gid
[nbld
->nri
] = nbls
->gid
[i
];
2782 nbld
->shift
[nbld
->nri
] = nbls
->shift
[i
];
2784 for (int j
= nbls
->jindex
[i
]; j
< nbls
->jindex
[i
+ 1]; j
++)
2786 nbld
->jjnr
[nbld
->nrj
] = nbls
->jjnr
[j
];
2787 nbld
->excl_fep
[nbld
->nrj
] = nbls
->excl_fep
[j
];
2791 nbld
->jindex
[nbld
->nri
] = nbld
->nrj
;
2795 /* Swap the list pointers */
2796 for (int th
= 0; th
< numLists
; th
++)
2798 fepLists
[th
].swap(work
[th
].nbl_fep
);
2802 fprintf(debug
, "nbl_fep[%d] nri %4d nrj %4d\n", th
, fepLists
[th
]->nri
, fepLists
[th
]->nrj
);
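/* Illustrative sketch only (guarded out): the greedy i-entry assignment used by
 * balance_fep_lists() above, applied to made-up per-i-entry pair counts. We move
 * to the next destination list when adding the current i-entry would overshoot
 * the target by more than staying under it.
 */
#if 0
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int> nrjPerIEntry = { 30, 5, 40, 12, 8, 25, 20 }; /* hypothetical */
    const int              numLists     = 3;

    int nrjTotal = 0;
    for (int n : nrjPerIEntry)
    {
        nrjTotal += n;
    }
    const int nrjTarget = (nrjTotal + numLists - 1) / numLists;

    int dest = 0, nrjInDest = 0;
    for (size_t i = 0; i < nrjPerIEntry.size(); i++)
    {
        const int nrj = nrjPerIEntry[i];
        if (dest + 1 < numLists && nrjInDest > 0 && nrjInDest + nrj - nrjTarget > nrjTarget - nrjInDest)
        {
            dest++;
            nrjInDest = 0;
        }
        nrjInDest += nrj;
        std::printf("i-entry %zu (nrj %2d) -> list %d\n", i, nrj, dest);
    }
    return 0;
}
#endif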
/* Returns the next ci to be processed by our thread */
2808 static gmx_bool
next_ci(const Grid
& grid
, int nth
, int ci_block
, int* ci_x
, int* ci_y
, int* ci_b
, int* ci
)
2813 if (*ci_b
== ci_block
)
2815 /* Jump to the next block assigned to this task */
2816 *ci
+= (nth
- 1) * ci_block
;
2820 if (*ci
>= grid
.numCells())
2825 while (*ci
>= grid
.firstCellInColumn(*ci_x
* grid
.dimensions().numCells
[YY
] + *ci_y
+ 1))
2828 if (*ci_y
== grid
.dimensions().numCells
[YY
])
2838 /* Returns the distance^2 for which we put cell pairs in the list
2839 * without checking atom pair distances. This is usually < rlist^2.
2841 static float boundingbox_only_distance2(const Grid::Dimensions
& iGridDims
,
2842 const Grid::Dimensions
& jGridDims
,
2846 /* If the distance between two sub-cell bounding boxes is less
2847 * than this distance, do not check the distance between
2848 * all particle pairs in the sub-cell, since then it is likely
2849 * that the box pair has atom pairs within the cut-off.
2850 * We use the nblist cut-off minus 0.5 times the average x/y diagonal
2851 * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
2852 * Using more than 0.5 gains at most 0.5%.
2853 * If forces are calculated more than twice, the performance gain
2854 * in the force calculation outweighs the cost of checking.
2855 * Note that with subcell lists, the atom-pair distance check
* is only performed when only 1 out of 8 sub-cells is within range,
* because the GPU is much faster than the CPU.
2862 bbx
= 0.5 * (iGridDims
.cellSize
[XX
] + jGridDims
.cellSize
[XX
]);
2863 bby
= 0.5 * (iGridDims
.cellSize
[YY
] + jGridDims
.cellSize
[YY
]);
2866 bbx
/= c_gpuNumClusterPerCellX
;
2867 bby
/= c_gpuNumClusterPerCellY
;
2870 rbb2
= std::max(0.0, rlist
- 0.5 * std::sqrt(bbx
* bbx
+ bby
* bby
));
2876 return (float)((1 + GMX_FLOAT_EPS
) * rbb2
);
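/* Worked example (hypothetical numbers, guarded out): the bounding-box-only
 * distance derived above, i.e. the list cut-off minus half the average x/y
 * diagonal spacing of the sub-cells.
 */
#if 0
#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    const double rlist = 1.0;                 /* assumed list cut-off (nm) */
    double       bbx   = 0.5 * (0.40 + 0.40); /* average i/j cell x size (nm, assumed) */
    double       bby   = 0.5 * (0.40 + 0.40); /* average i/j cell y size (nm, assumed) */
    /* For super/sub (GPU-type) grids the sub-cell size is a fraction of the cell size */
    bbx /= 2;
    bby /= 2;
    const double rbb = std::max(0.0, rlist - 0.5 * std::sqrt(bbx * bbx + bby * bby));
    std::printf("bounding-box-only distance %.3f nm (squared: %.3f nm^2)\n", rbb, rbb * rbb);
    return 0;
}
#endif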
2880 static int get_ci_block_size(const Grid
& iGrid
, const bool haveMultipleDomains
, const int numLists
)
2882 const int ci_block_enum
= 5;
2883 const int ci_block_denom
= 11;
2884 const int ci_block_min_atoms
= 16;
2887 /* Here we decide how to distribute the blocks over the threads.
2888 * We use prime numbers to try to avoid that the grid size becomes
2889 * a multiple of the number of threads, which would lead to some
2890 * threads getting "inner" pairs and others getting boundary pairs,
* which in turn will lead to load imbalance between threads.
2892 * Set the block size as 5/11/ntask times the average number of cells
2893 * in a y,z slab. This should ensure a quite uniform distribution
2894 * of the grid parts of the different thread along all three grid
2895 * zone boundaries with 3D domain decomposition. At the same time
2896 * the blocks will not become too small.
2898 GMX_ASSERT(iGrid
.dimensions().numCells
[XX
] > 0, "Grid can't be empty");
2899 GMX_ASSERT(numLists
> 0, "We need at least one list");
2900 ci_block
= (iGrid
.numCells() * ci_block_enum
)
2901 / (ci_block_denom
* iGrid
.dimensions().numCells
[XX
] * numLists
);
2903 const int numAtomsPerCell
= iGrid
.geometry().numAtomsPerCell
;
2905 /* Ensure the blocks are not too small: avoids cache invalidation */
2906 if (ci_block
* numAtomsPerCell
< ci_block_min_atoms
)
2908 ci_block
= (ci_block_min_atoms
+ numAtomsPerCell
- 1) / numAtomsPerCell
;
2911 /* Without domain decomposition
2912 * or with less than 3 blocks per task, divide in nth blocks.
2914 if (!haveMultipleDomains
|| numLists
* 3 * ci_block
> iGrid
.numCells())
2916 ci_block
= (iGrid
.numCells() + numLists
- 1) / numLists
;
2919 if (ci_block
> 1 && (numLists
- 1) * ci_block
>= iGrid
.numCells())
2921 /* Some threads have no work. Although reducing the block size
2922 * does not decrease the block count on the first few threads,
2923 * with GPUs better mixing of "upper" cells that have more empty
2924 * clusters results in a somewhat lower max load over all threads.
2925 * Without GPUs the regime of so few atoms per thread is less
2926 * performance relevant, but with 8-wide SIMD the same reasoning
2927 * applies, since the pair list uses 4 i-atom "sub-clusters".
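/* Illustrative sketch only (guarded out): the 5/11 block-size heuristic above
 * applied to made-up grid and thread counts, including the minimum-atoms check.
 */
#if 0
#include <cstdio>

int main()
{
    const int numCells      = 4000; /* hypothetical */
    const int numCellsX     = 20;   /* hypothetical */
    const int numLists      = 8;    /* hypothetical */
    const int atomsPerCell  = 8;    /* hypothetical */
    const int blockMinAtoms = 16;   /* same minimum as above */

    int ciBlock = (numCells * 5) / (11 * numCellsX * numLists);
    if (ciBlock * atomsPerCell < blockMinAtoms)
    {
        ciBlock = (blockMinAtoms + atomsPerCell - 1) / atomsPerCell;
    }
    std::printf("ci block size: %d cells (%d atoms)\n", ciBlock, ciBlock * atomsPerCell);
    return 0;
}
#endif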
/* Returns the number of bits to right-shift a cluster index to obtain
 * the corresponding force buffer flag index.
 */
static int getBufferFlagShift(int numAtomsPerCluster)
{
    int bufferFlagShift = 0;
    while ((numAtomsPerCluster << bufferFlagShift) < NBNXN_BUFFERFLAG_SIZE)
    {
        bufferFlagShift++;
    }

    return bufferFlagShift;
}
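/* Worked example (guarded out): with a flag block size of 16 (a stand-in value
 * assumed here for NBNXN_BUFFERFLAG_SIZE), 4 atoms per cluster gives a shift of 2
 * and 8 atoms per cluster a shift of 1, reproducing the loop above.
 */
#if 0
#include <cstdio>

int main()
{
    const int flagBlockSize  = 16; /* assumed stand-in for NBNXN_BUFFERFLAG_SIZE */
    const int clusterSizes[] = { 4, 8 };
    for (int atomsPerCluster : clusterSizes)
    {
        int shift = 0;
        while ((atomsPerCluster << shift) < flagBlockSize)
        {
            shift++;
        }
        std::printf("%d atoms/cluster -> flag shift %d\n", atomsPerCluster, shift);
    }
    return 0;
}
#endif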
static bool pairlistIsSimple(const NbnxnPairlistCpu gmx_unused& pairlist)
{
    return true;
}

static bool pairlistIsSimple(const NbnxnPairlistGpu gmx_unused& pairlist)
{
    return false;
}
2959 static void makeClusterListWrapper(NbnxnPairlistCpu
* nbl
,
2960 const Grid gmx_unused
& iGrid
,
2963 const int firstCell
,
2965 const bool excludeSubDiagonal
,
2966 const nbnxn_atomdata_t
* nbat
,
2969 const ClusterDistanceKernelType kernelType
,
2970 int* numDistanceChecks
)
2974 case ClusterDistanceKernelType::CpuPlainC
:
2975 makeClusterListSimple(jGrid
, nbl
, ci
, firstCell
, lastCell
, excludeSubDiagonal
,
2976 nbat
->x().data(), rlist2
, rbb2
, numDistanceChecks
);
2978 #ifdef GMX_NBNXN_SIMD_4XN
2979 case ClusterDistanceKernelType::CpuSimd_4xM
:
2980 makeClusterListSimd4xn(jGrid
, nbl
, ci
, firstCell
, lastCell
, excludeSubDiagonal
,
2981 nbat
->x().data(), rlist2
, rbb2
, numDistanceChecks
);
2984 #ifdef GMX_NBNXN_SIMD_2XNN
2985 case ClusterDistanceKernelType::CpuSimd_2xMM
:
2986 makeClusterListSimd2xnn(jGrid
, nbl
, ci
, firstCell
, lastCell
, excludeSubDiagonal
,
2987 nbat
->x().data(), rlist2
, rbb2
, numDistanceChecks
);
2990 default: GMX_ASSERT(false, "Unhandled kernel type");
2994 static void makeClusterListWrapper(NbnxnPairlistGpu
* nbl
,
2995 const Grid
& gmx_unused iGrid
,
2998 const int firstCell
,
3000 const bool excludeSubDiagonal
,
3001 const nbnxn_atomdata_t
* nbat
,
3004 ClusterDistanceKernelType gmx_unused kernelType
,
3005 int* numDistanceChecks
)
3007 for (int cj
= firstCell
; cj
<= lastCell
; cj
++)
3009 make_cluster_list_supersub(iGrid
, jGrid
, nbl
, ci
, cj
, excludeSubDiagonal
, nbat
->xstride
,
3010 nbat
->x().data(), rlist2
, rbb2
, numDistanceChecks
);
static int getNumSimpleJClustersInList(const NbnxnPairlistCpu& nbl)
{
    return nbl.cj.size();
}

static int getNumSimpleJClustersInList(const gmx_unused NbnxnPairlistGpu& nbl)
{
    return 0;
}

static void incrementNumSimpleJClustersInList(NbnxnPairlistCpu* nbl, int ncj_old_j)
{
    nbl->ncjInUse += nbl->cj.size();
    nbl->ncjInUse -= ncj_old_j;
}

static void incrementNumSimpleJClustersInList(NbnxnPairlistGpu gmx_unused* nbl, int gmx_unused ncj_old_j)
{
}
static void checkListSizeConsistency(const NbnxnPairlistCpu& nbl, const bool haveFreeEnergy)
{
    GMX_RELEASE_ASSERT(static_cast<size_t>(nbl.ncjInUse) == nbl.cj.size() || haveFreeEnergy,
                       "Without free-energy all cj pair-list entries should be in use. "
                       "Note that subsequent code does not make use of the equality, "
                       "this check is only here to catch bugs");
}

static void checkListSizeConsistency(const NbnxnPairlistGpu gmx_unused& nbl, bool gmx_unused haveFreeEnergy)
{
    /* We currently can not check consistency here */
}
3047 /* Set the buffer flags for newly added entries in the list */
3048 static void setBufferFlags(const NbnxnPairlistCpu
& nbl
,
3049 const int ncj_old_j
,
3050 const int gridj_flag_shift
,
3051 gmx_bitmask_t
* gridj_flag
,
3054 if (gmx::ssize(nbl
.cj
) > ncj_old_j
)
3056 int cbFirst
= nbl
.cj
[ncj_old_j
].cj
>> gridj_flag_shift
;
3057 int cbLast
= nbl
.cj
.back().cj
>> gridj_flag_shift
;
3058 for (int cb
= cbFirst
; cb
<= cbLast
; cb
++)
3060 bitmask_init_bit(&gridj_flag
[cb
], th
);
3065 static void setBufferFlags(const NbnxnPairlistGpu gmx_unused
& nbl
,
3066 int gmx_unused ncj_old_j
,
3067 int gmx_unused gridj_flag_shift
,
3068 gmx_bitmask_t gmx_unused
* gridj_flag
,
3071 GMX_ASSERT(false, "This function should never be called");
3074 /* Generates the part of pair-list nbl assigned to our thread */
3075 template<typename T
>
3076 static void nbnxn_make_pairlist_part(const Nbnxm::GridSet
& gridSet
,
3079 PairsearchWork
* work
,
3080 const nbnxn_atomdata_t
* nbat
,
3081 const ListOfLists
<int>& exclusions
,
3083 const PairlistType pairlistType
,
3085 gmx_bool bFBufferFlag
,
3088 float nsubpair_tot_est
,
3098 int ci_b
, ci
, ci_x
, ci_y
, ci_xy
;
3100 real bx0
, bx1
, by0
, by1
, bz0
, bz1
;
3102 real d2cx
, d2z
, d2z_cx
, d2z_cy
, d2zx
, d2zxy
, d2xy
;
3103 int cxf
, cxl
, cyf
, cyf_x
, cyl
;
3104 int numDistanceChecks
;
3105 int gridi_flag_shift
= 0, gridj_flag_shift
= 0;
3106 gmx_bitmask_t
* gridj_flag
= nullptr;
3107 int ncj_old_i
, ncj_old_j
;
3109 if (jGrid
.geometry().isSimple
!= pairlistIsSimple(*nbl
)
3110 || iGrid
.geometry().isSimple
!= pairlistIsSimple(*nbl
))
3112 gmx_incons("Grid incompatible with pair-list");
3116 GMX_ASSERT(nbl
->na_ci
== jGrid
.geometry().numAtomsICluster
,
3117 "The cluster sizes in the list and grid should match");
3118 nbl
->na_cj
= JClusterSizePerListType
[pairlistType
];
3119 na_cj_2log
= get_2log(nbl
->na_cj
);
3125 /* Determine conversion of clusters to flag blocks */
3126 gridi_flag_shift
= getBufferFlagShift(nbl
->na_ci
);
3127 gridj_flag_shift
= getBufferFlagShift(nbl
->na_cj
);
3129 gridj_flag
= work
->buffer_flags
.flag
;
3132 gridSet
.getBox(box
);
3134 const bool haveFep
= gridSet
.haveFep();
3136 const real rlist2
= nbl
->rlist
* nbl
->rlist
;
3138 // Select the cluster pair distance kernel type
3139 const ClusterDistanceKernelType kernelType
= getClusterDistanceKernelType(pairlistType
, *nbat
);
3141 if (haveFep
&& !pairlistIsSimple(*nbl
))
3143 /* Determine an atom-pair list cut-off distance for FEP atom pairs.
3144 * We should not simply use rlist, since then we would not have
3145 * the small, effective buffering of the NxN lists.
* The buffer is an overestimate, but the resulting cost for pairs
* beyond rlist is negligible compared to the FEP pairs within rlist.
3149 rl_fep2
= nbl
->rlist
+ effective_buffer_1x1_vs_MxN(iGrid
, jGrid
);
3153 fprintf(debug
, "nbl_fep atom-pair rlist %f\n", rl_fep2
);
3155 rl_fep2
= rl_fep2
* rl_fep2
;
3158 const Grid::Dimensions
& iGridDims
= iGrid
.dimensions();
3159 const Grid::Dimensions
& jGridDims
= jGrid
.dimensions();
3161 rbb2
= boundingbox_only_distance2(iGridDims
, jGridDims
, nbl
->rlist
, pairlistIsSimple(*nbl
));
3165 fprintf(debug
, "nbl bounding box only distance %f\n", std::sqrt(rbb2
));
3168 const bool isIntraGridList
= (&iGrid
== &jGrid
);
3170 /* Set the shift range */
3171 for (int d
= 0; d
< DIM
; d
++)
3173 /* Check if we need periodicity shifts.
3174 * Without PBC or with domain decomposition we don't need them.
3176 if (d
>= numPbcDimensions(gridSet
.domainSetup().pbcType
)
3177 || gridSet
.domainSetup().haveMultipleDomainsPerDim
[d
])
3183 const real listRangeCellToCell
=
3184 listRangeForGridCellToGridCell(rlist
, iGrid
.dimensions(), jGrid
.dimensions());
3185 if (d
== XX
&& box
[XX
][XX
] - fabs(box
[YY
][XX
]) - fabs(box
[ZZ
][XX
]) < listRangeCellToCell
)
3195 const bool bSimple
= pairlistIsSimple(*nbl
);
3196 gmx::ArrayRef
<const BoundingBox
> bb_i
;
3198 gmx::ArrayRef
<const float> pbb_i
;
3201 bb_i
= iGrid
.iBoundingBoxes();
3205 pbb_i
= iGrid
.packedBoundingBoxes();
3208 /* We use the normal bounding box format for both grid types */
3209 bb_i
= iGrid
.iBoundingBoxes();
3211 gmx::ArrayRef
<const BoundingBox1D
> bbcz_i
= iGrid
.zBoundingBoxes();
3212 gmx::ArrayRef
<const int> flags_i
= iGrid
.clusterFlags();
3213 gmx::ArrayRef
<const BoundingBox1D
> bbcz_j
= jGrid
.zBoundingBoxes();
3214 int cell0_i
= iGrid
.cellOffset();
3218 fprintf(debug
, "nbl nc_i %d col.av. %.1f ci_block %d\n", iGrid
.numCells(),
3219 iGrid
.numCells() / static_cast<double>(iGrid
.numColumns()), ci_block
);
3222 numDistanceChecks
= 0;
3224 const real listRangeBBToJCell2
=
3225 gmx::square(listRangeForBoundingBoxToGridCell(rlist
, jGrid
.dimensions()));
/* Initialize ci_b and ci to 1 before where we want them to start,
* as they will both be incremented in next_ci.
3231 ci
= th
* ci_block
- 1;
3234 while (next_ci(iGrid
, nth
, ci_block
, &ci_x
, &ci_y
, &ci_b
, &ci
))
3236 if (bSimple
&& flags_i
[ci
] == 0)
3240 ncj_old_i
= getNumSimpleJClustersInList(*nbl
);
3243 if (!isIntraGridList
&& shp
[XX
] == 0)
3247 bx1
= bb_i
[ci
].upper
.x
;
3251 bx1
= iGridDims
.lowerCorner
[XX
] + (real(ci_x
) + 1) * iGridDims
.cellSize
[XX
];
3253 if (bx1
< jGridDims
.lowerCorner
[XX
])
3255 d2cx
= gmx::square(jGridDims
.lowerCorner
[XX
] - bx1
);
3257 if (d2cx
>= listRangeBBToJCell2
)
3264 ci_xy
= ci_x
* iGridDims
.numCells
[YY
] + ci_y
;
3266 /* Loop over shift vectors in three dimensions */
3267 for (int tz
= -shp
[ZZ
]; tz
<= shp
[ZZ
]; tz
++)
3269 const real shz
= real(tz
) * box
[ZZ
][ZZ
];
3271 bz0
= bbcz_i
[ci
].lower
+ shz
;
3272 bz1
= bbcz_i
[ci
].upper
+ shz
;
3280 d2z
= gmx::square(bz1
);
3284 d2z
= gmx::square(bz0
- box
[ZZ
][ZZ
]);
3287 d2z_cx
= d2z
+ d2cx
;
3289 if (d2z_cx
>= rlist2
)
3294 bz1_frac
= bz1
/ real(iGrid
.numCellsInColumn(ci_xy
));
3299 /* The check with bz1_frac close to or larger than 1 comes later */
3301 for (int ty
= -shp
[YY
]; ty
<= shp
[YY
]; ty
++)
3303 const real shy
= real(ty
) * box
[YY
][YY
] + real(tz
) * box
[ZZ
][YY
];
3307 by0
= bb_i
[ci
].lower
.y
+ shy
;
3308 by1
= bb_i
[ci
].upper
.y
+ shy
;
3312 by0
= iGridDims
.lowerCorner
[YY
] + (real(ci_y
)) * iGridDims
.cellSize
[YY
] + shy
;
3313 by1
= iGridDims
.lowerCorner
[YY
] + (real(ci_y
) + 1) * iGridDims
.cellSize
[YY
] + shy
;
3316 get_cell_range
<YY
>(by0
, by1
, jGridDims
, d2z_cx
, rlist
, &cyf
, &cyl
);
3324 if (by1
< jGridDims
.lowerCorner
[YY
])
3326 d2z_cy
+= gmx::square(jGridDims
.lowerCorner
[YY
] - by1
);
3328 else if (by0
> jGridDims
.upperCorner
[YY
])
3330 d2z_cy
+= gmx::square(by0
- jGridDims
.upperCorner
[YY
]);
3333 for (int tx
= -shp
[XX
]; tx
<= shp
[XX
]; tx
++)
3335 const int shift
= XYZ2IS(tx
, ty
, tz
);
3337 const bool excludeSubDiagonal
= (isIntraGridList
&& shift
== CENTRAL
);
3339 if (c_pbcShiftBackward
&& isIntraGridList
&& shift
> CENTRAL
)
3345 real(tx
) * box
[XX
][XX
] + real(ty
) * box
[YY
][XX
] + real(tz
) * box
[ZZ
][XX
];
3349 bx0
= bb_i
[ci
].lower
.x
+ shx
;
3350 bx1
= bb_i
[ci
].upper
.x
+ shx
;
3354 bx0
= iGridDims
.lowerCorner
[XX
] + (real(ci_x
)) * iGridDims
.cellSize
[XX
] + shx
;
3355 bx1
= iGridDims
.lowerCorner
[XX
] + (real(ci_x
) + 1) * iGridDims
.cellSize
[XX
] + shx
;
3358 get_cell_range
<XX
>(bx0
, bx1
, jGridDims
, d2z_cy
, rlist
, &cxf
, &cxl
);
3365 addNewIEntry(nbl
, cell0_i
+ ci
, shift
, flags_i
[ci
]);
3367 if ((!c_pbcShiftBackward
|| excludeSubDiagonal
) && cxf
< ci_x
)
3369 /* Leave the pairs with i > j.
3370 * x is the major index, so skip half of it.
3375 set_icell_bb(iGrid
, ci
, shx
, shy
, shz
, nbl
->work
.get());
3377 icell_set_x(cell0_i
+ ci
, shx
, shy
, shz
, nbat
->xstride
, nbat
->x().data(),
3378 kernelType
, nbl
->work
.get());
3380 for (int cx
= cxf
; cx
<= cxl
; cx
++)
3382 const real cx_real
= cx
;
3384 if (jGridDims
.lowerCorner
[XX
] + cx_real
* jGridDims
.cellSize
[XX
] > bx1
)
3386 d2zx
+= gmx::square(jGridDims
.lowerCorner
[XX
]
3387 + cx_real
* jGridDims
.cellSize
[XX
] - bx1
);
3389 else if (jGridDims
.lowerCorner
[XX
] + (cx_real
+ 1) * jGridDims
.cellSize
[XX
] < bx0
)
3391 d2zx
+= gmx::square(jGridDims
.lowerCorner
[XX
]
3392 + (cx_real
+ 1) * jGridDims
.cellSize
[XX
] - bx0
);
3395 if (isIntraGridList
&& cx
== 0 && (!c_pbcShiftBackward
|| shift
== CENTRAL
)
3398 /* Leave the pairs with i > j.
3399 * Skip half of y when i and j have the same x.
3408 for (int cy
= cyf_x
; cy
<= cyl
; cy
++)
3410 const int columnStart
=
3411 jGrid
.firstCellInColumn(cx
* jGridDims
.numCells
[YY
] + cy
);
3412 const int columnEnd
=
3413 jGrid
.firstCellInColumn(cx
* jGridDims
.numCells
[YY
] + cy
+ 1);
3415 const real cy_real
= cy
;
3417 if (jGridDims
.lowerCorner
[YY
] + cy_real
* jGridDims
.cellSize
[YY
] > by1
)
3419 d2zxy
+= gmx::square(jGridDims
.lowerCorner
[YY
]
3420 + cy_real
* jGridDims
.cellSize
[YY
] - by1
);
3422 else if (jGridDims
.lowerCorner
[YY
] + (cy_real
+ 1) * jGridDims
.cellSize
[YY
] < by0
)
3424 d2zxy
+= gmx::square(jGridDims
.lowerCorner
[YY
]
3425 + (cy_real
+ 1) * jGridDims
.cellSize
[YY
] - by0
);
3427 if (columnStart
< columnEnd
&& d2zxy
< listRangeBBToJCell2
)
3429 /* To improve efficiency in the common case
3430 * of a homogeneous particle distribution,
3431 * we estimate the index of the middle cell
3432 * in range (midCell). We search down and up
3433 * starting from this index.
3435 * Note that the bbcz_j array contains bounds
3436 * for i-clusters, thus for clusters of 4 atoms.
3437 * For the common case where the j-cluster size
3438 * is 8, we could step with a stride of 2,
3439 * but we do not do this because it would
3440 * complicate this code even more.
3444 + static_cast<int>(bz1_frac
3445 * static_cast<real
>(columnEnd
- columnStart
));
3446 if (midCell
>= columnEnd
)
3448 midCell
= columnEnd
- 1;
3453 /* Find the lowest cell that can possibly
3455 * Check if we hit the bottom of the grid,
3456 * if the j-cell is below the i-cell and if so,
3457 * if it is within range.
3459 int downTestCell
= midCell
;
3460 while (downTestCell
>= columnStart
3461 && (bbcz_j
[downTestCell
].upper
>= bz0
3462 || d2xy
+ gmx::square(bbcz_j
[downTestCell
].upper
- bz0
) < rlist2
))
3466 int firstCell
= downTestCell
+ 1;
3468 /* Find the highest cell that can possibly
3470 * Check if we hit the top of the grid,
3471 * if the j-cell is above the i-cell and if so,
3472 * if it is within range.
3474 int upTestCell
= midCell
+ 1;
3475 while (upTestCell
< columnEnd
3476 && (bbcz_j
[upTestCell
].lower
<= bz1
3477 || d2xy
+ gmx::square(bbcz_j
[upTestCell
].lower
- bz1
) < rlist2
))
3481 int lastCell
= upTestCell
- 1;
3483 #define NBNXN_REFCODE 0
3486 /* Simple reference code, for debugging,
3487 * overrides the more complex code above.
3489 firstCell
= columnEnd
;
3491 for (int k
= columnStart
; k
< columnEnd
; k
++)
3493 if (d2xy
+ gmx::square(bbcz_j
[k
* NNBSBB_D
+ 1] - bz0
) < rlist2
3498 if (d2xy
+ gmx::square(bbcz_j
[k
* NNBSBB_D
] - bz1
) < rlist2
3507 if (isIntraGridList
)
3509 /* We want each atom/cell pair only once,
3510 * only use cj >= ci.
3512 if (!c_pbcShiftBackward
|| shift
== CENTRAL
)
3514 firstCell
= std::max(firstCell
, ci
);
3518 if (firstCell
<= lastCell
)
3520 GMX_ASSERT(firstCell
>= columnStart
&& lastCell
< columnEnd
,
3521 "The range should reside within the current grid "
3524 /* For f buffer flags with simple lists */
3525 ncj_old_j
= getNumSimpleJClustersInList(*nbl
);
3527 makeClusterListWrapper(nbl
, iGrid
, ci
, jGrid
, firstCell
, lastCell
,
3528 excludeSubDiagonal
, nbat
, rlist2
, rbb2
,
3529 kernelType
, &numDistanceChecks
);
3533 setBufferFlags(*nbl
, ncj_old_j
, gridj_flag_shift
, gridj_flag
, th
);
3536 incrementNumSimpleJClustersInList(nbl
, ncj_old_j
);
3542 if (!exclusions
.empty())
3544 /* Set the exclusions for this ci list */
3545 setExclusionsForIEntry(gridSet
, nbl
, excludeSubDiagonal
, na_cj_2log
,
3546 *getOpenIEntry(nbl
), exclusions
);
3551 make_fep_list(gridSet
.atomIndices(), nbat
, nbl
, excludeSubDiagonal
,
3552 getOpenIEntry(nbl
), shx
, shy
, shz
, rl_fep2
, iGrid
, jGrid
, nbl_fep
);
3555 /* Close this ci list */
3556 closeIEntry(nbl
, nsubpair_max
, progBal
, nsubpair_tot_est
, th
, nth
);
3561 if (bFBufferFlag
&& getNumSimpleJClustersInList(*nbl
) > ncj_old_i
)
3563 bitmask_init_bit(&(work
->buffer_flags
.flag
[(iGrid
.cellOffset() + ci
) >> gridi_flag_shift
]), th
);
3567 work
->ndistc
= numDistanceChecks
;
3569 checkListSizeConsistency(*nbl
, haveFep
);
3573 fprintf(debug
, "number of distance checks %d\n", numDistanceChecks
);
3575 print_nblist_statistics(debug
, *nbl
, gridSet
, rlist
);
3579 fprintf(debug
, "nbl FEP list pairs: %d\n", nbl_fep
->nrj
);
3584 static void reduce_buffer_flags(gmx::ArrayRef
<PairsearchWork
> searchWork
,
3586 const nbnxn_buffer_flags_t
* dest
)
3588 for (int s
= 0; s
< nsrc
; s
++)
3590 gmx_bitmask_t
* flag
= searchWork
[s
].buffer_flags
.flag
;
3592 for (int b
= 0; b
< dest
->nflag
; b
++)
3594 bitmask_union(&(dest
->flag
[b
]), flag
[b
]);
3599 static void print_reduction_cost(const nbnxn_buffer_flags_t
* flags
, int nout
)
3601 int nelem
, nkeep
, ncopy
, nred
, out
;
3602 gmx_bitmask_t mask_0
;
3608 bitmask_init_bit(&mask_0
, 0);
3609 for (int b
= 0; b
< flags
->nflag
; b
++)
3611 if (bitmask_is_equal(flags
->flag
[b
], mask_0
))
3613 /* Only flag 0 is set, no copy of reduction required */
3617 else if (!bitmask_is_zero(flags
->flag
[b
]))
3620 for (out
= 0; out
< nout
; out
++)
3622 if (bitmask_is_set(flags
->flag
[b
], out
))
3640 "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
3641 flags
->nflag
, nout
, nelem
/ static_cast<double>(flags
->nflag
),
3642 nkeep
/ static_cast<double>(flags
->nflag
), ncopy
/ static_cast<double>(flags
->nflag
),
3643 nred
/ static_cast<double>(flags
->nflag
));
3646 /* Copies the list entries from src to dest when cjStart <= *cjGlobal < cjEnd.
3647 * *cjGlobal is updated with the cj count in src.
3648 * When setFlags==true, flag bit t is set in flag for all i and j clusters.
3650 template<bool setFlags
>
3651 static void copySelectedListRange(const nbnxn_ci_t
* gmx_restrict srcCi
,
3652 const NbnxnPairlistCpu
* gmx_restrict src
,
3653 NbnxnPairlistCpu
* gmx_restrict dest
,
3654 gmx_bitmask_t
* flag
,
3659 const int ncj
= srcCi
->cj_ind_end
- srcCi
->cj_ind_start
;
3661 dest
->ci
.push_back(*srcCi
);
3662 dest
->ci
.back().cj_ind_start
= dest
->cj
.size();
3663 dest
->ci
.back().cj_ind_end
= dest
->ci
.back().cj_ind_start
+ ncj
;
3667 bitmask_init_bit(&flag
[srcCi
->ci
>> iFlagShift
], t
);
3670 for (int j
= srcCi
->cj_ind_start
; j
< srcCi
->cj_ind_end
; j
++)
3672 dest
->cj
.push_back(src
->cj
[j
]);
3676 /* NOTE: This is relatively expensive, since this
3677 * operation is done for all elements in the list,
3678 * whereas at list generation this is done only
3679 * once for each flag entry.
3681 bitmask_init_bit(&flag
[src
->cj
[j
].cj
>> jFlagShift
], t
);
#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ == 7
/* Avoid gcc 7 avx512 loop vectorization bug (actually only needed with -mavx512f) */
#    pragma GCC push_options
#    pragma GCC optimize("no-tree-vectorize")
#endif

/* Returns the number of cluster pairs that are in use summed over all lists */
static int countClusterpairs(gmx::ArrayRef<const NbnxnPairlistCpu> pairlists)
{
    /* gcc 7 with -mavx512f can miss the contributions of 16 consecutive
     * elements to the sum calculated in this loop. Above we have disabled
     * loop vectorization to avoid this bug.
     */
    int ncjTotal = 0;
    for (const auto& pairlist : pairlists)
    {
        ncjTotal += pairlist.ncjInUse;
    }
    return ncjTotal;
}

#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ == 7
#    pragma GCC pop_options
#endif
3711 /* This routine re-balances the pairlists such that all are nearly equally
3712 * sized. Only whole i-entries are moved between lists. These are moved
3713 * between the ends of the lists, such that the buffer reduction cost should
3714 * not change significantly.
3715 * Note that all original reduction flags are currently kept. This can lead
3716 * to reduction of parts of the force buffer that could be avoided. But since
3717 * the original lists are quite balanced, this will only give minor overhead.
3719 static void rebalanceSimpleLists(gmx::ArrayRef
<const NbnxnPairlistCpu
> srcSet
,
3720 gmx::ArrayRef
<NbnxnPairlistCpu
> destSet
,
3721 gmx::ArrayRef
<PairsearchWork
> searchWork
)
3723 const int ncjTotal
= countClusterpairs(srcSet
);
3724 const int numLists
= srcSet
.ssize();
3725 const int ncjTarget
= (ncjTotal
+ numLists
- 1) / numLists
;
3727 #pragma omp parallel num_threads(numLists)
3729 int t
= gmx_omp_get_thread_num();
3731 int cjStart
= ncjTarget
* t
;
3732 int cjEnd
= ncjTarget
* (t
+ 1);
3734 /* The destination pair-list for task/thread t */
3735 NbnxnPairlistCpu
& dest
= destSet
[t
];
3737 clear_pairlist(&dest
);
3738 dest
.na_cj
= srcSet
[0].na_cj
;
3740 /* Note that the flags in the work struct (still) contain flags
3741 * for all entries that are present in srcSet->nbl[t].
3743 gmx_bitmask_t
* flag
= searchWork
[t
].buffer_flags
.flag
;
3745 int iFlagShift
= getBufferFlagShift(dest
.na_ci
);
3746 int jFlagShift
= getBufferFlagShift(dest
.na_cj
);
3749 for (int s
= 0; s
< numLists
&& cjGlobal
< cjEnd
; s
++)
3751 const NbnxnPairlistCpu
* src
= &srcSet
[s
];
3753 if (cjGlobal
+ src
->ncjInUse
> cjStart
)
3755 for (gmx::index i
= 0; i
< gmx::ssize(src
->ci
) && cjGlobal
< cjEnd
; i
++)
3757 const nbnxn_ci_t
* srcCi
= &src
->ci
[i
];
3758 int ncj
= srcCi
->cj_ind_end
- srcCi
->cj_ind_start
;
3759 if (cjGlobal
>= cjStart
)
3761 /* If the source list is not our own, we need to set
3762 * extra flags (the template bool parameter).
3766 copySelectedListRange
<true>(srcCi
, src
, &dest
, flag
, iFlagShift
, jFlagShift
, t
);
3770 copySelectedListRange
<false>(srcCi
, src
, &dest
, flag
, iFlagShift
,
3779 cjGlobal
+= src
->ncjInUse
;
3783 dest
.ncjInUse
= dest
.cj
.size();
3787 const int ncjTotalNew
= countClusterpairs(destSet
);
3788 GMX_RELEASE_ASSERT(ncjTotalNew
== ncjTotal
,
3789 "The total size of the lists before and after rebalancing should match");
/* Returns if the pairlists are so imbalanced that it is worth rebalancing. */
static bool checkRebalanceSimpleLists(gmx::ArrayRef<const NbnxnPairlistCpu> lists)
{
    int numLists = lists.ssize();
    int ncjMax   = 0;
    int ncjTotal = 0;
    for (int s = 0; s < numLists; s++)
    {
        ncjMax = std::max(ncjMax, lists[s].ncjInUse);
        ncjTotal += lists[s].ncjInUse;
    }
    if (debug)
    {
        fprintf(debug, "Pair-list ncjMax %d ncjTotal %d\n", ncjMax, ncjTotal);
    }
    /* The rebalancing adds 3% extra time to the search. Heuristically we
     * determined that under common conditions the non-bonded kernel balance
     * improvement will outweigh this when the imbalance is more than 3%.
     * But this will, obviously, depend on search vs kernel time and nstlist.
     */
    const real rebalanceTolerance = 1.03;

    return real(numLists * ncjMax) > real(ncjTotal) * rebalanceTolerance;
}
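/* Worked example (hypothetical counts, guarded out): with lists of 1200, 1000,
 * 950 and 900 cluster pairs, 4 * 1200 = 4800 exceeds 4050 * 1.03, so the 3%
 * tolerance above would trigger rebalancing.
 */
#if 0
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int> ncjInUse = { 1200, 1000, 950, 900 }; /* hypothetical */
    int                    ncjMax = 0, ncjTotal = 0;
    for (int n : ncjInUse)
    {
        ncjMax = std::max(ncjMax, n);
        ncjTotal += n;
    }
    const double tolerance = 1.03; /* same 3% threshold as above */
    const bool   rebalance = static_cast<double>(ncjInUse.size() * ncjMax) > ncjTotal * tolerance;
    std::printf("max %d, total %d -> rebalance: %s\n", ncjMax, ncjTotal, rebalance ? "yes" : "no");
    return 0;
}
#endif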
/* Perform a count (linear) sort to sort the smaller lists to the end.
 * This avoids load imbalance on the GPU, as large lists will be
 * scheduled and executed first and the smaller lists later.
 * Load balancing between multi-processors only happens at the end
 * and there smaller lists lead to more effective load balancing.
 * The sorting is done on the cj4 count, not on the actual pair counts.
 * Not only does this make the sort faster, but it also results in
 * better load balancing than using a list sorted on exact load.
 * This function swaps the pointer in the pair list to avoid a copy operation.
 */
static void sort_sci(NbnxnPairlistGpu* nbl)
{
    if (nbl->cj4.size() <= nbl->sci.size())
    {
        /* nsci = 0 or all sci have size 1, sorting won't change the order */
        return;
    }

    NbnxnPairlistGpuWork& work = *nbl->work;

    /* We will distinguish differences up to double the average */
    const int m = static_cast<int>((2 * ssize(nbl->cj4)) / ssize(nbl->sci));

    /* Resize work.sci_sort so we can sort into it */
    work.sci_sort.resize(nbl->sci.size());

    std::vector<int>& sort = work.sortBuffer;
    /* Set up m + 1 entries in sort, initialized at 0 */
    sort.clear();
    sort.resize(m + 1, 0);
    /* Count the entries of each size */
    for (const nbnxn_sci_t& sci : nbl->sci)
    {
        int i = std::min(m, sci.numJClusterGroups());
        sort[i]++;
    }
    /* Calculate the offset for each count */
    int s0  = sort[m];
    sort[m] = 0;
    for (gmx::index i = m - 1; i >= 0; i--)
    {
        int s1  = sort[i];
        sort[i] = sort[i + 1] + s0;
        s0      = s1;
    }

    /* Sort entries directly into place */
    gmx::ArrayRef<nbnxn_sci_t> sci_sort = work.sci_sort;
    for (const nbnxn_sci_t& sci : nbl->sci)
    {
        int i               = std::min(m, sci.numJClusterGroups());
        sci_sort[sort[i]++] = sci;
    }

    /* Swap the sci pointers so we use the new, sorted list */
    std::swap(nbl->sci, work.sci_sort);
}
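/* Illustrative sketch only (guarded out): the same counting-sort idea as
 * sort_sci() above, applied to a plain vector of sizes so that larger entries
 * end up first. The bucket count m and the input sizes are made up.
 */
#if 0
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int> sizes = { 1, 4, 2, 7, 1, 3 }; /* hypothetical cj4 counts */
    const int              m     = 8;                    /* bucket count, chosen >= largest size */

    /* Count entries per size, then turn counts into descending-order offsets */
    std::vector<int> offset(m + 1, 0);
    for (int s : sizes)
    {
        offset[std::min(m, s)]++;
    }
    int s0    = offset[m];
    offset[m] = 0;
    for (int i = m - 1; i >= 0; i--)
    {
        const int s1 = offset[i];
        offset[i]    = offset[i + 1] + s0;
        s0           = s1;
    }
    /* Scatter entries directly into their sorted positions */
    std::vector<int> sorted(sizes.size());
    for (int s : sizes)
    {
        sorted[offset[std::min(m, s)]++] = s;
    }
    for (int s : sorted)
    {
        std::printf("%d ", s); /* prints: 7 4 3 2 1 1 */
    }
    std::printf("\n");
    return 0;
}
#endif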
/* Returns the i-zone range for pairlist construction for the given locality */
3877 static Range
<int> getIZoneRange(const Nbnxm::GridSet::DomainSetup
& domainSetup
,
3878 const InteractionLocality locality
)
3880 if (domainSetup
.doTestParticleInsertion
)
3882 /* With TPI we do grid 1, the inserted molecule, versus grid 0, the rest */
3885 else if (locality
== InteractionLocality::Local
)
3887 /* Local: only zone (grid) 0 vs 0 */
3892 /* Non-local: we need all i-zones */
3893 return { 0, int(domainSetup
.zones
->iZones
.size()) };
/* Returns the j-zone range for pairlist construction for the given locality and i-zone */
3898 static Range
<int> getJZoneRange(const gmx_domdec_zones_t
& ddZones
,
3899 const InteractionLocality locality
,
3902 if (locality
== InteractionLocality::Local
)
3904 /* Local: zone 0 vs 0 or with TPI 1 vs 0 */
3907 else if (iZone
== 0)
3909 /* Non-local: we need to avoid the local (zone 0 vs 0) interactions */
3910 return { 1, *ddZones
.iZones
[iZone
].jZoneRange
.end() };
3914 /* Non-local with non-local i-zone: use all j-zones */
3915 return ddZones
.iZones
[iZone
].jZoneRange
;
3919 //! Prepares CPU lists produced by the search for dynamic pruning
3920 static void prepareListsForDynamicPruning(gmx::ArrayRef
<NbnxnPairlistCpu
> lists
);
3922 void PairlistSet::constructPairlists(const Nbnxm::GridSet
& gridSet
,
3923 gmx::ArrayRef
<PairsearchWork
> searchWork
,
3924 nbnxn_atomdata_t
* nbat
,
3925 const ListOfLists
<int>& exclusions
,
3926 const int minimumIlistCountForGpuBalancing
,
3928 SearchCycleCounting
* searchCycleCounting
)
3930 const real rlist
= params_
.rlistOuter
;
3932 int nsubpair_target
;
3933 float nsubpair_tot_est
;
3936 int np_tot
, np_noq
, np_hlj
, nap
;
3938 const int numLists
= (isCpuType_
? cpuLists_
.size() : gpuLists_
.size());
3942 fprintf(debug
, "ns making %d nblists\n", numLists
);
3945 nbat
->bUseBufferFlags
= (nbat
->out
.size() > 1);
3946 /* We should re-init the flags before making the first list */
3947 if (nbat
->bUseBufferFlags
&& locality_
== InteractionLocality::Local
)
3949 init_buffer_flags(&nbat
->buffer_flags
, nbat
->numAtoms());
3952 if (!isCpuType_
&& minimumIlistCountForGpuBalancing
> 0)
3954 get_nsubpair_target(gridSet
, locality_
, rlist
, minimumIlistCountForGpuBalancing
,
3955 &nsubpair_target
, &nsubpair_tot_est
);
3959 nsubpair_target
= 0;
3960 nsubpair_tot_est
= 0;
3963 /* Clear all pair-lists */
3964 for (int th
= 0; th
< numLists
; th
++)
3968 clear_pairlist(&cpuLists_
[th
]);
3972 clear_pairlist(&gpuLists_
[th
]);
3975 if (params_
.haveFep
)
3977 clear_pairlist_fep(fepLists_
[th
].get());
3981 const gmx_domdec_zones_t
& ddZones
= *gridSet
.domainSetup().zones
;
3983 const auto iZoneRange
= getIZoneRange(gridSet
.domainSetup(), locality_
);
3985 for (const int iZone
: iZoneRange
)
3987 const Grid
& iGrid
= gridSet
.grids()[iZone
];
3989 const auto jZoneRange
= getJZoneRange(ddZones
, locality_
, iZone
);
3991 for (int jZone
: jZoneRange
)
3993 const Grid
& jGrid
= gridSet
.grids()[jZone
];
3997 fprintf(debug
, "ns search grid %d vs %d\n", iZone
, jZone
);
4000 searchCycleCounting
->start(enbsCCsearch
);
4002 ci_block
= get_ci_block_size(iGrid
, gridSet
.domainSetup().haveMultipleDomains
, numLists
);
4004 /* With GPU: generate progressively smaller lists for
4005 * load balancing for local only or non-local with 2 zones.
4007 progBal
= (locality_
== InteractionLocality::Local
|| ddZones
.n
<= 2);
#pragma omp parallel for num_threads(numLists) schedule(static)
            for (int th = 0; th < numLists; th++)
            {
                try
                {
                    /* Re-init the thread-local work flag data before making
                     * the first list (not an elegant conditional).
                     */
                    if (nbat->bUseBufferFlags && (iZone == 0 && jZone == 0))
                    {
                        init_buffer_flags(&searchWork[th].buffer_flags, nbat->numAtoms());
                    }

                    if (combineLists_ && th > 0)
                    {
                        GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");

                        clear_pairlist(&gpuLists_[th]);
                    }

                    PairsearchWork& work = searchWork[th];

                    work.cycleCounter.start();

                    t_nblist* fepListPtr = (fepLists_.empty() ? nullptr : fepLists_[th].get());

                    /* Divide the i cells equally over the pairlists */
                    if (isCpuType_)
                    {
                        nbnxn_make_pairlist_part(gridSet, iGrid, jGrid, &work, nbat, exclusions, rlist,
                                                 params_.pairlistType, ci_block, nbat->bUseBufferFlags,
                                                 nsubpair_target, progBal, nsubpair_tot_est, th,
                                                 numLists, &cpuLists_[th], fepListPtr);
                    }
                    else
                    {
                        nbnxn_make_pairlist_part(gridSet, iGrid, jGrid, &work, nbat, exclusions, rlist,
                                                 params_.pairlistType, ci_block, nbat->bUseBufferFlags,
                                                 nsubpair_target, progBal, nsubpair_tot_est, th,
                                                 numLists, &gpuLists_[th], fepListPtr);
                    }

                    work.cycleCounter.stop();
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
            }
            searchCycleCounting->stop(enbsCCsearch);
            np_tot = 0;
            np_noq = 0;
            np_hlj = 0;
            for (int th = 0; th < numLists; th++)
            {
                inc_nrnb(nrnb, eNR_NBNXN_DIST2, searchWork[th].ndistc);

                if (isCpuType_)
                {
                    const NbnxnPairlistCpu& nbl = cpuLists_[th];
                    np_tot += nbl.cj.size();
                    np_noq += nbl.work->ncj_noq;
                    np_hlj += nbl.work->ncj_hlj;
                }
                else
                {
                    const NbnxnPairlistGpu& nbl = gpuLists_[th];
                    /* This count ignores potential subsequent pair pruning */
                    np_tot += nbl.nci_tot;
                }
            }
            if (isCpuType_)
            {
                nap = cpuLists_[0].na_ci * cpuLists_[0].na_cj;
            }
            else
            {
                nap = gmx::square(gpuLists_[0].na_ci);
            }
            natpair_ljq_ = (np_tot - np_noq) * nap - np_hlj * nap / 2;
            natpair_lj_  = np_noq * nap;
            natpair_q_   = np_hlj * nap / 2;
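
            /* The natpair_* counts set above are atom-pair estimates used for flop
             * accounting: nap is the number of atom pairs per cluster pair, np_noq
             * counts cluster pairs without charges and np_hlj pairs where only half
             * the atoms have LJ parameters, from which the LJ+Q, LJ-only and Q-only
             * pair counts are derived.
             */
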
            if (combineLists_ && numLists > 1)
            {
                GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");

                searchCycleCounting->start(enbsCCcombine);

                combine_nblists(gmx::constArrayRefFromArray(&gpuLists_[1], numLists - 1), &gpuLists_[0]);

                searchCycleCounting->stop(enbsCCcombine);
            }
        }
    }

    if (isCpuType_)
    {
        if (numLists > 1 && checkRebalanceSimpleLists(cpuLists_))
        {
            rebalanceSimpleLists(cpuLists_, cpuListsWork_, searchWork);

            /* Swap the sets of pair lists */
            cpuLists_.swap(cpuListsWork_);
        }
    }
    else
    {
        /* Sort the entries on size, large ones first */
        if (combineLists_ || gpuLists_.size() == 1)
        {
            sort_sci(&gpuLists_[0]);
        }
        else
        {
#pragma omp parallel for num_threads(numLists) schedule(static)
            for (int th = 0; th < numLists; th++)
            {
                try
                {
                    sort_sci(&gpuLists_[th]);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
            }
        }
    }
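
    /* Fold the per-thread buffer flags into the shared flags in nbat, so the
     * force reduction knows which output blocks were written by any thread.
     */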
    if (nbat->bUseBufferFlags)
    {
        reduce_buffer_flags(searchWork, numLists, &nbat->buffer_flags);
    }

    if (gridSet.haveFep())
    {
        /* Balance the free-energy lists over all the threads */
        balance_fep_lists(fepLists_, searchWork);
    }

    if (isCpuType_)
    {
        /* This is a fresh list, so not pruned, stored using ci.
         * ciOuter is invalid at this point.
         */
        GMX_ASSERT(cpuLists_[0].ciOuter.empty(), "ciOuter is invalid so it should be empty");
    }

    /* If we have more than one list, they either got rebalanced (CPU)
     * or combined (GPU), so we should dump the final result to debug.
     */
    if (debug)
    {
        if (isCpuType_ && cpuLists_.size() > 1)
        {
            for (auto& cpuList : cpuLists_)
            {
                print_nblist_statistics(debug, cpuList, gridSet, rlist);
            }
        }
        else if (!isCpuType_ && gpuLists_.size() > 1)
        {
            print_nblist_statistics(debug, gpuLists_[0], gridSet, rlist);
        }

        if (gmx_debug_at)
        {
            if (isCpuType_)
            {
                for (auto& cpuList : cpuLists_)
                {
                    print_nblist_ci_cj(debug, cpuList);
                }
            }
            else
            {
                print_nblist_sci_cj(debug, gpuLists_[0]);
            }
        }

        if (nbat->bUseBufferFlags)
        {
            print_reduction_cost(&nbat->buffer_flags, numLists);
        }
    }

    if (params_.useDynamicPruning && isCpuType_)
    {
        prepareListsForDynamicPruning(cpuLists_);
    }
}
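
/* Entry point used per search step: checks the exclusions, constructs the pair
 * lists for the requested locality and updates the search cycle accounting.
 */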
void PairlistSets::construct(const InteractionLocality iLocality,
                             PairSearch*               pairSearch,
                             nbnxn_atomdata_t*         nbat,
                             const ListOfLists<int>&   exclusions,
                             const int64_t             step,
                             t_nrnb*                   nrnb)
{
    const auto& gridSet = pairSearch->gridSet();
    const auto* ddZones = gridSet.domainSetup().zones;

    /* The Nbnxm code can also work with more exclusions than those in i-zones only
     * when using DD, but the equality check can catch more issues.
     */
    GMX_RELEASE_ASSERT(
            exclusions.empty() || (!ddZones && exclusions.ssize() == gridSet.numRealAtomsTotal())
                    || (ddZones && exclusions.ssize() == ddZones->cg_range[ddZones->iZones.size()]),
            "exclusions should either be empty or the number of lists should match the number of "
            "local i-atoms");

    pairlistSet(iLocality).constructPairlists(gridSet, pairSearch->work(), nbat, exclusions,
                                              minimumIlistCountForGpuBalancing_, nrnb,
                                              &pairSearch->cycleCounting_);

    if (iLocality == InteractionLocality::Local)
    {
        outerListCreationStep_ = step;
    }
    else
    {
        GMX_RELEASE_ASSERT(outerListCreationStep_ == step,
                           "Outer list should be created at the same step as the inner list");
    }

    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
    if (iLocality == InteractionLocality::Local)
    {
        pairSearch->cycleCounting_.searchCount_++;
    }
    if (pairSearch->cycleCounting_.recordCycles_
        && (!pairSearch->gridSet().domainSetup().haveMultipleDomains
            || iLocality == InteractionLocality::NonLocal)
        && pairSearch->cycleCounting_.searchCount_ % 100 == 0)
    {
        pairSearch->cycleCounting_.printCycles(stderr, pairSearch->work());
    }
}
void nonbonded_verlet_t::constructPairlist(const InteractionLocality iLocality,
                                           const ListOfLists<int>&   exclusions,
                                           int64_t                   step,
                                           t_nrnb*                   nrnb)
{
    pairlistSets_->construct(iLocality, pairSearch_.get(), nbat.get(), exclusions, step, nrnb);

    if (useGpu())
    {
        /* Launch the transfer of the pairlist to the GPU.
         *
         * NOTE: The launch overhead is currently not timed separately
         */
        Nbnxm::gpu_init_pairlist(gpu_nbv, pairlistSets().pairlistSet(iLocality).gpuList(), iLocality);
    }
}
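
/* Note: nonbonded_verlet_t::constructPairlist() above is the pairlist construction
 * entry point used by the MD loop; it is typically called on search steps for the
 * local lists and, with domain decomposition, also for the non-local lists.
 */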
static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists)
{
    /* TODO: Restructure the lists so we have actual outer and inner
     *       list objects, so that a single pointer can be set instead of
     *       swapping several pointers.
     */

    for (auto& list : lists)
    {
        /* The search produced a list in ci/cj.
         * Swap the list pointers so that the outer list ends up in ciOuter/cjOuter,
         * which can then be pruned to produce an inner list in ci/cj.
         */
        GMX_RELEASE_ASSERT(list.ciOuter.empty() && list.cjOuter.empty(),
                           "The outer lists should be empty before preparation");

        std::swap(list.ci, list.ciOuter);
        std::swap(list.cj, list.cjOuter);
    }
}