 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/group.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm_geometry.h"
#include "gromacs/nbnxm/nbnxm_simd.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/simd/simd.h"
#include "gromacs/simd/vector_operations.h"
#include "gromacs/topology/block.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxomp.h"
#include "gromacs/utility/smalloc.h"

#include "clusterdistancekerneltype.h"
#include "pairlistset.h"
#include "pairlistsets.h"
#include "pairlistwork.h"
#include "pairsearch.h"
using namespace gmx; // TODO: Remove when this file is moved into gmx namespace

using BoundingBox   = Nbnxm::BoundingBox;   // TODO: Remove when refactoring this file
using BoundingBox1D = Nbnxm::BoundingBox1D; // TODO: Remove when refactoring this file

using Grid = Nbnxm::Grid; // TODO: Remove when refactoring this file

// Convenience alias for partial Nbnxm namespace usage
using InteractionLocality = Nbnxm::InteractionLocality;

/* We shift the i-particles backward for PBC.
 * This leads to more conditionals than shifting forward.
 * We do this to get more balanced pair lists.
 */
constexpr bool c_pbcShiftBackward = true;

/* Layout for the nonbonded NxN pair lists */
enum class NbnxnLayout
{
    NoSimd4x4, // i-cluster size 4, j-cluster size 4
    Simd4xN,   // i-cluster size 4, j-cluster size SIMD width
    Simd2xNN,  // i-cluster size 4, j-cluster size half SIMD width
    Gpu8x8x8   // i-cluster size 8, j-cluster size 8 + super-clustering
};
/* Returns the j-cluster size */
template <NbnxnLayout layout>
static constexpr int jClusterSize()
{
    static_assert(layout == NbnxnLayout::NoSimd4x4 || layout == NbnxnLayout::Simd4xN || layout == NbnxnLayout::Simd2xNN,
                  "Currently jClusterSize only supports CPU layouts");

    return layout == NbnxnLayout::Simd4xN ? GMX_SIMD_REAL_WIDTH :
           (layout == NbnxnLayout::Simd2xNN ? GMX_SIMD_REAL_WIDTH/2 : c_nbnxnCpuIClusterSize);
}
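/* Illustrative note (added for clarity, not part of the original file): assuming an
 * 8-wide SIMD build (GMX_SIMD_REAL_WIDTH == 8) and c_nbnxnCpuIClusterSize == 4, this gives
 * jClusterSize<NbnxnLayout::Simd4xN>() == 8, jClusterSize<NbnxnLayout::Simd2xNN>() == 4
 * and jClusterSize<NbnxnLayout::NoSimd4x4>() == 4.
 */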
/*! \brief Returns the j-cluster index given the i-cluster index.
 *
 * \tparam    jClusterSize      The number of atoms in a j-cluster
 * \tparam    jSubClusterIndex  The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
 * \param[in] ci                The i-cluster index
 */
template <int jClusterSize, int jSubClusterIndex>
static inline int cjFromCi(int ci)
{
    static_assert(jClusterSize == c_nbnxnCpuIClusterSize/2 || jClusterSize == c_nbnxnCpuIClusterSize ||
                  jClusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");

    static_assert(jSubClusterIndex == 0 || jSubClusterIndex == 1,
                  "Only sub-cluster indices 0 and 1 are supported");

    if (jClusterSize == c_nbnxnCpuIClusterSize/2)
    {
        if (jSubClusterIndex == 0)
        {
            return ci << 1;
        }
        else
        {
            return ((ci + 1) << 1) - 1;
        }
    }
    else if (jClusterSize == c_nbnxnCpuIClusterSize)
    {
        return ci;
    }
    else
    {
        return ci >> 1;
    }
}
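/* Worked example (added for clarity, not part of the original file): with
 * c_nbnxnCpuIClusterSize == 4 and the mapping above, cjFromCi<2, 0>(3) == 6 and
 * cjFromCi<2, 1>(3) == 7 (two half-size j-clusters per i-cluster), while
 * cjFromCi<4, 0>(3) == 3 and cjFromCi<8, 0>(3) == 1.
 */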
/*! \brief Returns the j-cluster index given the i-cluster index.
 *
 * \tparam    layout            The pair-list layout
 * \tparam    jSubClusterIndex  The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
 * \param[in] ci                The i-cluster index
 */
template <NbnxnLayout layout, int jSubClusterIndex>
static inline int cjFromCi(int ci)
{
    constexpr int clusterSize = jClusterSize<layout>();

    return cjFromCi<clusterSize, jSubClusterIndex>(ci);
}

/* Returns the nbnxn coordinate data index given the i-cluster index */
template <NbnxnLayout layout>
static inline int xIndexFromCi(int ci)
{
    constexpr int clusterSize = jClusterSize<layout>();

    static_assert(clusterSize == c_nbnxnCpuIClusterSize/2 || clusterSize == c_nbnxnCpuIClusterSize ||
                  clusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");

    if (clusterSize <= c_nbnxnCpuIClusterSize)
    {
        /* Coordinates are stored packed in groups of 4 */
        return ci*STRIDE_P4;
    }
    else
    {
        /* Coordinates packed in 8, i-cluster size is half the packing width */
        return (ci >> 1)*STRIDE_P8 + (ci & 1)*(c_packX8 >> 1);
    }
}
/* Returns the nbnxn coordinate data index given the j-cluster index */
template <NbnxnLayout layout>
static inline int xIndexFromCj(int cj)
{
    constexpr int clusterSize = jClusterSize<layout>();

    static_assert(clusterSize == c_nbnxnCpuIClusterSize/2 || clusterSize == c_nbnxnCpuIClusterSize ||
                  clusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");

    if (clusterSize == c_nbnxnCpuIClusterSize/2)
    {
        /* Coordinates are stored packed in groups of 4 */
        return (cj >> 1)*STRIDE_P4 + (cj & 1)*(c_packX4 >> 1);
    }
    else if (clusterSize == c_nbnxnCpuIClusterSize)
    {
        /* Coordinates are stored packed in groups of 4 */
        return cj*STRIDE_P4;
    }
    else
    {
        /* Coordinates are stored packed in groups of 8 */
        return cj*STRIDE_P8;
    }
}
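/* Illustrative note (added for clarity, not part of the original file): assuming
 * c_packX4 == 4 and STRIDE_P4 == DIM*c_packX4 == 12, a half-size j-cluster index
 * cj == 3 maps to (3 >> 1)*12 + (3 & 1)*(4 >> 1) == 14, i.e. the second half of
 * the second group of four packed x-coordinates.
 */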
void nbnxn_init_pairlist_fep(t_nblist *nl)
{
    nl->type      = GMX_NBLIST_INTERACTION_FREE_ENERGY;
    nl->igeometry = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
    /* The interaction functions are set in the free energy kernel function */
    nl->jindex    = nullptr;
    nl->excl_fep  = nullptr;
}

static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
                              int                   natoms)
{
    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
    if (flags->nflag > flags->flag_nalloc)
    {
        flags->flag_nalloc = over_alloc_large(flags->nflag);
        srenew(flags->flag, flags->flag_nalloc);
    }
    for (int b = 0; b < flags->nflag; b++)
    {
        bitmask_clear(&(flags->flag[b]));
    }
}
/* Returns the pair-list cutoff between a bounding box and a grid cell given an atom-to-atom pair-list cutoff
 *
 * Given a cutoff distance between atoms, this function returns the cutoff
 * distance between a bounding box of a group of atoms and a grid cell.
 * Since atoms can be geometrically outside of the cell they have been
 * assigned to (when atom groups instead of individual atoms are assigned
 * to cells), the distance returned can be larger than the input.
 */
static real
listRangeForBoundingBoxToGridCell(real                    rlist,
                                  const Grid::Dimensions &gridDims)
{
    return rlist + gridDims.maxAtomGroupRadius;
}

/* Returns the pair-list cutoff between two grid cells given an atom-to-atom pair-list cutoff
 *
 * Given a cutoff distance between atoms, this function returns the cutoff
 * distance between two grid cells.
 * Since atoms can be geometrically outside of the cell they have been
 * assigned to (when atom groups instead of individual atoms are assigned
 * to cells), the distance returned can be larger than the input.
 */
static real
listRangeForGridCellToGridCell(real                    rlist,
                               const Grid::Dimensions &iGridDims,
                               const Grid::Dimensions &jGridDims)
{
    return rlist + iGridDims.maxAtomGroupRadius + jGridDims.maxAtomGroupRadius;
}
/* Determines the cell range along one dimension that
 * the bounding box b0 - b1 sees.
 */
template<int dim>
static void get_cell_range(real b0, real b1,
                           const Grid::Dimensions &jGridDims,
                           real d2, real rlist, int *cf, int *cl)
{
    real listRangeBBToCell2 = gmx::square(listRangeForBoundingBoxToGridCell(rlist, jGridDims));
    real distanceInCells    = (b0 - jGridDims.lowerCorner[dim])*jGridDims.invCellSize[dim];
    *cf                     = std::max(static_cast<int>(distanceInCells), 0);

    while (*cf > 0 &&
           d2 + gmx::square((b0 - jGridDims.lowerCorner[dim]) - (*cf - 1 + 1)*jGridDims.cellSize[dim]) < listRangeBBToCell2)
    {
        (*cf)--;
    }

    *cl = std::min(static_cast<int>((b1 - jGridDims.lowerCorner[dim])*jGridDims.invCellSize[dim]), jGridDims.numCells[dim] - 1);
    while (*cl < jGridDims.numCells[dim] - 1 &&
           d2 + gmx::square((*cl + 1)*jGridDims.cellSize[dim] - (b1 - jGridDims.lowerCorner[dim])) < listRangeBBToCell2)
    {
        (*cl)++;
    }
}
/* Reference code calculating the distance^2 between two bounding boxes */
static float box_dist2(float bx0, float bx1, float by0,
                       float by1, float bz0, float bz1,
                       const BoundingBox *bb)
{
    float d2 = 0;
    float dl, dh, dm, dm0;

    dl  = bx0 - bb->upper.x;
    dh  = bb->lower.x - bx1;
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0f);
    d2 += dm0*dm0;

    dl  = by0 - bb->upper.y;
    dh  = bb->lower.y - by1;
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0f);
    d2 += dm0*dm0;

    dl  = bz0 - bb->upper.z;
    dh  = bb->lower.z - bz1;
    dm  = std::max(dl, dh);
    dm0 = std::max(dm, 0.0f);
    d2 += dm0*dm0;

    return d2;
}
#if !NBNXN_SEARCH_BB_SIMD4

/*! \brief Plain C code calculating the distance^2 between two bounding boxes in xyz0 format
 *
 * \param[in] bb_i  First bounding box
 * \param[in] bb_j  Second bounding box
 */
static float clusterBoundingBoxDistance2(const BoundingBox &bb_i,
                                         const BoundingBox &bb_j)
{
    float dl  = bb_i.lower.x - bb_j.upper.x;
    float dh  = bb_j.lower.x - bb_i.upper.x;
    float dm  = std::max(dl, dh);
    float dm0 = std::max(dm, 0.0f);
    float d2  = dm0*dm0;

    dl   = bb_i.lower.y - bb_j.upper.y;
    dh   = bb_j.lower.y - bb_i.upper.y;
    dm   = std::max(dl, dh);
    dm0  = std::max(dm, 0.0f);
    d2  += dm0*dm0;

    dl   = bb_i.lower.z - bb_j.upper.z;
    dh   = bb_j.lower.z - bb_i.upper.z;
    dm   = std::max(dl, dh);
    dm0  = std::max(dm, 0.0f);
    d2  += dm0*dm0;

    return d2;
}
#else /* NBNXN_SEARCH_BB_SIMD4 */

/*! \brief 4-wide SIMD code calculating the distance^2 between two bounding boxes in xyz0 format
 *
 * \param[in] bb_i  First bounding box, should be aligned for 4-wide SIMD
 * \param[in] bb_j  Second bounding box, should be aligned for 4-wide SIMD
 */
static float clusterBoundingBoxDistance2(const BoundingBox &bb_i,
                                         const BoundingBox &bb_j)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    const Simd4Float bb_i_S0 = load4(bb_i.lower.ptr());
    const Simd4Float bb_i_S1 = load4(bb_i.upper.ptr());
    const Simd4Float bb_j_S0 = load4(bb_j.lower.ptr());
    const Simd4Float bb_j_S1 = load4(bb_j.upper.ptr());

    const Simd4Float dl_S  = bb_i_S0 - bb_j_S1;
    const Simd4Float dh_S  = bb_j_S0 - bb_i_S1;

    const Simd4Float dm_S  = max(dl_S, dh_S);
    const Simd4Float dm0_S = max(dm_S, simd4SetZeroF());

    return dotProduct(dm0_S, dm0_S);
}
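/* Worked example (added for clarity, not part of the original file): for boxes
 * spanning [0,1] and [3,5] along x while overlapping along y and z, the clamped
 * per-dimension gaps are (2, 0, 0), so the returned squared distance is 4.
 */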
/* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
template <int boundingBoxStart>
static inline void gmx_simdcall
clusterBoundingBoxDistance2_xxxx_simd4_inner(const float     *bb_i,
                                             float           *d2,
                                             const Simd4Float xj_l,
                                             const Simd4Float yj_l,
                                             const Simd4Float zj_l,
                                             const Simd4Float xj_h,
                                             const Simd4Float yj_h,
                                             const Simd4Float zj_h)
{
    constexpr int stride = c_packedBoundingBoxesDimSize;

    const int shi = boundingBoxStart*Nbnxm::c_numBoundingBoxBounds1D*DIM;

    const Simd4Float zero = setZero();

    const Simd4Float xi_l = load4(bb_i + shi + 0*stride);
    const Simd4Float yi_l = load4(bb_i + shi + 1*stride);
    const Simd4Float zi_l = load4(bb_i + shi + 2*stride);
    const Simd4Float xi_h = load4(bb_i + shi + 3*stride);
    const Simd4Float yi_h = load4(bb_i + shi + 4*stride);
    const Simd4Float zi_h = load4(bb_i + shi + 5*stride);

    const Simd4Float dx_0 = xi_l - xj_h;
    const Simd4Float dy_0 = yi_l - yj_h;
    const Simd4Float dz_0 = zi_l - zj_h;

    const Simd4Float dx_1 = xj_l - xi_h;
    const Simd4Float dy_1 = yj_l - yi_h;
    const Simd4Float dz_1 = zj_l - zi_h;

    const Simd4Float mx = max(dx_0, dx_1);
    const Simd4Float my = max(dy_0, dy_1);
    const Simd4Float mz = max(dz_0, dz_1);

    const Simd4Float m0x = max(mx, zero);
    const Simd4Float m0y = max(my, zero);
    const Simd4Float m0z = max(mz, zero);

    const Simd4Float d2x = m0x * m0x;
    const Simd4Float d2y = m0y * m0y;
    const Simd4Float d2z = m0z * m0z;

    const Simd4Float d2s = d2x + d2y;
    const Simd4Float d2t = d2s + d2z;

    store4(d2 + boundingBoxStart, d2t);
}
/* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
static void
clusterBoundingBoxDistance2_xxxx_simd4(const float *bb_j,
                                       const int    nsi,
                                       const float *bb_i,
                                       float       *d2)
{
    constexpr int stride = c_packedBoundingBoxesDimSize;

    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    const Simd4Float xj_l = Simd4Float(bb_j[0*stride]);
    const Simd4Float yj_l = Simd4Float(bb_j[1*stride]);
    const Simd4Float zj_l = Simd4Float(bb_j[2*stride]);
    const Simd4Float xj_h = Simd4Float(bb_j[3*stride]);
    const Simd4Float yj_h = Simd4Float(bb_j[4*stride]);
    const Simd4Float zj_h = Simd4Float(bb_j[5*stride]);

    /* Here we "loop" over si (0,stride) from 0 to nsi with step stride.
     * But as we know the number of iterations is 1 or 2, we unroll manually.
     */
    clusterBoundingBoxDistance2_xxxx_simd4_inner<0>(bb_i, d2,
                                                    xj_l, yj_l, zj_l, xj_h, yj_h, zj_h);
    if (nsi > stride)
    {
        clusterBoundingBoxDistance2_xxxx_simd4_inner<stride>(bb_i, d2,
                                                             xj_l, yj_l, zj_l, xj_h, yj_h, zj_h);
    }
}

#endif /* NBNXN_SEARCH_BB_SIMD4 */
/* Returns if any atom pair from two clusters is within distance sqrt(rlist2) */
static inline gmx_bool
clusterpair_in_range(const NbnxnPairlistGpuWork &work,
                     int                         si,
                     int csj, int stride, const real *x_j,
                     real                        rlist2)
{
#if !GMX_SIMD4_HAVE_REAL

    /* Plain C version.
     * All coordinates are stored as xyzxyz...
     */

    const real *x_i = work.iSuperClusterData.x.data();

    for (int i = 0; i < c_nbnxnGpuClusterSize; i++)
    {
        int i0 = (si*c_nbnxnGpuClusterSize + i)*DIM;
        for (int j = 0; j < c_nbnxnGpuClusterSize; j++)
        {
            int j0 = (csj*c_nbnxnGpuClusterSize + j)*stride;

            real d2 = gmx::square(x_i[i0] - x_j[j0]) + gmx::square(x_i[i0+1] - x_j[j0+1]) + gmx::square(x_i[i0+2] - x_j[j0+2]);

            if (d2 < rlist2)
            {
                return TRUE;
            }
        }
    }

    return FALSE;

#else /* !GMX_SIMD4_HAVE_REAL */

    /* 4-wide SIMD version.
     * The coordinates x_i are stored as xxxxyyyy..., x_j is stored xyzxyz...
     * Using 8-wide AVX(2) is not faster on Intel Sandy Bridge and Haswell.
     */
    static_assert(c_nbnxnGpuClusterSize == 8 || c_nbnxnGpuClusterSize == 4,
                  "A cluster is hard-coded to 4/8 atoms.");

    Simd4Real rc2_S = Simd4Real(rlist2);

    const real *x_i = work.iSuperClusterData.xSimd.data();

    int       dim_stride = c_nbnxnGpuClusterSize*DIM;
    Simd4Real ix_S0      = load4(x_i + si*dim_stride + 0*GMX_SIMD4_WIDTH);
    Simd4Real iy_S0      = load4(x_i + si*dim_stride + 1*GMX_SIMD4_WIDTH);
    Simd4Real iz_S0      = load4(x_i + si*dim_stride + 2*GMX_SIMD4_WIDTH);

    Simd4Real ix_S1, iy_S1, iz_S1;
    if (c_nbnxnGpuClusterSize == 8)
    {
        ix_S1 = load4(x_i + si*dim_stride + 3*GMX_SIMD4_WIDTH);
        iy_S1 = load4(x_i + si*dim_stride + 4*GMX_SIMD4_WIDTH);
        iz_S1 = load4(x_i + si*dim_stride + 5*GMX_SIMD4_WIDTH);
    }
    /* We loop from the outer to the inner particles to maximize
     * the chance that we find a pair in range quickly and return.
     */
    int j0 = csj*c_nbnxnGpuClusterSize;
    int j1 = j0 + c_nbnxnGpuClusterSize - 1;
    while (j0 < j1)
    {
        Simd4Real jx0_S, jy0_S, jz0_S;
        Simd4Real jx1_S, jy1_S, jz1_S;

        Simd4Real dx_S0, dy_S0, dz_S0;
        Simd4Real dx_S1, dy_S1, dz_S1;
        Simd4Real dx_S2, dy_S2, dz_S2;
        Simd4Real dx_S3, dy_S3, dz_S3;

        Simd4Real rsq_S0, rsq_S1, rsq_S2, rsq_S3;

        Simd4Bool wco_S0, wco_S1, wco_S2, wco_S3;
        Simd4Bool wco_any_S01, wco_any_S23, wco_any_S;

        jx0_S = Simd4Real(x_j[j0*stride+0]);
        jy0_S = Simd4Real(x_j[j0*stride+1]);
        jz0_S = Simd4Real(x_j[j0*stride+2]);

        jx1_S = Simd4Real(x_j[j1*stride+0]);
        jy1_S = Simd4Real(x_j[j1*stride+1]);
        jz1_S = Simd4Real(x_j[j1*stride+2]);

        /* Calculate distance */
        dx_S0 = ix_S0 - jx0_S;
        dy_S0 = iy_S0 - jy0_S;
        dz_S0 = iz_S0 - jz0_S;
        dx_S2 = ix_S0 - jx1_S;
        dy_S2 = iy_S0 - jy1_S;
        dz_S2 = iz_S0 - jz1_S;
        if (c_nbnxnGpuClusterSize == 8)
        {
            dx_S1 = ix_S1 - jx0_S;
            dy_S1 = iy_S1 - jy0_S;
            dz_S1 = iz_S1 - jz0_S;
            dx_S3 = ix_S1 - jx1_S;
            dy_S3 = iy_S1 - jy1_S;
            dz_S3 = iz_S1 - jz1_S;
        }

        /* rsq = dx*dx+dy*dy+dz*dz */
        rsq_S0 = norm2(dx_S0, dy_S0, dz_S0);
        rsq_S2 = norm2(dx_S2, dy_S2, dz_S2);
        if (c_nbnxnGpuClusterSize == 8)
        {
            rsq_S1 = norm2(dx_S1, dy_S1, dz_S1);
            rsq_S3 = norm2(dx_S3, dy_S3, dz_S3);
        }

        wco_S0 = (rsq_S0 < rc2_S);
        wco_S2 = (rsq_S2 < rc2_S);
        if (c_nbnxnGpuClusterSize == 8)
        {
            wco_S1 = (rsq_S1 < rc2_S);
            wco_S3 = (rsq_S3 < rc2_S);
        }
        if (c_nbnxnGpuClusterSize == 8)
        {
            wco_any_S01 = wco_S0 || wco_S1;
            wco_any_S23 = wco_S2 || wco_S3;
            wco_any_S   = wco_any_S01 || wco_any_S23;
        }
        else
        {
            wco_any_S = wco_S0 || wco_S2;
        }

        if (anyTrue(wco_any_S))
        {
            return TRUE;
        }

        j0++;
        j1--;
    }

    return FALSE;

#endif /* !GMX_SIMD4_HAVE_REAL */
}
/* Returns the j-cluster index for index cjIndex in a cj list */
static inline int nblCj(gmx::ArrayRef<const nbnxn_cj_t> cjList,
                        int                             cjIndex)
{
    return cjList[cjIndex].cj;
}

/* Returns the j-cluster index for index cjIndex in a cj4 list */
static inline int nblCj(gmx::ArrayRef<const nbnxn_cj4_t> cj4List,
                        int                              cjIndex)
{
    return cj4List[cjIndex/c_nbnxnGpuJgroupSize].cj[cjIndex & (c_nbnxnGpuJgroupSize - 1)];
}

/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
static unsigned int nbl_imask0(const NbnxnPairlistGpu *nbl, int cj_ind)
{
    return nbl->cj4[cj_ind/c_nbnxnGpuJgroupSize].imei[0].imask;
}
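/* Illustrative note (added for clarity, not part of the original file): with
 * c_nbnxnGpuJgroupSize == 4, cjIndex == 10 addresses cj4List[2].cj[2], i.e. the
 * third j-cluster of the third cj4 group.
 */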
NbnxnPairlistCpu::NbnxnPairlistCpu() :
    na_ci(c_nbnxnCpuIClusterSize),
    work(std::make_unique<NbnxnPairlistCpuWork>())
{
}

NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
    na_ci(c_nbnxnGpuClusterSize),
    na_cj(c_nbnxnGpuClusterSize),
    na_sc(c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize),
    sci({}, {pinningPolicy}),
    cj4({}, {pinningPolicy}),
    excl({}, {pinningPolicy}),
    work(std::make_unique<NbnxnPairlistGpuWork>())
{
    static_assert(c_nbnxnGpuNumClusterPerSupercluster == c_gpuNumClusterPerCell,
                  "The search code assumes that a super-cluster matches a search grid cell");

    static_assert(sizeof(cj4[0].imei[0].imask)*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell,
                  "The i super-cluster cluster interaction mask does not contain a sufficient number of bits");

    static_assert(sizeof(excl[0])*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell,
                  "The GPU exclusion mask does not contain a sufficient number of bits");

    // We always want a first entry without any exclusions
    excl.resize(1);
}
// TODO: Move to pairlistset.cpp
PairlistSet::PairlistSet(const Nbnxm::InteractionLocality  locality,
                         const PairlistParams             &pairlistParams) :
    locality_(locality),
    params_(pairlistParams)
{
    isCpuType_ =
        (params_.pairlistType == PairlistType::Simple4x2 ||
         params_.pairlistType == PairlistType::Simple4x4 ||
         params_.pairlistType == PairlistType::Simple4x8);
    // Currently GPU lists are always combined
    combineLists_ = !isCpuType_;

    const int numLists = gmx_omp_nthreads_get(emntNonbonded);

    if (!combineLists_ &&
        numLists > NBNXN_BUFFERFLAG_MAX_THREADS)
    {
        gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
                  numLists, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
    }

    if (isCpuType_)
    {
        cpuLists_.resize(numLists);
        cpuListsWork_.resize(numLists);
    }
    else
    {
        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
        gpuLists_.emplace_back(gmx::PinningPolicy::PinnedIfSupported);
        /* Lists 0 to numLists are used for constructing lists in parallel
         * on the CPU using numLists threads (and then merged into list 0).
         */
        for (int i = 1; i < numLists; i++)
        {
            gpuLists_.emplace_back(gmx::PinningPolicy::CannotBePinned);
        }
    }

    fepLists_.resize(numLists);

    /* Execute in order to avoid memory interleaving between threads */
#pragma omp parallel for num_threads(numLists) schedule(static)
    for (int i = 0; i < numLists; i++)
    {
        try
        {
            /* We used to allocate all normal lists locally on each thread
             * as well. The question is if allocating the object on the
             * master thread (but all contained list memory thread local)
             * impacts performance.
             */
            fepLists_[i] = std::make_unique<t_nblist>();
            nbnxn_init_pairlist_fep(fepLists_[i].get());
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics(FILE                   *fp,
                                    const NbnxnPairlistCpu &nbl,
                                    const Nbnxm::GridSet   &gridSet,
                                    const real              rl)
{
    const Grid             &grid = gridSet.grids()[0];
    const Grid::Dimensions &dims = grid.dimensions();

    fprintf(fp, "nbl nci %zu ncj %d\n",
            nbl.ci.size(), nbl.ncjInUse);
    const int    numAtomsJCluster = grid.geometry().numAtomsJCluster;
    const double numAtomsPerCell  = nbl.ncjInUse/static_cast<double>(grid.numCells())*numAtomsJCluster;
    fprintf(fp, "nbl na_cj %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
            nbl.na_cj, rl, nbl.ncjInUse, nbl.ncjInUse/static_cast<double>(grid.numCells()),
            numAtomsPerCell,
            numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numCells()*numAtomsJCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));

    fprintf(fp, "nbl average j cell list length %.1f\n",
            0.25*nbl.ncjInUse/std::max(static_cast<double>(nbl.ci.size()), 1.0));

    int cs[SHIFTS] = { 0 };
    int npexcl     = 0;
    for (const nbnxn_ci_t &ciEntry : nbl.ci)
    {
        cs[ciEntry.shift & NBNXN_CI_SHIFT] +=
            ciEntry.cj_ind_end - ciEntry.cj_ind_start;

        int j = ciEntry.cj_ind_start;
        while (j < ciEntry.cj_ind_end &&
               nbl.cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
        {
            npexcl++;
            j++;
        }
    }
    fprintf(fp, "nbl cell pairs, total: %zu excl: %d %.1f%%\n",
            nbl.cj.size(), npexcl, 100*npexcl/std::max(static_cast<double>(nbl.cj.size()), 1.0));
    for (int s = 0; s < SHIFTS; s++)
    {
        if (cs[s] > 0)
        {
            fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
        }
    }
}
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics(FILE                   *fp,
                                    const NbnxnPairlistGpu &nbl,
                                    const Nbnxm::GridSet   &gridSet,
                                    const real              rl)
{
    const Grid             &grid = gridSet.grids()[0];
    const Grid::Dimensions &dims = grid.dimensions();

    fprintf(fp, "nbl nsci %zu ncj4 %zu nsi %d excl4 %zu\n",
            nbl.sci.size(), nbl.cj4.size(), nbl.nci_tot, nbl.excl.size());
    const int    numAtomsCluster = grid.geometry().numAtomsICluster;
    const double numAtomsPerCell = nbl.nci_tot/static_cast<double>(grid.numClusters())*numAtomsCluster;
    fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
            nbl.na_ci, rl, nbl.nci_tot, nbl.nci_tot/static_cast<double>(grid.numClusters()),
            numAtomsPerCell,
            numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numClusters()*numAtomsCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));

    double sum_nsp  = 0;
    double sum_nsp2 = 0;
    int    nsp_max  = 0;
    int    c[c_gpuNumClusterPerCell + 1] = { 0 };
    for (const nbnxn_sci_t &sci : nbl.sci)
    {
        int nsp = 0;
        for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
        {
            for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
            {
                int b = 0;
                for (int si = 0; si < c_gpuNumClusterPerCell; si++)
                {
                    if (nbl.cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
                    {
                        b++;
                    }
                }
                nsp += b;
                c[b]++;
            }
        }
        sum_nsp  += nsp;
        sum_nsp2 += nsp*nsp;
        nsp_max   = std::max(nsp_max, nsp);
    }
    if (!nbl.sci.empty())
    {
        sum_nsp  /= nbl.sci.size();
        sum_nsp2 /= nbl.sci.size();
    }
    fprintf(fp, "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n",
            sum_nsp, std::sqrt(sum_nsp2 - sum_nsp*sum_nsp), nsp_max);

    if (!nbl.cj4.empty())
    {
        for (int b = 0; b <= c_gpuNumClusterPerCell; b++)
        {
            fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
                    b, c[b], 100.0*c[b]/size_t {nbl.cj4.size()*c_nbnxnGpuJgroupSize});
        }
    }
}
/* Returns a reference to the exclusion mask for j-cluster-group \p cj4 and warp \p warp
 *
 * Generates a new exclusion entry when the j-cluster-group uses
 * the default all-interaction mask at call time, so the returned mask
 * can be modified when needed.
 */
static nbnxn_excl_t &get_exclusion_mask(NbnxnPairlistGpu *nbl,
                                        int               cj4,
                                        int               warp)
{
    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
    {
        /* No exclusions set, make a new list entry */
        const size_t oldSize = nbl->excl.size();
        GMX_ASSERT(oldSize >= 1, "We should always have entry [0]");
        /* Add entry with default values: no exclusions */
        nbl->excl.resize(oldSize + 1);
        nbl->cj4[cj4].imei[warp].excl_ind = oldSize;
    }

    return nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
}
/* Sets self exclusions and excludes half of the double pairs in the self cluster-pair \p nbl->cj4[cj4Index].cj[jOffsetInGroup]
 *
 * \param[in,out] nbl             The cluster pair list
 * \param[in]     cj4Index        The j-cluster group index into \p nbl->cj4
 * \param[in]     jOffsetInGroup  The j-entry offset in \p nbl->cj4[cj4Index]
 * \param[in]     iClusterInCell  The i-cluster index in the cell
 */
static void
setSelfAndNewtonExclusionsGpu(NbnxnPairlistGpu *nbl,
                              const int         cj4Index,
                              const int         jOffsetInGroup,
                              const int         iClusterInCell)
{
    constexpr int numJatomsPerPart = c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;

    /* The exclusions are stored separately for each part of the split */
    for (int part = 0; part < c_nbnxnGpuClusterpairSplit; part++)
    {
        const int jOffset = part*numJatomsPerPart;
        /* Make a new exclusion mask entry for each part, if we don't already have one yet */
        nbnxn_excl_t &excl = get_exclusion_mask(nbl, cj4Index, part);

        /* Set all bits with j-index <= i-index */
        for (int jIndexInPart = 0; jIndexInPart < numJatomsPerPart; jIndexInPart++)
        {
            for (int i = jOffset + jIndexInPart; i < c_nbnxnGpuClusterSize; i++)
            {
                excl.pair[jIndexInPart*c_nbnxnGpuClusterSize + i] &=
                    ~(1U << (jOffsetInGroup*c_gpuNumClusterPerCell + iClusterInCell));
            }
        }
    }
}
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
gmx_unused static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci*2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 :
            (rdiag && ci*2+1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 :
             NBNXN_INTERACTION_MASK_ALL));
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
gmx_unused static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
gmx_unused static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 :
            (rdiag && ci == cj*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 :
             NBNXN_INTERACTION_MASK_ALL));
}

#if GMX_SIMD_REAL_WIDTH == 2
#define get_imask_simd_4xn  get_imask_simd_j2
#endif
#if GMX_SIMD_REAL_WIDTH == 4
#define get_imask_simd_4xn  get_imask_simd_j4
#endif
#if GMX_SIMD_REAL_WIDTH == 8
#define get_imask_simd_4xn  get_imask_simd_j8
#define get_imask_simd_2xnn get_imask_simd_j4
#endif
#if GMX_SIMD_REAL_WIDTH == 16
#define get_imask_simd_2xnn get_imask_simd_j8
#endif
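/* Usage note (added for clarity, not part of the original file): get_imask(TRUE, 5, 5)
 * returns NBNXN_INTERACTION_MASK_DIAG, i.e. the mask used for a self cluster pair,
 * while any off-diagonal pair (ci != cj) gets NBNXN_INTERACTION_MASK_ALL.
 */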
/* Plain C code for checking and adding cluster-pairs to the list.
 *
 * \param[in]     gridj               The j-grid
 * \param[in,out] nbl                 The pair-list to store the cluster pairs in
 * \param[in]     icluster            The index of the i-cluster
 * \param[in]     jclusterFirst       The first cluster in the j-range
 * \param[in]     jclusterLast        The last cluster in the j-range
 * \param[in]     excludeSubDiagonal  Exclude atom pairs with i-index > j-index
 * \param[in]     x_j                 Coordinates for the j-atom, in xyz format
 * \param[in]     rlist2              The squared list cut-off
 * \param[in]     rbb2                The squared cut-off for putting cluster-pairs in the list based on bounding box distance only
 * \param[in,out] numDistanceChecks   The number of distance checks performed
 */
static void
makeClusterListSimple(const Grid               &jGrid,
                      NbnxnPairlistCpu         *nbl,
                      int                       icluster,
                      int                       jclusterFirst,
                      int                       jclusterLast,
                      bool                      excludeSubDiagonal,
                      const real * gmx_restrict x_j,
                      real                      rlist2,
                      float                     rbb2,
                      int * gmx_restrict        numDistanceChecks)
{
    const BoundingBox * gmx_restrict bb_ci = nbl->work->iClusterData.bb.data();
    const real * gmx_restrict        x_ci  = nbl->work->iClusterData.x.data();

    gmx_bool InRange;

    InRange = FALSE;
    while (!InRange && jclusterFirst <= jclusterLast)
    {
        real d2 = clusterBoundingBoxDistance2(bb_ci[0], jGrid.jBoundingBoxes()[jclusterFirst]);
        *numDistanceChecks += 2;

        /* Check if the distance is within the distance where
         * we use only the bounding box distance rbb,
         * or within the cut-off and there is at least one atom pair
         * within the cut-off.
         */
        if (d2 < rbb2)
        {
            InRange = TRUE;
        }
        else if (d2 < rlist2)
        {
            int cjf_gl = jGrid.cellOffset() + jclusterFirst;
            for (int i = 0; i < c_nbnxnCpuIClusterSize && !InRange; i++)
            {
                for (int j = 0; j < c_nbnxnCpuIClusterSize; j++)
                {
                    InRange = InRange ||
                        (gmx::square(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+XX]) +
                         gmx::square(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+YY]) +
                         gmx::square(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+ZZ]) < rlist2);
                }
            }
            *numDistanceChecks += c_nbnxnCpuIClusterSize*c_nbnxnCpuIClusterSize;
        }
        if (!InRange)
        {
            jclusterFirst++;
        }
    }
    if (!InRange)
    {
        return;
    }

    InRange = FALSE;
    while (!InRange && jclusterLast > jclusterFirst)
    {
        real d2 = clusterBoundingBoxDistance2(bb_ci[0], jGrid.jBoundingBoxes()[jclusterLast]);
        *numDistanceChecks += 2;

        /* Check if the distance is within the distance where
         * we use only the bounding box distance rbb,
         * or within the cut-off and there is at least one atom pair
         * within the cut-off.
         */
        if (d2 < rbb2)
        {
            InRange = TRUE;
        }
        else if (d2 < rlist2)
        {
            int cjl_gl = jGrid.cellOffset() + jclusterLast;
            for (int i = 0; i < c_nbnxnCpuIClusterSize && !InRange; i++)
            {
                for (int j = 0; j < c_nbnxnCpuIClusterSize; j++)
                {
                    InRange = InRange ||
                        (gmx::square(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+XX]) +
                         gmx::square(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+YY]) +
                         gmx::square(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+ZZ]) < rlist2);
                }
            }
            *numDistanceChecks += c_nbnxnCpuIClusterSize*c_nbnxnCpuIClusterSize;
        }
        if (!InRange)
        {
            jclusterLast--;
        }
    }

    if (jclusterFirst <= jclusterLast)
    {
        for (int jcluster = jclusterFirst; jcluster <= jclusterLast; jcluster++)
        {
            /* Store cj and the interaction mask */
            nbnxn_cj_t cjEntry;
            cjEntry.cj   = jGrid.cellOffset() + jcluster;
            cjEntry.excl = get_imask(excludeSubDiagonal, icluster, jcluster);
            nbl->cj.push_back(cjEntry);
        }
        /* Increase the closing index in the i list */
        nbl->ci.back().cj_ind_end = nbl->cj.size();
    }
}
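/* Note (added for clarity, not part of the original file): the two while loops above
 * shrink the candidate j-range from both ends, using cheap bounding-box distances
 * first and falling back to per-atom distance checks only when a bounding-box pair
 * lies between rbb2 and rlist2; all surviving j-clusters in
 * [jclusterFirst, jclusterLast] are then stored with their interaction masks.
 */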
#ifdef GMX_NBNXN_SIMD_4XN
#include "gromacs/nbnxm/pairlist_simd_4xm.h"
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
#include "gromacs/nbnxm/pairlist_simd_2xmm.h"
#endif
/* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
 * Checks bounding box distances and possibly atom pair distances.
 */
static void make_cluster_list_supersub(const Grid       &iGrid,
                                       const Grid       &jGrid,
                                       NbnxnPairlistGpu *nbl,
                                       const int         sci,
                                       const int         scj,
                                       const bool        excludeSubDiagonal,
                                       const int         stride,
                                       const real       *x,
                                       const real        rlist2,
                                       const float       rbb2,
                                       int              *numDistanceChecks)
{
    NbnxnPairlistGpuWork &work = *nbl->work;

#if NBNXN_BBXXXX
    const float *pbb_ci = work.iSuperClusterData.bbPacked.data();
#else
    const BoundingBox *bb_ci = work.iSuperClusterData.bb.data();
#endif

    assert(c_nbnxnGpuClusterSize == iGrid.geometry().numAtomsICluster);
    assert(c_nbnxnGpuClusterSize == jGrid.geometry().numAtomsICluster);

    /* We generate the pairlist mainly based on bounding-box distances
     * and do atom pair distance based pruning on the GPU.
     * Only if a j-group contains a single cluster-pair, we try to prune
     * that pair based on atom distances on the CPU to avoid empty j-groups.
     */
#define PRUNE_LIST_CPU_ONE 1
#define PRUNE_LIST_CPU_ALL 0

#if PRUNE_LIST_CPU_ONE
    int ci_last = -1;
#endif

    float *d2l = work.distanceBuffer.data();

    for (int subc = 0; subc < jGrid.numClustersPerCell()[scj]; subc++)
    {
        const int cj4_ind   = work.cj_ind/c_nbnxnGpuJgroupSize;
        const int cj_offset = work.cj_ind - cj4_ind*c_nbnxnGpuJgroupSize;
        const int cj        = scj*c_gpuNumClusterPerCell + subc;

        const int cj_gl     = jGrid.cellOffset()*c_gpuNumClusterPerCell + cj;

        int       ci1;
        if (excludeSubDiagonal && sci == scj)
        {
            ci1 = subc + 1;
        }
        else
        {
            ci1 = iGrid.numClustersPerCell()[sci];
        }

#if NBNXN_BBXXXX
        /* Determine all ci1 bb distances in one call with SIMD4 */
        const int offset = packedBoundingBoxesIndex(cj) + (cj & (c_packedBoundingBoxesDimSize - 1));
        clusterBoundingBoxDistance2_xxxx_simd4(jGrid.packedBoundingBoxes().data() + offset,
                                               ci1, pbb_ci, d2l);
        *numDistanceChecks += c_nbnxnGpuClusterSize*2;
#endif

        int          npair = 0;
        unsigned int imask = 0;
        /* We use a fixed upper-bound instead of ci1 to help optimization */
        for (int ci = 0; ci < c_gpuNumClusterPerCell; ci++)
        {
            if (ci == ci1)
            {
                break;
            }

#if !NBNXN_BBXXXX
            /* Determine the bb distance between ci and cj */
            d2l[ci]             = clusterBoundingBoxDistance2(bb_ci[ci], jGrid.jBoundingBoxes()[cj]);
            *numDistanceChecks += 2;
#endif
            float d2 = d2l[ci];

#if PRUNE_LIST_CPU_ALL
            /* Check if the distance is within the distance where
             * we use only the bounding box distance rbb,
             * or within the cut-off and there is at least one atom pair
             * within the cut-off. This check is very costly.
             */
            *numDistanceChecks += c_nbnxnGpuClusterSize*c_nbnxnGpuClusterSize;
            if (d2 < rbb2 ||
                (d2 < rlist2 &&
                 clusterpair_in_range(work, ci, cj_gl, stride, x, rlist2)))
#else
            /* Check if the distance between the two bounding boxes
             * is within the pair-list cut-off.
             */
            if (d2 < rlist2)
#endif
            {
                /* Flag this i-subcell to be taken into account */
                imask |= (1U << (cj_offset*c_gpuNumClusterPerCell + ci));

#if PRUNE_LIST_CPU_ONE
                ci_last = ci;
#endif

                npair++;
            }
        }

#if PRUNE_LIST_CPU_ONE
        /* If we only found 1 pair, check if any atoms are actually
         * within the cut-off, so we could get rid of it.
         */
        if (npair == 1 && d2l[ci_last] >= rbb2 &&
            !clusterpair_in_range(work, ci_last, cj_gl, stride, x, rlist2))
        {
            imask &= ~(1U << (cj_offset*c_gpuNumClusterPerCell + ci_last));
            npair--;
        }
#endif

        if (npair > 0)
        {
            /* We have at least one cluster pair: add a j-entry */
            if (static_cast<size_t>(cj4_ind) == nbl->cj4.size())
            {
                nbl->cj4.resize(nbl->cj4.size() + 1);
            }
            nbnxn_cj4_t *cj4 = &nbl->cj4[cj4_ind];

            cj4->cj[cj_offset] = cj_gl;

            /* Set the exclusions for the ci==sj entry.
             * Here we don't bother to check if this entry is actually flagged,
             * as it will nearly always be in the list.
             */
            if (excludeSubDiagonal && sci == scj)
            {
                setSelfAndNewtonExclusionsGpu(nbl, cj4_ind, cj_offset, subc);
            }

            /* Copy the cluster interaction mask to the list */
            for (int w = 0; w < c_nbnxnGpuClusterpairSplit; w++)
            {
                cj4->imei[w].imask |= imask;
            }

            nbl->work->cj_ind++;

            /* Keep the count */
            nbl->nci_tot += npair;

            /* Increase the closing index in i super-cell list */
            nbl->sci.back().cj4_ind_end =
                (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize;
        }
    }
}
/* Returns how many contiguous j-clusters we have starting in the i-list */
template <typename CjListType>
static int numContiguousJClusters(const int                       cjIndexStart,
                                  const int                       cjIndexEnd,
                                  gmx::ArrayRef<const CjListType> cjList)
{
    const int firstJCluster = nblCj(cjList, cjIndexStart);

    int numContiguous = 0;

    while (cjIndexStart + numContiguous < cjIndexEnd &&
           nblCj(cjList, cjIndexStart + numContiguous) == firstJCluster + numContiguous)
    {
        numContiguous++;
    }

    return numContiguous;
}
/*! \internal
 * \brief Helper struct for efficient searching for excluded atoms in a j-list
 */
struct JListRanges
{
    /*! \brief Constructs a j-list range from \p cjList with the given index range */
    template <typename CjListType>
    JListRanges(int                             cjIndexStart,
                int                             cjIndexEnd,
                gmx::ArrayRef<const CjListType> cjList);

    int cjIndexStart; //!< The start index in the j-list
    int cjIndexEnd;   //!< The end index in the j-list
    int cjFirst;      //!< The j-cluster with index cjIndexStart
    int cjLast;       //!< The j-cluster with index cjIndexEnd-1
    int numDirect;    //!< Up to cjIndexStart+numDirect the j-clusters are cjFirst + the index offset
};

template <typename CjListType>
JListRanges::JListRanges(int                             cjIndexStart,
                         int                             cjIndexEnd,
                         gmx::ArrayRef<const CjListType> cjList) :
    cjIndexStart(cjIndexStart),
    cjIndexEnd(cjIndexEnd)
{
    GMX_ASSERT(cjIndexEnd > cjIndexStart, "JListRanges should only be called with non-empty lists");

    cjFirst = nblCj(cjList, cjIndexStart);
    cjLast  = nblCj(cjList, cjIndexEnd - 1);

    /* Determine how many contiguous j-cells we have starting
     * from the first i-cell. This number can be used to directly
     * calculate j-cell indices for excluded atoms.
     */
    numDirect = numContiguousJClusters(cjIndexStart, cjIndexEnd, cjList);
}
/* Return the index of \p jCluster in the given range or -1 when not present
 *
 * Note: This code is executed very often and therefore performance is
 *       important. It should be inlined and fully optimized.
 */
template <typename CjListType>
static inline int
findJClusterInJList(int                             jCluster,
                    const JListRanges              &ranges,
                    gmx::ArrayRef<const CjListType> cjList)
{
    int index;

    if (jCluster < ranges.cjFirst + ranges.numDirect)
    {
        /* We can calculate the index directly using the offset */
        index = ranges.cjIndexStart + jCluster - ranges.cjFirst;
    }
    else
    {
        /* Search for jCluster using bisection */
        index          = -1;
        int rangeStart = ranges.cjIndexStart + ranges.numDirect;
        int rangeEnd   = ranges.cjIndexEnd;
        int rangeMiddle;
        while (index == -1 && rangeStart < rangeEnd)
        {
            rangeMiddle = (rangeStart + rangeEnd) >> 1;

            const int clusterMiddle = nblCj(cjList, rangeMiddle);

            if (jCluster == clusterMiddle)
            {
                index = rangeMiddle;
            }
            else if (jCluster < clusterMiddle)
            {
                rangeEnd = rangeMiddle;
            }
            else
            {
                rangeStart = rangeMiddle + 1;
            }
        }
    }

    return index;
}
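/* Worked example (added for clarity, not part of the original file): for a j-list
 * storing clusters {10, 11, 12, 20, 25} with cjIndexStart == 0, numDirect == 3, so
 * jCluster 11 is found directly at index 1, while jCluster 25 falls through to the
 * bisection branch and is found at index 4.
 */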
// TODO: Get rid of the two functions below by renaming sci to ci (or something better)

/* Return the i-entry in the list we are currently operating on */
static nbnxn_ci_t *getOpenIEntry(NbnxnPairlistCpu *nbl)
{
    return &nbl->ci.back();
}

/* Return the i-entry in the list we are currently operating on */
static nbnxn_sci_t *getOpenIEntry(NbnxnPairlistGpu *nbl)
{
    return &nbl->sci.back();
}
/* Set all atom-pair exclusions for a simple type list i-entry
 *
 * Set all atom-pair exclusions from the topology stored in exclusions
 * as masks in the pair-list for simple list entry iEntry.
 */
static void
setExclusionsForIEntry(const Nbnxm::GridSet &gridSet,
                       NbnxnPairlistCpu     *nbl,
                       gmx_bool              diagRemoved,
                       int                   na_cj_2log,
                       const nbnxn_ci_t     &iEntry,
                       const t_blocka       &exclusions)
{
    if (iEntry.cj_ind_end == iEntry.cj_ind_start)
    {
        /* Empty list: no exclusions */
        return;
    }

    const JListRanges ranges(iEntry.cj_ind_start, iEntry.cj_ind_end, gmx::makeConstArrayRef(nbl->cj));

    const int iCluster = iEntry.ci;

    gmx::ArrayRef<const int> cell        = gridSet.cells();
    gmx::ArrayRef<const int> atomIndices = gridSet.atomIndices();

    /* Loop over the atoms in the i-cluster */
    for (int i = 0; i < nbl->na_ci; i++)
    {
        const int iIndex = iCluster*nbl->na_ci + i;
        const int iAtom  = atomIndices[iIndex];
        if (iAtom >= 0)
        {
            /* Loop over the topology-based exclusions for this i-atom */
            for (int exclIndex = exclusions.index[iAtom]; exclIndex < exclusions.index[iAtom + 1]; exclIndex++)
            {
                const int jAtom = exclusions.a[exclIndex];

                if (jAtom == iAtom)
                {
                    /* The self exclusions are already set, save some time */
                    continue;
                }

                /* Get the index of the j-atom in the nbnxn atom data */
                const int jIndex = cell[jAtom];

                /* Without shifts we only calculate interactions j>i
                 * for one-way pair-lists.
                 */
                if (diagRemoved && jIndex <= iIndex)
                {
                    continue;
                }

                const int jCluster = (jIndex >> na_cj_2log);

                /* Could the cluster be in our list? */
                if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
                {
                    const int index =
                        findJClusterInJList(jCluster, ranges,
                                            gmx::makeConstArrayRef(nbl->cj));

                    if (index >= 0)
                    {
                        /* We found an exclusion, clear the corresponding
                         * interaction bit.
                         */
                        const int innerJ = jIndex - (jCluster << na_cj_2log);

                        nbl->cj[index].excl &= ~(1U << ((i << na_cj_2log) + innerJ));
                    }
                }
            }
        }
    }
}
/* Add a new i-entry to the FEP list and copy the i-properties */
static inline void fep_list_new_nri_copy(t_nblist *nlist)
{
    /* Add a new i-entry */
    nlist->nri++;

    assert(nlist->nri < nlist->maxnri);

    /* Duplicate the last i-entry, except for jindex, which continues */
    nlist->iinr[nlist->nri]   = nlist->iinr[nlist->nri-1];
    nlist->shift[nlist->nri]  = nlist->shift[nlist->nri-1];
    nlist->gid[nlist->nri]    = nlist->gid[nlist->nri-1];
    nlist->jindex[nlist->nri] = nlist->nrj;
}
/* Reallocate FEP list for size nl->maxnri, TODO: replace by C++ */
static void reallocate_nblist(t_nblist *nl)
{
    if (gmx_debug_at)
    {
        fprintf(debug, "reallocating neighborlist (ielec=%d, ivdw=%d, igeometry=%d, type=%d), maxnri=%d\n",
                nl->ielec, nl->ivdw, nl->igeometry, nl->type, nl->maxnri);
    }
    srenew(nl->iinr,   nl->maxnri);
    srenew(nl->gid,    nl->maxnri);
    srenew(nl->shift,  nl->maxnri);
    srenew(nl->jindex, nl->maxnri+1);
}
/* For load balancing of the free-energy lists over threads, we set
 * the maximum nrj size of an i-entry to 40. This leads to good
 * load balancing in the worst case scenario of a single perturbed
 * particle on 16 threads, while not introducing significant overhead.
 * Note that half of the perturbed pairs will anyhow end up in very small lists,
 * since non-perturbed i-particles will see few perturbed j-particles.
 */
const int max_nrj_fep = 40;
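/* Illustrative note (added for clarity, not part of the original file): if a single
 * perturbed i-particle generates 640 perturbed j-entries, capping each i-entry at
 * max_nrj_fep == 40 splits the work into 640/40 == 16 i-entries, which 16 threads
 * can then balance between them.
 */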
/* Exclude the perturbed pairs from the Verlet list. This is only done to avoid
 * singularities for overlapping particles (0/0), since the charges and
 * LJ parameters have been zeroed in the nbnxn data structure.
 * Simultaneously make a group pair list for the perturbed pairs.
 */
static void make_fep_list(gmx::ArrayRef<const int>  atomIndices,
                          const nbnxn_atomdata_t   *nbat,
                          NbnxnPairlistCpu         *nbl,
                          gmx_bool                  bDiagRemoved,
                          nbnxn_ci_t               *nbl_ci,
                          real gmx_unused           shx,
                          real gmx_unused           shy,
                          real gmx_unused           shz,
                          real gmx_unused           rlist_fep2,
                          const Grid               &iGrid,
                          const Grid               &jGrid,
                          t_nblist                 *nlist)
{
    int      ci, cj_ind_start, cj_ind_end, cja, cjr;
    int      nri_max;
    int      gid_i = 0, gid_j, gid;
    int      egp_shift, egp_mask;
    int      gid_cj = 0;
    int      ind_i, ind_j, ai, aj;
    int      nri;
    gmx_bool bFEP_i, bFEP_i_all;

    if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
    {
        /* Empty list */
        return;
    }

    ci = nbl_ci->ci;

    cj_ind_start = nbl_ci->cj_ind_start;
    cj_ind_end   = nbl_ci->cj_ind_end;

    /* In worst case we have alternating energy groups
     * and create #atom-pair lists, which means we need the size
     * of a cluster pair (na_ci*na_cj) times the number of cj's.
     */
    nri_max = nbl->na_ci*nbl->na_cj*(cj_ind_end - cj_ind_start);
    if (nlist->nri + nri_max > nlist->maxnri)
    {
        nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
        reallocate_nblist(nlist);
    }

    const int numAtomsJCluster = jGrid.geometry().numAtomsJCluster;

    const nbnxn_atomdata_t::Params &nbatParams = nbat->params();

    const int ngid = nbatParams.nenergrp;

    /* TODO: Consider adding a check in grompp and changing this to an assert */
    const int numBitsInEnergyGroupIdsForAtomsInJCluster = sizeof(gid_cj)*8;
    if (ngid*numAtomsJCluster > numBitsInEnergyGroupIdsForAtomsInJCluster)
    {
        gmx_fatal(FARGS, "The Verlet scheme with %dx%d kernels and free-energy only supports up to %zu energy groups",
                  iGrid.geometry().numAtomsICluster, numAtomsJCluster,
                  (sizeof(gid_cj)*8)/numAtomsJCluster);
    }

    egp_shift = nbatParams.neg_2log;
    egp_mask  = (1 << egp_shift) - 1;

    /* Loop over the atoms in the i sub-cell */
    bFEP_i_all = TRUE;
    for (int i = 0; i < nbl->na_ci; i++)
    {
        ind_i = ci*nbl->na_ci + i;
        ai    = atomIndices[ind_i];
        if (ai >= 0)
        {
            nri                  = nlist->nri;
            nlist->jindex[nri+1] = nlist->jindex[nri];
            nlist->iinr[nri]     = ai;
            /* The actual energy group pair index is set later */
            nlist->gid[nri]      = 0;
            nlist->shift[nri]    = nbl_ci->shift & NBNXN_CI_SHIFT;

            bFEP_i = iGrid.atomIsPerturbed(ci - iGrid.cellOffset(), i);

            bFEP_i_all = bFEP_i_all && bFEP_i;

            if (nlist->nrj + (cj_ind_end - cj_ind_start)*nbl->na_cj > nlist->maxnrj)
            {
                nlist->maxnrj = over_alloc_small(nlist->nrj + (cj_ind_end - cj_ind_start)*nbl->na_cj);
                srenew(nlist->jjnr,     nlist->maxnrj);
                srenew(nlist->excl_fep, nlist->maxnrj);
            }

            if (ngid > 1)
            {
                gid_i = (nbatParams.energrp[ci] >> (egp_shift*i)) & egp_mask;
            }

            for (int cj_ind = cj_ind_start; cj_ind < cj_ind_end; cj_ind++)
            {
                unsigned int fep_cj;

                cja = nbl->cj[cj_ind].cj;

                if (numAtomsJCluster == jGrid.geometry().numAtomsICluster)
                {
                    cjr    = cja - jGrid.cellOffset();
                    fep_cj = jGrid.fepBits(cjr);
                    if (ngid > 1)
                    {
                        gid_cj = nbatParams.energrp[cja];
                    }
                }
                else if (2*numAtomsJCluster == jGrid.geometry().numAtomsICluster)
                {
                    cjr    = cja - jGrid.cellOffset()*2;
                    /* Extract half of the ci fep/energrp mask */
                    fep_cj = (jGrid.fepBits(cjr >> 1) >> ((cjr & 1)*numAtomsJCluster)) & ((1 << numAtomsJCluster) - 1);
                    if (ngid > 1)
                    {
                        gid_cj = nbatParams.energrp[cja >> 1] >> ((cja & 1)*numAtomsJCluster*egp_shift) & ((1 << (numAtomsJCluster*egp_shift)) - 1);
                    }
                }
                else
                {
                    cjr    = cja - (jGrid.cellOffset() >> 1);
                    /* Combine two ci fep masks/energrp */
                    fep_cj = jGrid.fepBits(cjr*2) + (jGrid.fepBits(cjr*2 + 1) << jGrid.geometry().numAtomsICluster);
                    if (ngid > 1)
                    {
                        gid_cj = nbatParams.energrp[cja*2] + (nbatParams.energrp[cja*2+1] << (jGrid.geometry().numAtomsICluster*egp_shift));
                    }
                }

                if (bFEP_i || fep_cj != 0)
                {
                    for (int j = 0; j < nbl->na_cj; j++)
                    {
                        /* Is this interaction perturbed and not excluded? */
                        ind_j = cja*nbl->na_cj + j;
                        aj    = atomIndices[ind_j];
                        if (aj >= 0 &&
                            (bFEP_i || (fep_cj & (1 << j))) &&
                            (!bDiagRemoved || ind_j >= ind_i))
                        {
                            if (ngid > 1)
                            {
                                gid_j = (gid_cj >> (j*egp_shift)) & egp_mask;
                                gid   = GID(gid_i, gid_j, ngid);

                                if (nlist->nrj > nlist->jindex[nri] &&
                                    nlist->gid[nri] != gid)
                                {
                                    /* Energy group pair changed: new list */
                                    fep_list_new_nri_copy(nlist);
                                    nri = nlist->nri;
                                }
                                nlist->gid[nri] = gid;
                            }

                            if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
                            {
                                fep_list_new_nri_copy(nlist);
                                nri = nlist->nri;
                            }

                            /* Add it to the FEP list */
                            nlist->jjnr[nlist->nrj]     = aj;
                            nlist->excl_fep[nlist->nrj] = (nbl->cj[cj_ind].excl >> (i*nbl->na_cj + j)) & 1;
                            nlist->nrj++;

                            /* Exclude it from the normal list.
                             * Note that the charge has been set to zero,
                             * but we need to avoid 0/0, as perturbed atoms
                             * can be on top of each other.
                             */
                            nbl->cj[cj_ind].excl &= ~(1U << (i*nbl->na_cj + j));
                        }
                    }
                }
            }

            if (nlist->nrj > nlist->jindex[nri])
            {
                /* Actually add this new, non-empty, list */
                nlist->nri++;
                nlist->jindex[nlist->nri] = nlist->nrj;
            }
        }
    }

    if (bFEP_i_all)
    {
        /* All interactions are perturbed, we can skip this entry */
        nbl_ci->cj_ind_end = cj_ind_start;
        nbl->ncjInUse     -= cj_ind_end - cj_ind_start;
    }
}
/* Return the index of cluster cj within its cj4 group */
static inline int cj_mod_cj4(int cj)
{
    return cj & (c_nbnxnGpuJgroupSize - 1);
}

/* Convert a j-cluster index to a cj4 group index */
static inline int cj_to_cj4(int cj)
{
    return cj/c_nbnxnGpuJgroupSize;
}

/* Return the index of a j-atom within a warp */
static inline int a_mod_wj(int a)
{
    return a & (c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit - 1);
}
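/* Illustrative note (added for clarity, not part of the original file): assuming
 * c_nbnxnGpuJgroupSize == 4, c_nbnxnGpuClusterSize == 8 and
 * c_nbnxnGpuClusterpairSplit == 2, cj_to_cj4(10) == 2, cj_mod_cj4(10) == 2 and
 * a_mod_wj(13) == 13 & 3 == 1.
 */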
/* As make_fep_list above, but for super/sub lists. */
static void make_fep_list(gmx::ArrayRef<const int>  atomIndices,
                          const nbnxn_atomdata_t   *nbat,
                          NbnxnPairlistGpu         *nbl,
                          gmx_bool                  bDiagRemoved,
                          const nbnxn_sci_t        *nbl_sci,
                          real                      shx,
                          real                      shy,
                          real                      shz,
                          real                      rlist_fep2,
                          const Grid               &iGrid,
                          const Grid               &jGrid,
                          t_nblist                 *nlist)
{
    int                nri_max;
    int                c_abs;
    int                ind_i, ind_j, ai, aj;
    int                nri;
    gmx_bool           bFEP_i;
    real               xi, yi, zi;
    const nbnxn_cj4_t *cj4;

    const int numJClusterGroups = nbl_sci->numJClusterGroups();
    if (numJClusterGroups == 0)
    {
        /* Empty list */
        return;
    }

    const int sci           = nbl_sci->sci;

    const int cj4_ind_start = nbl_sci->cj4_ind_start;
    const int cj4_ind_end   = nbl_sci->cj4_ind_end;

    /* Here we process one super-cell, max #atoms na_sc, versus a list
     * cj4 entries, each with max c_nbnxnGpuJgroupSize cj's, each
     * of size na_cj atoms.
     * On the GPU we don't support energy groups (yet).
     * So for each of the na_sc i-atoms, we need max one FEP list
     * for each max_nrj_fep j-atoms.
     */
    nri_max = nbl->na_sc*nbl->na_cj*(1 + (numJClusterGroups*c_nbnxnGpuJgroupSize)/max_nrj_fep);
    if (nlist->nri + nri_max > nlist->maxnri)
    {
        nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
        reallocate_nblist(nlist);
    }

    /* Loop over the atoms in the i super-cluster */
    for (int c = 0; c < c_gpuNumClusterPerCell; c++)
    {
        c_abs = sci*c_gpuNumClusterPerCell + c;

        for (int i = 0; i < nbl->na_ci; i++)
        {
            ind_i = c_abs*nbl->na_ci + i;
            ai    = atomIndices[ind_i];
            if (ai >= 0)
            {
                nri                  = nlist->nri;
                nlist->jindex[nri+1] = nlist->jindex[nri];
                nlist->iinr[nri]     = ai;
                /* With GPUs, energy groups are not supported */
                nlist->gid[nri]      = 0;
                nlist->shift[nri]    = nbl_sci->shift & NBNXN_CI_SHIFT;

                bFEP_i = iGrid.atomIsPerturbed(c_abs - iGrid.cellOffset()*c_gpuNumClusterPerCell, i);

                xi = nbat->x()[ind_i*nbat->xstride+XX] + shx;
                yi = nbat->x()[ind_i*nbat->xstride+YY] + shy;
                zi = nbat->x()[ind_i*nbat->xstride+ZZ] + shz;

                const int nrjMax = nlist->nrj + numJClusterGroups*c_nbnxnGpuJgroupSize*nbl->na_cj;
                if (nrjMax > nlist->maxnrj)
                {
                    nlist->maxnrj = over_alloc_small(nrjMax);
                    srenew(nlist->jjnr,     nlist->maxnrj);
                    srenew(nlist->excl_fep, nlist->maxnrj);
                }

                for (int cj4_ind = cj4_ind_start; cj4_ind < cj4_ind_end; cj4_ind++)
                {
                    cj4 = &nbl->cj4[cj4_ind];

                    for (int gcj = 0; gcj < c_nbnxnGpuJgroupSize; gcj++)
                    {
                        if ((cj4->imei[0].imask & (1U << (gcj*c_gpuNumClusterPerCell + c))) == 0)
                        {
                            /* Skip this ci for this cj */
                            continue;
                        }

                        const int cjr =
                            cj4->cj[gcj] - jGrid.cellOffset()*c_gpuNumClusterPerCell;

                        if (bFEP_i || jGrid.clusterIsPerturbed(cjr))
                        {
                            for (int j = 0; j < nbl->na_cj; j++)
                            {
                                /* Is this interaction perturbed and not excluded? */
                                ind_j = (jGrid.cellOffset()*c_gpuNumClusterPerCell + cjr)*nbl->na_cj + j;
                                aj    = atomIndices[ind_j];
                                if (aj >= 0 &&
                                    (bFEP_i || jGrid.atomIsPerturbed(cjr, j)) &&
                                    (!bDiagRemoved || ind_j >= ind_i))
                                {
                                    int          excl_pair;
                                    unsigned int excl_bit;
                                    real         dx, dy, dz;

                                    const int     jHalf = j/(c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit);
                                    nbnxn_excl_t &excl  =
                                        get_exclusion_mask(nbl, cj4_ind, jHalf);

                                    excl_pair = a_mod_wj(j)*nbl->na_ci + i;
                                    excl_bit  = (1U << (gcj*c_gpuNumClusterPerCell + c));

                                    dx = nbat->x()[ind_j*nbat->xstride+XX] - xi;
                                    dy = nbat->x()[ind_j*nbat->xstride+YY] - yi;
                                    dz = nbat->x()[ind_j*nbat->xstride+ZZ] - zi;

                                    /* The unpruned GPU list has more than 2/3
                                     * of the atom pairs beyond rlist. Using
                                     * this list will cause a lot of overhead
                                     * in the CPU FEP kernels, especially
                                     * relative to the fast GPU kernels.
                                     * So we prune the FEP list here.
                                     */
                                    if (dx*dx + dy*dy + dz*dz < rlist_fep2)
                                    {
                                        if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
                                        {
                                            fep_list_new_nri_copy(nlist);
                                            nri = nlist->nri;
                                        }

                                        /* Add it to the FEP list */
                                        nlist->jjnr[nlist->nrj]     = aj;
                                        nlist->excl_fep[nlist->nrj] = (excl.pair[excl_pair] & excl_bit) ? 1 : 0;
                                        nlist->nrj++;
                                    }

                                    /* Exclude it from the normal list.
                                     * Note that the charge and LJ parameters have
                                     * been set to zero, but we need to avoid 0/0,
                                     * as perturbed atoms can be on top of each other.
                                     */
                                    excl.pair[excl_pair] &= ~excl_bit;
                                }

                                /* Note that we could mask out this pair in imask
                                 * if all i- and/or all j-particles are perturbed.
                                 * But since the perturbed pairs on the CPU will
                                 * take an order of magnitude more time, the GPU
                                 * will finish before the CPU and there is no gain.
                                 */
                            }
                        }
                    }
                }

                if (nlist->nrj > nlist->jindex[nri])
                {
                    /* Actually add this new, non-empty, list */
                    nlist->nri++;
                    nlist->jindex[nlist->nri] = nlist->nrj;
                }
            }
        }
    }
}
/* Set all atom-pair exclusions for a GPU type list i-entry
 *
 * Sets all atom-pair exclusions from the topology stored in exclusions
 * as masks in the pair-list for i-super-cluster list entry iEntry.
 */
static void
setExclusionsForIEntry(const Nbnxm::GridSet &gridSet,
                       NbnxnPairlistGpu     *nbl,
                       gmx_bool              diagRemoved,
                       int gmx_unused        na_cj_2log,
                       const nbnxn_sci_t    &iEntry,
                       const t_blocka       &exclusions)
{
    if (iEntry.numJClusterGroups() == 0)
    {
        /* Empty list */
        return;
    }

    /* Set the search ranges using start and end j-cluster indices.
     * Note that here we can not use cj4_ind_end, since the last cj4
     * can be only partially filled, so we use cj_ind.
     */
    const JListRanges ranges(iEntry.cj4_ind_start*c_nbnxnGpuJgroupSize,
                             nbl->work->cj_ind,
                             gmx::makeConstArrayRef(nbl->cj4));

    GMX_ASSERT(nbl->na_ci == c_nbnxnGpuClusterSize, "na_ci should match the GPU cluster size");
    constexpr int c_clusterSize      = c_nbnxnGpuClusterSize;
    constexpr int c_superClusterSize = c_nbnxnGpuNumClusterPerSupercluster*c_nbnxnGpuClusterSize;

    const int iSuperCluster = iEntry.sci;

    gmx::ArrayRef<const int> atomIndices = gridSet.atomIndices();
    gmx::ArrayRef<const int> cell        = gridSet.cells();

    /* Loop over the atoms in the i super-cluster */
    for (int i = 0; i < c_superClusterSize; i++)
    {
        const int iIndex = iSuperCluster*c_superClusterSize + i;
        const int iAtom  = atomIndices[iIndex];
        if (iAtom >= 0)
        {
            const int iCluster = i/c_clusterSize;

            /* Loop over the topology-based exclusions for this i-atom */
            for (int exclIndex = exclusions.index[iAtom]; exclIndex < exclusions.index[iAtom + 1]; exclIndex++)
            {
                const int jAtom = exclusions.a[exclIndex];

                if (jAtom == iAtom)
                {
                    /* The self exclusions are already set, save some time */
                    continue;
                }

                /* Get the index of the j-atom in the nbnxn atom data */
                const int jIndex = cell[jAtom];

                /* Without shifts we only calculate interactions j>i
                 * for one-way pair-lists.
                 */
                /* NOTE: We would like to use iIndex on the right hand side,
                 * but that makes this routine 25% slower with gcc6/7.
                 * Even using c_superClusterSize makes it slower.
                 * Either of these changes triggers peeling of the exclIndex
                 * loop, which apparently leads to far less efficient code.
                 */
                if (diagRemoved && jIndex <= iSuperCluster*nbl->na_sc + i)
                {
                    continue;
                }

                const int jCluster = jIndex/c_clusterSize;

                /* Check whether the cluster is in our list */
                if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
                {
                    const int index =
                        findJClusterInJList(jCluster, ranges,
                                            gmx::makeConstArrayRef(nbl->cj4));

                    if (index >= 0)
                    {
                        /* We found an exclusion, clear the corresponding
                         * interaction bit.
                         */
                        const unsigned int pairMask = (1U << (cj_mod_cj4(index)*c_gpuNumClusterPerCell + iCluster));
                        /* Check if the i-cluster interacts with the j-cluster */
                        if (nbl_imask0(nbl, index) & pairMask)
                        {
                            const int innerI = (i      & (c_clusterSize - 1));
                            const int innerJ = (jIndex & (c_clusterSize - 1));

                            /* Determine which j-half (CUDA warp) we are in */
                            const int jHalf = innerJ/(c_clusterSize/c_nbnxnGpuClusterpairSplit);

                            nbnxn_excl_t &interactionMask =
                                get_exclusion_mask(nbl, cj_to_cj4(index), jHalf);

                            interactionMask.pair[a_mod_wj(innerJ)*c_clusterSize + innerI] &= ~pairMask;
                        }
                    }
                }
            }
        }
    }
}
/* Make a new ci entry at the back of nbl->ci */
static void addNewIEntry(NbnxnPairlistCpu *nbl, int ci, int shift, int flags)
{
    nbnxn_ci_t ciEntry;
    ciEntry.ci           = ci;
    ciEntry.shift        = shift;
    /* Store the interaction flags along with the shift */
    ciEntry.shift       |= flags;
    ciEntry.cj_ind_start = nbl->cj.size();
    ciEntry.cj_ind_end   = nbl->cj.size();
    nbl->ci.push_back(ciEntry);
}

/* Make a new sci entry at index nbl->nsci */
static void addNewIEntry(NbnxnPairlistGpu *nbl, int sci, int shift, int gmx_unused flags)
{
    nbnxn_sci_t sciEntry;
    sciEntry.sci           = sci;
    sciEntry.shift         = shift;
    sciEntry.cj4_ind_start = nbl->cj4.size();
    sciEntry.cj4_ind_end   = nbl->cj4.size();

    nbl->sci.push_back(sciEntry);
}
/* Sort the simple j-list cj on exclusions.
 * Entries with exclusions will all be sorted to the beginning of the list.
 */
static void sort_cj_excl(nbnxn_cj_t *cj, int ncj,
                         NbnxnPairlistCpuWork *work)
{
    work->cj.resize(ncj);

    /* Make a list of the j-cells involving exclusions */
    int jnew = 0;
    for (int j = 0; j < ncj; j++)
    {
        if (cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
        {
            work->cj[jnew++] = cj[j];
        }
    }
    /* Check if there are exclusions at all or not just the first entry */
    if (!((jnew == 0) ||
          (jnew == 1 && cj[0].excl != NBNXN_INTERACTION_MASK_ALL)))
    {
        for (int j = 0; j < ncj; j++)
        {
            if (cj[j].excl == NBNXN_INTERACTION_MASK_ALL)
            {
                work->cj[jnew++] = cj[j];
            }
        }
        for (int j = 0; j < ncj; j++)
        {
            cj[j] = work->cj[j];
        }
    }
}

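/* sort_cj_excl() above performs a stable two-pass partition: j-entries that
 * carry exclusions are moved to the front, all others follow in their
 * original order. The sketch below shows the equivalent operation with the
 * standard library on a simplified entry type; it assumes <algorithm> and
 * <vector> are available and is for illustration only, not a replacement of
 * the code above.
 */
struct IllustrativeJEntry
{
    int      cj;
    unsigned excl;
};

gmx_unused static void
illustratePartitionOnExclusions(std::vector<IllustrativeJEntry> *cjList,
                                unsigned                         maskAll)
{
    /* Entries whose mask differs from "all interactions" come first;
     * the relative order within each group is preserved. */
    std::stable_partition(cjList->begin(), cjList->end(),
                          [maskAll](const IllustrativeJEntry &e) { return e.excl != maskAll; });
}
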
/* Close this simple list i entry */
static void closeIEntry(NbnxnPairlistCpu *nbl,
                        int gmx_unused    sp_max_av,
                        gmx_bool gmx_unused progBal,
                        float gmx_unused  nsp_tot_est,
                        int gmx_unused    thread,
                        int gmx_unused    nthread)
{
    nbnxn_ci_t &ciEntry = nbl->ci.back();

    /* All content of the new ci entry has already been filled correctly,
     * we only need to sort and increase counts or remove the entry when empty.
     */
    const int jlen = ciEntry.cj_ind_end - ciEntry.cj_ind_start;
    if (jlen > 0)
    {
        sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work.get());

        /* The counts below are used for non-bonded pair/flop counts
         * and should therefore match the available kernel setups.
         */
        if (!(ciEntry.shift & NBNXN_CI_DO_COUL(0)))
        {
            nbl->work->ncj_noq += jlen;
        }
        else if ((ciEntry.shift & NBNXN_CI_HALF_LJ(0)) ||
                 !(ciEntry.shift & NBNXN_CI_DO_LJ(0)))
        {
            nbl->work->ncj_hlj += jlen;
        }
    }
    else
    {
        /* Entry is empty: remove it */
        nbl->ci.pop_back();
    }
}

/* Split sci entry for load balancing on the GPU.
 * Splitting ensures we have enough lists to fully utilize the whole GPU.
 * With progBal we generate progressively smaller lists, which improves
 * load balancing. As we only know the current count on our own thread,
 * we will need to estimate the current total amount of i-entries.
 * As the lists get concatenated later, this estimate depends
 * both on nthread and our own thread index.
 */
static void split_sci_entry(NbnxnPairlistGpu *nbl,
                            int nsp_target_av,
                            gmx_bool progBal, float nsp_tot_est,
                            int thread, int nthread)
{
    int nsp_max;

    if (progBal)
    {
        /* Estimate the total number of ci's of the nblist combined
         * over all threads using the target number of ci's.
         */
        float nsp_est = (nsp_tot_est*thread)/nthread + nbl->nci_tot;

        /* The first ci blocks should be larger, to avoid overhead.
         * The last ci blocks should be smaller, to improve load balancing.
         * The factor 3/2 makes the first block 3/2 times the target average
         * and ensures that the total number of blocks ends up equal to
         * that of equally sized blocks of size nsp_target_av.
         */
        nsp_max = static_cast<int>(nsp_target_av*(nsp_tot_est*1.5/(nsp_est + nsp_tot_est)));
    }
    else
    {
        nsp_max = nsp_target_av;
    }

    const int cj4_start = nbl->sci.back().cj4_ind_start;
    const int cj4_end   = nbl->sci.back().cj4_ind_end;
    const int j4len     = cj4_end - cj4_start;

    if (j4len > 1 && j4len*c_gpuNumClusterPerCell*c_nbnxnGpuJgroupSize > nsp_max)
    {
        /* Modify the last ci entry and process the cj4's again */

        int nsp       = 0;
        int nsp_sci   = 0;
        int nsp_cj4_e = 0;
        int nsp_cj4   = 0;
        for (int cj4 = cj4_start; cj4 < cj4_end; cj4++)
        {
            int nsp_cj4_p = nsp_cj4;
            /* Count the number of cluster pairs in this cj4 group */
            nsp_cj4 = 0;
            for (int p = 0; p < c_gpuNumClusterPerCell*c_nbnxnGpuJgroupSize; p++)
            {
                nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
            }

            /* If adding the current cj4 with nsp_cj4 pairs gets us further
             * away from our target nsp_max, split the list before this cj4.
             */
            if (nsp > 0 && nsp_max - nsp < nsp + nsp_cj4 - nsp_max)
            {
                /* Split the list at cj4 */
                nbl->sci.back().cj4_ind_end = cj4;
                /* Create a new sci entry */
                nbnxn_sci_t sciNew;
                sciNew.sci           = nbl->sci.back().sci;
                sciNew.shift         = nbl->sci.back().shift;
                sciNew.cj4_ind_start = cj4;
                nbl->sci.push_back(sciNew);

                nsp_sci   = nsp;
                nsp_cj4_e = nsp_cj4_p;
                nsp       = 0;
            }
            nsp += nsp_cj4;
        }

        /* Put the remaining cj4's in the last sci entry */
        nbl->sci.back().cj4_ind_end = cj4_end;

        /* Possibly balance out the last two sci's
         * by moving the last cj4 of the second last sci.
         */
        if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
        {
            GMX_ASSERT(nbl->sci.size() >= 2, "We expect at least two elements");
            nbl->sci[nbl->sci.size() - 2].cj4_ind_end--;
            nbl->sci[nbl->sci.size() - 1].cj4_ind_start--;
        }
    }
}

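/* The progressive splitting above shrinks the per-entry pair target as the
 * list fills up: nsp_max = nsp_target_av*1.5*nsp_tot_est/(nsp_est + nsp_tot_est),
 * so the first blocks are about 1.5x the average and later blocks become
 * smaller. The helper below just evaluates that formula for illustration;
 * the numbers in the comment are an assumed example, not measured data.
 */
gmx_unused static int
illustrateProgressiveSplitTarget(int nsp_target_av, float nsp_est, float nsp_tot_est)
{
    /* e.g. nsp_target_av = 40, nsp_tot_est = 10000:
     *   at the start (nsp_est ~ 0)      -> nsp_max ~ 60 (1.5x the average)
     *   half-way     (nsp_est ~ 10000)  -> nsp_max ~ 30
     */
    return static_cast<int>(nsp_target_av*(nsp_tot_est*1.5/(nsp_est + nsp_tot_est)));
}
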
/* Close this super/sub list i entry */
static void closeIEntry(NbnxnPairlistGpu *nbl,
                        int nsp_max_av,
                        gmx_bool progBal, float nsp_tot_est,
                        int thread, int nthread)
{
    nbnxn_sci_t &sciEntry = *getOpenIEntry(nbl);

    /* All content of the new ci entry has already been filled correctly,
     * we only need to, potentially, split or remove the entry when empty.
     */
    int j4len = sciEntry.numJClusterGroups();
    if (j4len > 0)
    {
        /* We can only have complete blocks of 4 j-entries in a list,
         * so round the count up before closing.
         */
        int ncj4          = (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize;
        nbl->work->cj_ind = ncj4*c_nbnxnGpuJgroupSize;

        if (nsp_max_av > 0)
        {
            /* Measure the size of the new entry and potentially split it */
            split_sci_entry(nbl, nsp_max_av, progBal, nsp_tot_est,
                            thread, nthread);
        }
    }
    else
    {
        /* Entry is empty: remove it */
        nbl->sci.pop_back();
    }
}

/* Syncs the working array before adding another grid pair to the CPU list (no work needed) */
static void sync_work(NbnxnPairlistCpu gmx_unused *nbl)
{
}

/* Syncs the working array before adding another grid pair to the GPU list */
static void sync_work(NbnxnPairlistGpu *nbl)
{
    nbl->work->cj_ind = nbl->cj4.size()*c_nbnxnGpuJgroupSize;
}

/* Clears an NbnxnPairlistCpu data structure */
static void clear_pairlist(NbnxnPairlistCpu *nbl)
{
    nbl->ci.clear();
    nbl->cj.clear();
    nbl->ncjInUse = 0;
    nbl->nci_tot  = 0;
    nbl->ciOuter.clear();
    nbl->cjOuter.clear();

    nbl->work->ncj_noq = 0;
    nbl->work->ncj_hlj = 0;
}

/* Clears an NbnxnPairlistGpu data structure */
static void clear_pairlist(NbnxnPairlistGpu *nbl)
{
    nbl->sci.clear();
    nbl->cj4.clear();
    nbl->excl.resize(1);
    nbl->nci_tot = 0;
}

/* Clears a group scheme pair list */
static void clear_pairlist_fep(t_nblist *nl)
{
    nl->nri = 0;
    nl->nrj = 0;
    if (nl->jindex == nullptr)
    {
        snew(nl->jindex, 1);
    }
    nl->jindex[0] = 0;
}

/* Sets a simple list i-cell bounding box, including PBC shift */
static inline void set_icell_bb_simple(gmx::ArrayRef<const BoundingBox> bb,
                                       int ci,
                                       real shx, real shy, real shz,
                                       BoundingBox *bb_ci)
{
    bb_ci->lower.x = bb[ci].lower.x + shx;
    bb_ci->lower.y = bb[ci].lower.y + shy;
    bb_ci->lower.z = bb[ci].lower.z + shz;
    bb_ci->upper.x = bb[ci].upper.x + shx;
    bb_ci->upper.y = bb[ci].upper.y + shy;
    bb_ci->upper.z = bb[ci].upper.z + shz;
}

/* Sets a simple list i-cell bounding box, including PBC shift */
static inline void set_icell_bb(const Grid &iGrid,
                                int ci,
                                real shx, real shy, real shz,
                                NbnxnPairlistCpuWork *work)
{
    set_icell_bb_simple(iGrid.iBoundingBoxes(), ci, shx, shy, shz,
                        &work->iClusterData.bb[0]);
}

/* Sets the super-cell and sub-cell bounding boxes, including PBC shift */
static void set_icell_bbxxxx_supersub(gmx::ArrayRef<const float> bb,
                                      int ci,
                                      real shx, real shy, real shz,
                                      float *bb_ci)
{
    constexpr int cellBBStride = packedBoundingBoxesIndex(c_gpuNumClusterPerCell);
    constexpr int pbbStride    = c_packedBoundingBoxesDimSize;
    const int     ia           = ci*cellBBStride;
    for (int m = 0; m < cellBBStride; m += c_packedBoundingBoxesSize)
    {
        for (int i = 0; i < pbbStride; i++)
        {
            bb_ci[m + 0*pbbStride + i] = bb[ia + m + 0*pbbStride + i] + shx;
            bb_ci[m + 1*pbbStride + i] = bb[ia + m + 1*pbbStride + i] + shy;
            bb_ci[m + 2*pbbStride + i] = bb[ia + m + 2*pbbStride + i] + shz;
            bb_ci[m + 3*pbbStride + i] = bb[ia + m + 3*pbbStride + i] + shx;
            bb_ci[m + 4*pbbStride + i] = bb[ia + m + 4*pbbStride + i] + shy;
            bb_ci[m + 5*pbbStride + i] = bb[ia + m + 5*pbbStride + i] + shz;
        }
    }
}

/* Sets the super-cell and sub-cell bounding boxes, including PBC shift */
gmx_unused static void set_icell_bb_supersub(gmx::ArrayRef<const BoundingBox> bb,
                                             int ci,
                                             real shx, real shy, real shz,
                                             BoundingBox *bb_ci)
{
    for (int i = 0; i < c_gpuNumClusterPerCell; i++)
    {
        set_icell_bb_simple(bb, ci*c_gpuNumClusterPerCell + i,
                            shx, shy, shz,
                            &bb_ci[i]);
    }
}

/* Sets the super-cell and sub-cell bounding boxes, including PBC shift */
gmx_unused static void set_icell_bb(const Grid &iGrid,
                                    int ci,
                                    real shx, real shy, real shz,
                                    NbnxnPairlistGpuWork *work)
{
#if NBNXN_BBXXXX
    set_icell_bbxxxx_supersub(iGrid.packedBoundingBoxes(), ci, shx, shy, shz,
                              work->iSuperClusterData.bbPacked.data());
#else
    set_icell_bb_supersub(iGrid.iBoundingBoxes(), ci, shx, shy, shz,
                          work->iSuperClusterData.bb.data());
#endif
}

/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
static void icell_set_x_simple(int ci,
                               real shx, real shy, real shz,
                               int stride, const real *x,
                               NbnxnPairlistCpuWork::IClusterData *iClusterData)
{
    const int ia = ci*c_nbnxnCpuIClusterSize;

    for (int i = 0; i < c_nbnxnCpuIClusterSize; i++)
    {
        iClusterData->x[i*STRIDE_XYZ + XX] = x[(ia + i)*stride + XX] + shx;
        iClusterData->x[i*STRIDE_XYZ + YY] = x[(ia + i)*stride + YY] + shy;
        iClusterData->x[i*STRIDE_XYZ + ZZ] = x[(ia + i)*stride + ZZ] + shz;
    }
}

static void icell_set_x(int ci,
                        real shx, real shy, real shz,
                        int stride, const real *x,
                        const ClusterDistanceKernelType kernelType,
                        NbnxnPairlistCpuWork *work)
{
    switch (kernelType)
    {
#ifdef GMX_NBNXN_SIMD_4XN
        case ClusterDistanceKernelType::CpuSimd_4xM:
            icell_set_x_simd_4xn(ci, shx, shy, shz, stride, x, work);
            break;
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
        case ClusterDistanceKernelType::CpuSimd_2xMM:
            icell_set_x_simd_2xnn(ci, shx, shy, shz, stride, x, work);
            break;
#endif
        case ClusterDistanceKernelType::CpuPlainC:
            icell_set_x_simple(ci, shx, shy, shz, stride, x, &work->iClusterData);
            break;
        default:
            GMX_ASSERT(false, "Unhandled case");
    }
}

/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
static void icell_set_x(int ci,
                        real shx, real shy, real shz,
                        int stride, const real *x,
                        ClusterDistanceKernelType gmx_unused kernelType,
                        NbnxnPairlistGpuWork *work)
{
#if !GMX_SIMD4_HAVE_REAL

    real *x_ci = work->iSuperClusterData.x.data();

    int   ia   = ci*c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize;
    for (int i = 0; i < c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize; i++)
    {
        x_ci[i*DIM + XX] = x[(ia + i)*stride + XX] + shx;
        x_ci[i*DIM + YY] = x[(ia + i)*stride + YY] + shy;
        x_ci[i*DIM + ZZ] = x[(ia + i)*stride + ZZ] + shz;
    }

#else /* !GMX_SIMD4_HAVE_REAL */

    real *x_ci = work->iSuperClusterData.xSimd.data();

    for (int si = 0; si < c_gpuNumClusterPerCell; si++)
    {
        for (int i = 0; i < c_nbnxnGpuClusterSize; i += GMX_SIMD4_WIDTH)
        {
            int io = si*c_nbnxnGpuClusterSize + i;
            int ia = ci*c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize + io;
            for (int j = 0; j < GMX_SIMD4_WIDTH; j++)
            {
                x_ci[io*DIM + j + XX*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + XX] + shx;
                x_ci[io*DIM + j + YY*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + YY] + shy;
                x_ci[io*DIM + j + ZZ*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + ZZ] + shz;
            }
        }
    }

#endif /* !GMX_SIMD4_HAVE_REAL */
}

static real minimum_subgrid_size_xy(const Grid &grid)
{
    const Grid::Dimensions &dims = grid.dimensions();

    if (grid.geometry().isSimple)
    {
        return std::min(dims.cellSize[XX], dims.cellSize[YY]);
    }
    else
    {
        return std::min(dims.cellSize[XX]/c_gpuNumClusterPerCellX,
                        dims.cellSize[YY]/c_gpuNumClusterPerCellY);
    }
}

static real effective_buffer_1x1_vs_MxN(const Grid &iGrid,
                                        const Grid &jGrid)
{
    const real eff_1x1_buffer_fac_overest = 0.1;

    /* Determine an atom-pair list cut-off buffer size for atom pairs,
     * to be added to rlist (including buffer) used for MxN.
     * This is for converting an MxN list to a 1x1 list. This means we can't
     * use the normal buffer estimate, as we have an MxN list in which
     * some atom pairs beyond rlist are missing. We want to capture
     * the beneficial effect of buffering by extra pairs just outside rlist,
     * while removing the useless pairs that are further away from rlist.
     * (Also the buffer could have been set manually not using the estimate.)
     * This buffer size is an overestimate.
     * We add 10% of the smallest grid sub-cell dimensions.
     * Note that the z-size differs per cell and we don't use this,
     * so we overestimate.
     * With PME, the 10% value gives a buffer that is somewhat larger
     * than the effective buffer with a tolerance of 0.005 kJ/mol/ps.
     * Smaller tolerances or using RF lead to a smaller effective buffer,
     * so 10% gives a safe overestimate.
     */
    return eff_1x1_buffer_fac_overest*(minimum_subgrid_size_xy(iGrid) +
                                       minimum_subgrid_size_xy(jGrid));
}

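/* The 1x1-vs-MxN buffer above is simply 10% of the summed minimum sub-cell
 * x/y sizes of the two grids. A worked example with assumed cell sizes
 * (not taken from any real system): sub-cells of 0.30 nm and 0.25 nm give
 * an extra FEP pair-list buffer of 0.1*(0.30 + 0.25) = 0.055 nm. The helper
 * below is illustrative only.
 */
gmx_unused static real
illustrateFepBufferEstimate(real minSubCellSizeI, real minSubCellSizeJ)
{
    const real overestimationFactor = 0.1;

    return overestimationFactor*(minSubCellSizeI + minSubCellSizeJ);
}
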
/* Estimates the interaction volume^2 for non-local interactions */
static real nonlocal_vol2(const struct gmx_domdec_zones_t *zones, const rvec ls, real r)
{
    real vol2_est_tot = 0;

    /* Here we simply add up the volume^2 of the 1, 2 or 3 non-home
     * interaction zones of the 1D decomposition. As these volumes are not
     * additive, this is an overestimate, but it would only be significant
     * in the limit of small cells, where we anyhow need to split the lists
     * into as small parts as possible.
     */

    for (int z = 0; z < zones->n; z++)
    {
        if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
        {
            real cl = 0;
            real ca = 1;
            real za = 1;
            for (int d = 0; d < DIM; d++)
            {
                if (zones->shift[z][d] == 0)
                {
                    cl += 0.5*ls[d];
                    ca *= ls[d];
                    za *= zones->size[z].x1[d] - zones->size[z].x0[d];
                }
            }

            /* 4 octants of a sphere */
            real vold_est  = 0.25*M_PI*r*r*r*r;
            /* 4 quarter pie slices on the edges */
            vold_est      += 4*cl*M_PI/6.0*r*r*r;
            /* One rectangular volume on a face */
            vold_est      += ca*0.5*r*r;

            vol2_est_tot  += vold_est*za;
        }
    }

    return vol2_est_tot;
}

/* Estimates the average size of a full j-list for super/sub setup */
static void get_nsubpair_target(const Nbnxm::GridSet      &gridSet,
                                const InteractionLocality  iloc,
                                const real                 rlist,
                                const int                  min_ci_balanced,
                                int                       *nsubpair_target,
                                float                     *nsubpair_tot_est)
{
    /* The target value of 36 seems to be the optimum for Kepler.
     * Maxwell is less sensitive to the exact value.
     */
    const int nsubpair_target_min = 36;
    real      r_eff_sup, vol_est, nsp_est, nsp_est_nl;

    const Grid &grid = gridSet.grids()[0];

    /* We don't need to balance list sizes if:
     * - We didn't request balancing.
     * - The number of grid cells >= the number of lists requested,
     *   since we will always generate at least #cells lists.
     * - We don't have any cells, since then there won't be any lists.
     */
    if (min_ci_balanced <= 0 || grid.numCells() >= min_ci_balanced || grid.numCells() == 0)
    {
        /* nsubpair_target==0 signals no balancing */
        *nsubpair_target  = 0;
        *nsubpair_tot_est = 0;

        return;
    }

    rvec                    ls;
    const int               numAtomsCluster = grid.geometry().numAtomsICluster;
    const Grid::Dimensions &dims            = grid.dimensions();

    ls[XX] = dims.cellSize[XX]/c_gpuNumClusterPerCellX;
    ls[YY] = dims.cellSize[YY]/c_gpuNumClusterPerCellY;
    ls[ZZ] = numAtomsCluster/(dims.atomDensity*ls[XX]*ls[YY]);

    /* The formulas below are a heuristic estimate of the average nsj per si */
    r_eff_sup = rlist + nbnxn_get_rlist_effective_inc(numAtomsCluster, ls);

    if (!gridSet.domainSetup().haveMultipleDomains ||
        gridSet.domainSetup().zones->n == 1)
    {
        nsp_est_nl = 0;
    }
    else
    {
        nsp_est_nl =
            gmx::square(dims.atomDensity/numAtomsCluster)*
            nonlocal_vol2(gridSet.domainSetup().zones, ls, r_eff_sup);
    }

    if (iloc == InteractionLocality::Local)
    {
        /* Sub-cell interacts with itself */
        vol_est  = ls[XX]*ls[YY]*ls[ZZ];
        /* 6/2 rectangular volume on the faces */
        vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
        /* 12/2 quarter pie slices on the edges */
        vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*gmx::square(r_eff_sup);
        /* 4 octants of a sphere */
        vol_est += 0.5*4.0/3.0*M_PI*gmx::power3(r_eff_sup);

        /* Estimate the number of cluster pairs as the local number of
         * clusters times the volume they interact with times the density.
         */
        nsp_est = grid.numClusters()*vol_est*dims.atomDensity/numAtomsCluster;

        /* Subtract the non-local pair count */
        nsp_est -= nsp_est_nl;

        /* For small cut-offs nsp_est will be an underestimate.
         * With DD nsp_est_nl is an overestimate so nsp_est can get negative.
         * So to avoid too small or negative nsp_est we set a minimum of
         * all cells interacting with all 3^3 direct neighbors (3^3-1)/2+1=14.
         * This might be a slight overestimate for small non-periodic groups of
         * atoms as will occur for a local domain with DD, but for small
         * groups of atoms we'll anyhow be limited by nsubpair_target_min,
         * so this overestimation will not matter.
         */
        nsp_est = std::max(nsp_est, grid.numClusters()*14._real);

        if (debug)
        {
            fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
                    nsp_est, nsp_est_nl);
        }
    }
    else
    {
        nsp_est = nsp_est_nl;
    }

    /* Thus the (average) maximum j-list size should be as follows.
     * Since there is overhead, we shouldn't make the lists too small
     * (and we can't chop up j-groups) so we use a minimum target size of 36.
     */
    *nsubpair_target  = std::max(nsubpair_target_min,
                                 roundToInt(nsp_est/min_ci_balanced));
    *nsubpair_tot_est = static_cast<int>(nsp_est);

    if (debug)
    {
        fprintf(debug, "nbl nsp estimate %.1f, nsubpair_target %d\n",
                nsp_est, *nsubpair_target);
    }
}

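/* The balancing target above is the estimated cluster-pair count divided by
 * the minimum number of lists we want, clamped from below so lists do not
 * become too small. The sketch evaluates only that final step with assumed
 * inputs; it does not reproduce the volume estimate and is illustrative only.
 */
gmx_unused static int
illustrateNsubpairTarget(float nsp_est, int min_ci_balanced)
{
    const int targetMin = 36; /* same floor as used above */

    /* e.g. nsp_est = 7200 with min_ci_balanced = 240 gives the floor value 36;
     * nsp_est = 24000 with the same list count gives 100. */
    return std::max(targetMin, static_cast<int>(nsp_est/min_ci_balanced + 0.5F));
}
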
/* Debug list print function */
static void print_nblist_ci_cj(FILE *fp,
                               const NbnxnPairlistCpu &nbl)
{
    for (const nbnxn_ci_t &ciEntry : nbl.ci)
    {
        fprintf(fp, "ci %4d  shift %2d  ncj %3d\n",
                ciEntry.ci, ciEntry.shift,
                ciEntry.cj_ind_end - ciEntry.cj_ind_start);

        for (int j = ciEntry.cj_ind_start; j < ciEntry.cj_ind_end; j++)
        {
            fprintf(fp, "  cj %5d  imask %x\n",
                    nbl.cj[j].cj,
                    nbl.cj[j].excl);
        }
    }
}

/* Debug list print function */
static void print_nblist_sci_cj(FILE *fp,
                                const NbnxnPairlistGpu &nbl)
{
    for (const nbnxn_sci_t &sci : nbl.sci)
    {
        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d\n",
                sci.sci, sci.shift,
                sci.numJClusterGroups());

        int ncp = 0;
        for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
        {
            for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
            {
                fprintf(fp, "  sj %5d  imask %x\n",
                        nbl.cj4[j4].cj[j],
                        nbl.cj4[j4].imei[0].imask);
                for (int si = 0; si < c_gpuNumClusterPerCell; si++)
                {
                    if (nbl.cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
                    {
                        ncp++;
                    }
                }
            }
        }
        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d ncp %3d\n",
                sci.sci, sci.shift,
                sci.numJClusterGroups(),
                ncp);
    }
}

/* Combine pair lists *nbl generated on multiple threads into nblc */
static void combine_nblists(gmx::ArrayRef<const NbnxnPairlistGpu> nbls,
                            NbnxnPairlistGpu *nblc)
{
    int nsci  = nblc->sci.size();
    int ncj4  = nblc->cj4.size();
    int nexcl = nblc->excl.size();
    for (auto &nbl : nbls)
    {
        nsci  += nbl.sci.size();
        ncj4  += nbl.cj4.size();
        nexcl += nbl.excl.size();
    }

    /* Resize with the final, combined size, so we can fill in parallel */
    /* NOTE: For better performance we should use default initialization */
    nblc->sci.resize(nsci);
    nblc->cj4.resize(ncj4);
    nblc->excl.resize(nexcl);

    /* Each thread should copy its own data to the combined arrays,
     * as otherwise data will go back and forth between different caches.
     */
#if GMX_OPENMP && !(defined __clang_analyzer__)
    int nthreads = gmx_omp_nthreads_get(emntPairsearch);
#endif

#pragma omp parallel for num_threads(nthreads) schedule(static)
    for (int n = 0; n < nbls.ssize(); n++)
    {
        try
        {
            /* Determine the offset in the combined data for our thread.
             * Note that the original sizes in nblc are lost.
             */
            int sci_offset  = nsci;
            int cj4_offset  = ncj4;
            int excl_offset = nexcl;

            for (int i = n; i < nbls.ssize(); i++)
            {
                sci_offset  -= nbls[i].sci.size();
                cj4_offset  -= nbls[i].cj4.size();
                excl_offset -= nbls[i].excl.size();
            }

            const NbnxnPairlistGpu &nbli = nbls[n];

            for (size_t i = 0; i < nbli.sci.size(); i++)
            {
                nblc->sci[sci_offset + i]                = nbli.sci[i];
                nblc->sci[sci_offset + i].cj4_ind_start += cj4_offset;
                nblc->sci[sci_offset + i].cj4_ind_end   += cj4_offset;
            }

            for (size_t j4 = 0; j4 < nbli.cj4.size(); j4++)
            {
                nblc->cj4[cj4_offset + j4]                   = nbli.cj4[j4];
                nblc->cj4[cj4_offset + j4].imei[0].excl_ind += excl_offset;
                nblc->cj4[cj4_offset + j4].imei[1].excl_ind += excl_offset;
            }

            for (size_t j4 = 0; j4 < nbli.excl.size(); j4++)
            {
                nblc->excl[excl_offset + j4] = nbli.excl[j4];
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    for (auto &nbl : nbls)
    {
        nblc->nci_tot += nbl.nci_tot;
    }
}

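/* Each thread in combine_nblists() writes into the combined arrays at an
 * offset equal to the total size minus the sizes of its own and all later
 * source lists, i.e. an exclusive prefix sum of the source sizes. The sketch
 * below computes such offsets for plain size vectors; it assumes <vector> is
 * available and is illustration only.
 */
gmx_unused static std::vector<int>
illustrateConcatenationOffsets(const std::vector<int> &sourceSizes)
{
    std::vector<int> offsets(sourceSizes.size(), 0);

    int runningOffset = 0;
    for (size_t n = 0; n < sourceSizes.size(); n++)
    {
        offsets[n]     = runningOffset; /* where source list n starts in the combined list */
        runningOffset += sourceSizes[n];
    }

    return offsets;
}
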
static void balance_fep_lists(gmx::ArrayRef<std::unique_ptr<t_nblist>> fepLists,
                              gmx::ArrayRef<PairsearchWork>            work)
{
    const int numLists = fepLists.ssize();

    if (numLists == 1)
    {
        /* Nothing to balance */
        return;
    }

    /* Count the total i-lists and pairs */
    int nri_tot = 0;
    int nrj_tot = 0;
    for (const auto &list : fepLists)
    {
        nri_tot += list->nri;
        nrj_tot += list->nrj;
    }

    const int nrj_target = (nrj_tot + numLists - 1)/numLists;

    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == numLists,
               "We should have as many work objects as FEP lists");

#pragma omp parallel for schedule(static) num_threads(numLists)
    for (int th = 0; th < numLists; th++)
    {
        try
        {
            t_nblist *nbl = work[th].nbl_fep.get();

            /* Note that here we allocate for the total size, instead of
             * a per-thread estimate (which is hard to obtain).
             */
            if (nri_tot > nbl->maxnri)
            {
                nbl->maxnri = over_alloc_large(nri_tot);
                reallocate_nblist(nbl);
            }
            if (nri_tot > nbl->maxnri || nrj_tot > nbl->maxnrj)
            {
                nbl->maxnrj = over_alloc_small(nrj_tot);
                srenew(nbl->jjnr, nbl->maxnrj);
                srenew(nbl->excl_fep, nbl->maxnrj);
            }

            clear_pairlist_fep(nbl);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    /* Loop over the source lists and assign and copy i-entries */
    int       th_dest = 0;
    t_nblist *nbld    = work[th_dest].nbl_fep.get();
    for (int th = 0; th < numLists; th++)
    {
        const t_nblist *nbls = fepLists[th].get();

        for (int i = 0; i < nbls->nri; i++)
        {
            /* The number of pairs in this i-entry */
            const int nrj = nbls->jindex[i + 1] - nbls->jindex[i];

            /* Decide if list th_dest is too large and we should proceed
             * to the next destination list.
             */
            if (th_dest + 1 < numLists && nbld->nrj > 0 &&
                nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj)
            {
                th_dest++;
                nbld = work[th_dest].nbl_fep.get();
            }

            nbld->iinr[nbld->nri]  = nbls->iinr[i];
            nbld->gid[nbld->nri]   = nbls->gid[i];
            nbld->shift[nbld->nri] = nbls->shift[i];

            for (int j = nbls->jindex[i]; j < nbls->jindex[i + 1]; j++)
            {
                nbld->jjnr[nbld->nrj]     = nbls->jjnr[j];
                nbld->excl_fep[nbld->nrj] = nbls->excl_fep[j];
                nbld->nrj++;
            }
            nbld->nri++;
            nbld->jindex[nbld->nri] = nbld->nrj;
        }
    }

    /* Swap the list pointers */
    for (int th = 0; th < numLists; th++)
    {
        fepLists[th].swap(work[th].nbl_fep);

        if (debug)
        {
            fprintf(debug, "nbl_fep[%d] nri %4d  nrj %4d\n",
                    th,
                    fepLists[th]->nri,
                    fepLists[th]->nrj);
        }
    }
}

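/* balance_fep_lists() above assigns whole i-entries greedily: it moves to the
 * next destination list as soon as adding the next i-entry would overshoot
 * the per-list pair target by more than the current deficit. The sketch below
 * applies the same rule to plain pair counts and returns the destination list
 * index per i-entry; it assumes <vector> is available and is illustrative only.
 */
gmx_unused static std::vector<int>
illustrateGreedyListAssignment(const std::vector<int> &pairsPerIEntry, int numLists)
{
    int nrjTotal = 0;
    for (int nrj : pairsPerIEntry)
    {
        nrjTotal += nrj;
    }
    const int nrjTarget = (nrjTotal + numLists - 1)/numLists;

    std::vector<int> destination(pairsPerIEntry.size(), 0);
    int              dest    = 0;
    int              destNrj = 0;
    for (size_t i = 0; i < pairsPerIEntry.size(); i++)
    {
        const int nrj = pairsPerIEntry[i];
        /* Advance when the overshoot would exceed the remaining deficit */
        if (dest + 1 < numLists && destNrj > 0 &&
            destNrj + nrj - nrjTarget > nrjTarget - destNrj)
        {
            dest++;
            destNrj = 0;
        }
        destination[i]  = dest;
        destNrj        += nrj;
    }

    return destination;
}
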
/* Returns the next ci to be processed by our thread */
static gmx_bool next_ci(const Grid &grid,
                        int nth, int ci_block,
                        int *ci_x, int *ci_y,
                        int *ci_b, int *ci)
{
    (*ci_b)++;
    (*ci)++;

    if (*ci_b == ci_block)
    {
        /* Jump to the next block assigned to this task */
        *ci   += (nth - 1)*ci_block;
        *ci_b  = 0;
    }

    if (*ci >= grid.numCells())
    {
        return FALSE;
    }

    while (*ci >= grid.firstCellInColumn(*ci_x*grid.dimensions().numCells[YY] + *ci_y + 1))
    {
        *ci_y += 1;
        if (*ci_y == grid.dimensions().numCells[YY])
        {
            *ci_x += 1;
            *ci_y  = 0;
        }
    }

    return TRUE;
}

/* Returns the distance^2 for which we put cell pairs in the list
 * without checking atom pair distances. This is usually < rlist^2.
 */
static float boundingbox_only_distance2(const Grid::Dimensions &iGridDims,
                                        const Grid::Dimensions &jGridDims,
                                        real                    rlist,
                                        bool                    simple)
{
    /* If the distance between two sub-cell bounding boxes is less
     * than this distance, do not check the distance between
     * all particle pairs in the sub-cell, since then it is likely
     * that the box pair has atom pairs within the cut-off.
     * We use the nblist cut-off minus 0.5 times the average x/y diagonal
     * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
     * Using more than 0.5 gains at most 0.5%.
     * If forces are calculated more than twice, the performance gain
     * in the force calculation outweighs the cost of checking.
     * Note that with sub-cell lists, the atom-pair distance check
     * is only performed when only 1 out of 8 sub-cells is within range,
     * because the GPU is much faster than the CPU.
     */
    real bbx, bby;
    real rbb2;

    bbx = 0.5*(iGridDims.cellSize[XX] + jGridDims.cellSize[XX]);
    bby = 0.5*(iGridDims.cellSize[YY] + jGridDims.cellSize[YY]);
    if (!simple)
    {
        bbx /= c_gpuNumClusterPerCellX;
        bby /= c_gpuNumClusterPerCellY;
    }

    rbb2 = std::max(0.0, rlist - 0.5*std::sqrt(bbx*bbx + bby*bby));
    rbb2 = rbb2*rbb2;

#if !GMX_DOUBLE
    return rbb2;
#else
    return (float)((1 + GMX_FLOAT_EPS)*rbb2);
#endif
}

static int get_ci_block_size(const Grid &iGrid,
                             const bool  haveMultipleDomains,
                             const int   numLists)
{
    const int ci_block_enum      = 5;
    const int ci_block_denom     = 11;
    const int ci_block_min_atoms = 16;
    int       ci_block;

    /* Here we decide how to distribute the blocks over the threads.
     * We use prime numbers to try to avoid that the grid size becomes
     * a multiple of the number of threads, which would lead to some
     * threads getting "inner" pairs and others getting boundary pairs,
     * which in turn will lead to load imbalance between threads.
     * Set the block size as 5/11/ntask times the average number of cells
     * in a y,z slab. This should ensure a quite uniform distribution
     * of the grid parts of the different threads along all three grid
     * zone boundaries with 3D domain decomposition. At the same time
     * the blocks will not become too small.
     */
    GMX_ASSERT(iGrid.dimensions().numCells[XX] > 0, "Grid can't be empty");
    GMX_ASSERT(numLists > 0, "We need at least one list");
    ci_block = (iGrid.numCells()*ci_block_enum)/(ci_block_denom*iGrid.dimensions().numCells[XX]*numLists);

    const int numAtomsPerCell = iGrid.geometry().numAtomsPerCell;

    /* Ensure the blocks are not too small: avoids cache invalidation */
    if (ci_block*numAtomsPerCell < ci_block_min_atoms)
    {
        ci_block = (ci_block_min_atoms + numAtomsPerCell - 1)/numAtomsPerCell;
    }

    /* Without domain decomposition
     * or with less than 3 blocks per task, divide in nth blocks.
     */
    if (!haveMultipleDomains || numLists*3*ci_block > iGrid.numCells())
    {
        ci_block = (iGrid.numCells() + numLists - 1)/numLists;
    }

    if (ci_block > 1 && (numLists - 1)*ci_block >= iGrid.numCells())
    {
        /* Some threads have no work. Although reducing the block size
         * does not decrease the block count on the first few threads,
         * with GPUs better mixing of "upper" cells that have more empty
         * clusters results in a somewhat lower max load over all threads.
         * Without GPUs the regime of so few atoms per thread is less
         * performance relevant, but with 8-wide SIMD the same reasoning
         * applies, since the pair list uses 4 i-atom "sub-clusters".
         */
        ci_block--;
    }

    return ci_block;
}

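/* Worked example of the block-size heuristic above, with assumed numbers:
 * a grid of 4000 cells and 20 x-columns searched by 8 lists gives
 * (4000*5)/(11*20*8) = 11 cells per block, i.e. roughly 5/11 of a y,z slab
 * per task before the minimum-atoms and few-blocks corrections are applied.
 * The helper below is illustrative only.
 */
gmx_unused static int
illustrateCiBlockSize(int numCells, int numCellsX, int numLists)
{
    const int ci_block_enum  = 5;
    const int ci_block_denom = 11;

    return (numCells*ci_block_enum)/(ci_block_denom*numCellsX*numLists);
}
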
/* Returns the number of bits to right-shift a cluster index to obtain
 * the corresponding force buffer flag index.
 */
static int getBufferFlagShift(int numAtomsPerCluster)
{
    int bufferFlagShift = 0;
    while ((numAtomsPerCluster << bufferFlagShift) < NBNXN_BUFFERFLAG_SIZE)
    {
        bufferFlagShift++;
    }

    return bufferFlagShift;
}

static bool pairlistIsSimple(const NbnxnPairlistCpu gmx_unused &pairlist)
{
    return true;
}

static bool pairlistIsSimple(const NbnxnPairlistGpu gmx_unused &pairlist)
{
    return false;
}

static void
makeClusterListWrapper(NbnxnPairlistCpu       *nbl,
                       const Grid gmx_unused  &iGrid,
                       const int               ci,
                       const Grid             &jGrid,
                       const int               firstCell,
                       const int               lastCell,
                       const bool              excludeSubDiagonal,
                       const nbnxn_atomdata_t *nbat,
                       const real              rlist2,
                       const real              rbb2,
                       const ClusterDistanceKernelType kernelType,
                       int                    *numDistanceChecks)
{
    switch (kernelType)
    {
        case ClusterDistanceKernelType::CpuPlainC:
            makeClusterListSimple(jGrid,
                                  nbl, ci, firstCell, lastCell,
                                  excludeSubDiagonal,
                                  nbat->x().data(),
                                  rlist2, rbb2,
                                  numDistanceChecks);
            break;
#ifdef GMX_NBNXN_SIMD_4XN
        case ClusterDistanceKernelType::CpuSimd_4xM:
            makeClusterListSimd4xn(jGrid,
                                   nbl, ci, firstCell, lastCell,
                                   excludeSubDiagonal,
                                   nbat->x().data(),
                                   rlist2, rbb2,
                                   numDistanceChecks);
            break;
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
        case ClusterDistanceKernelType::CpuSimd_2xMM:
            makeClusterListSimd2xnn(jGrid,
                                    nbl, ci, firstCell, lastCell,
                                    excludeSubDiagonal,
                                    nbat->x().data(),
                                    rlist2, rbb2,
                                    numDistanceChecks);
            break;
#endif
        default:
            GMX_ASSERT(false, "Unhandled kernel type");
    }
}

static void
makeClusterListWrapper(NbnxnPairlistGpu             *nbl,
                       const Grid &gmx_unused        iGrid,
                       const int                     ci,
                       const Grid                   &jGrid,
                       const int                     firstCell,
                       const int                     lastCell,
                       const bool                    excludeSubDiagonal,
                       const nbnxn_atomdata_t       *nbat,
                       const real                    rlist2,
                       const real                    rbb2,
                       ClusterDistanceKernelType gmx_unused kernelType,
                       int                          *numDistanceChecks)
{
    for (int cj = firstCell; cj <= lastCell; cj++)
    {
        make_cluster_list_supersub(iGrid, jGrid,
                                   nbl, ci, cj,
                                   excludeSubDiagonal,
                                   nbat->xstride, nbat->x().data(),
                                   rlist2, rbb2,
                                   numDistanceChecks);
    }
}

static int getNumSimpleJClustersInList(const NbnxnPairlistCpu &nbl)
{
    return nbl.cj.size();
}

static int getNumSimpleJClustersInList(const gmx_unused NbnxnPairlistGpu &nbl)
{
    return 0;
}

static void incrementNumSimpleJClustersInList(NbnxnPairlistCpu *nbl,
                                              int               ncj_old_j)
{
    nbl->ncjInUse += nbl->cj.size() - ncj_old_j;
}

static void incrementNumSimpleJClustersInList(NbnxnPairlistGpu gmx_unused *nbl,
                                              int gmx_unused               ncj_old_j)
{
}

static void checkListSizeConsistency(const NbnxnPairlistCpu &nbl,
                                     const bool              haveFreeEnergy)
{
    GMX_RELEASE_ASSERT(static_cast<size_t>(nbl.ncjInUse) == nbl.cj.size() || haveFreeEnergy,
                       "Without free-energy all cj pair-list entries should be in use. "
                       "Note that subsequent code does not make use of the equality, "
                       "this check is only here to catch bugs");
}

static void checkListSizeConsistency(const NbnxnPairlistGpu gmx_unused &nbl,
                                     bool gmx_unused                    haveFreeEnergy)
{
    /* We currently can not check consistency here */
}

/* Set the buffer flags for newly added entries in the list */
static void setBufferFlags(const NbnxnPairlistCpu &nbl,
                           const int               ncj_old_j,
                           const int               gridj_flag_shift,
                           gmx_bitmask_t          *gridj_flag,
                           const int               th)
{
    if (gmx::ssize(nbl.cj) > ncj_old_j)
    {
        int cbFirst = nbl.cj[ncj_old_j].cj >> gridj_flag_shift;
        int cbLast  = nbl.cj.back().cj >> gridj_flag_shift;
        for (int cb = cbFirst; cb <= cbLast; cb++)
        {
            bitmask_init_bit(&gridj_flag[cb], th);
        }
    }
}

static void setBufferFlags(const NbnxnPairlistGpu gmx_unused &nbl,
                           int gmx_unused                     ncj_old_j,
                           int gmx_unused                     gridj_flag_shift,
                           gmx_bitmask_t gmx_unused          *gridj_flag,
                           int gmx_unused                     th)
{
    GMX_ASSERT(false, "This function should never be called");
}

/* Generates the part of pair-list nbl assigned to our thread */
template <typename T>
static void nbnxn_make_pairlist_part(const Nbnxm::GridSet   &gridSet,
                                     const Grid             &iGrid,
                                     const Grid             &jGrid,
                                     PairsearchWork         *work,
                                     const nbnxn_atomdata_t *nbat,
                                     const t_blocka         &exclusions,
                                     real                    rlist,
                                     const PairlistType      pairlistType,
                                     int                     ci_block,
                                     gmx_bool                bFBufferFlag,
                                     int                     nsubpair_max,
                                     gmx_bool                progBal,
                                     float                   nsubpair_tot_est,
                                     int                     th,
                                     int                     nth,
                                     T                      *nbl,
                                     t_nblist               *nbl_fep)
{
    matrix         box;
    int            na_cj_2log;
    ivec           shp;
    int            ci_b, ci, ci_x, ci_y, ci_xy;
    real           bx0, bx1, by0, by1, bz0, bz1;
    real           bz1_frac;
    real           d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
    int            cxf, cxl, cyf, cyf_x, cyl;
    int            numDistanceChecks;
    int            gridi_flag_shift = 0, gridj_flag_shift = 0;
    gmx_bitmask_t *gridj_flag       = nullptr;
    int            ncj_old_i, ncj_old_j;
    float          rbb2;
    real           rl_fep2          = 0;

    if (jGrid.geometry().isSimple != pairlistIsSimple(*nbl) ||
        iGrid.geometry().isSimple != pairlistIsSimple(*nbl))
    {
        gmx_incons("Grid incompatible with pair-list");
    }

    sync_work(nbl);
    GMX_ASSERT(nbl->na_ci == jGrid.geometry().numAtomsICluster,
               "The cluster sizes in the list and grid should match");
    nbl->na_cj = JClusterSizePerListType[pairlistType];
    na_cj_2log = get_2log(nbl->na_cj);

    nbl->rlist = rlist;

    if (bFBufferFlag)
    {
        /* Determine conversion of clusters to flag blocks */
        gridi_flag_shift = getBufferFlagShift(nbl->na_ci);
        gridj_flag_shift = getBufferFlagShift(nbl->na_cj);

        gridj_flag = work->buffer_flags.flag;
    }

    gridSet.getBox(box);

    const bool haveFep = gridSet.haveFep();

    const real rlist2  = nbl->rlist*nbl->rlist;

    // Select the cluster pair distance kernel type
    const ClusterDistanceKernelType kernelType =
        getClusterDistanceKernelType(pairlistType, *nbat);

    if (haveFep && !pairlistIsSimple(*nbl))
    {
        /* Determine an atom-pair list cut-off distance for FEP atom pairs.
         * We should not simply use rlist, since then we would not have
         * the small, effective buffering of the NxN lists.
         * The buffer is an overestimate, but the resulting cost for pairs
         * beyond rlist is negligible compared to the FEP pairs within rlist.
         */
        rl_fep2 = nbl->rlist + effective_buffer_1x1_vs_MxN(iGrid, jGrid);

        if (debug)
        {
            fprintf(debug, "nbl_fep atom-pair rlist %f\n", rl_fep2);
        }
        rl_fep2 = rl_fep2*rl_fep2;
    }

    const Grid::Dimensions &iGridDims = iGrid.dimensions();
    const Grid::Dimensions &jGridDims = jGrid.dimensions();

    rbb2 = boundingbox_only_distance2(iGridDims, jGridDims, nbl->rlist, pairlistIsSimple(*nbl));

    if (debug)
    {
        fprintf(debug, "nbl bounding box only distance %f\n", std::sqrt(rbb2));
    }

    const bool isIntraGridList = (&iGrid == &jGrid);

    /* Set the shift range */
    for (int d = 0; d < DIM; d++)
    {
        /* Check if we need periodicity shifts.
         * Without PBC or with domain decomposition we don't need them.
         */
        if (d >= ePBC2npbcdim(gridSet.domainSetup().ePBC) ||
            gridSet.domainSetup().haveMultipleDomainsPerDim[d])
        {
            shp[d] = 0;
        }
        else
        {
            const real listRangeCellToCell =
                listRangeForGridCellToGridCell(rlist, iGrid.dimensions(), jGrid.dimensions());
            if (d == XX &&
                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < listRangeCellToCell)
            {
                shp[d] = 2;
            }
            else
            {
                shp[d] = 1;
            }
        }
    }
    const bool bSimple = pairlistIsSimple(*nbl);
    gmx::ArrayRef<const BoundingBox> bb_i;
#if NBNXN_BBXXXX
    gmx::ArrayRef<const float>       pbb_i;
    if (bSimple)
    {
        bb_i  = iGrid.iBoundingBoxes();
    }
    else
    {
        pbb_i = iGrid.packedBoundingBoxes();
    }
#else
    /* We use the normal bounding box format for both grid types */
    bb_i  = iGrid.iBoundingBoxes();
#endif
    gmx::ArrayRef<const BoundingBox1D> bbcz_i  = iGrid.zBoundingBoxes();
    gmx::ArrayRef<const int>           flags_i = iGrid.clusterFlags();
    gmx::ArrayRef<const BoundingBox1D> bbcz_j  = jGrid.zBoundingBoxes();
    int                                cell0_i = iGrid.cellOffset();

    if (debug)
    {
        fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
                iGrid.numCells(), iGrid.numCells()/static_cast<double>(iGrid.numColumns()), ci_block);
    }

    numDistanceChecks = 0;

    const real listRangeBBToJCell2 = gmx::square(listRangeForBoundingBoxToGridCell(rlist, jGrid.dimensions()));

    /* Initially set ci_b and ci to 1 before where we want them to start,
     * as they will both be incremented in next_ci.
     */
    ci_b = -1;
    ci   = th*ci_block - 1;
    ci_x = 0;
    ci_y = 0;
    while (next_ci(iGrid, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
    {
        if (bSimple && flags_i[ci] == 0)
        {
            continue;
        }
        ncj_old_i = getNumSimpleJClustersInList(*nbl);

        d2cx = 0;
        if (!isIntraGridList && shp[XX] == 0)
        {
            if (bSimple)
            {
                bx1 = bb_i[ci].upper.x;
            }
            else
            {
                bx1 = iGridDims.lowerCorner[XX] + (ci_x + 1)*iGridDims.cellSize[XX];
            }
            if (bx1 < jGridDims.lowerCorner[XX])
            {
                d2cx = gmx::square(jGridDims.lowerCorner[XX] - bx1);

                if (d2cx >= listRangeBBToJCell2)
                {
                    continue;
                }
            }
        }

        ci_xy = ci_x*iGridDims.numCells[YY] + ci_y;

        /* Loop over shift vectors in three dimensions */
        for (int tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
        {
            const real shz = tz*box[ZZ][ZZ];

            bz0 = bbcz_i[ci].lower + shz;
            bz1 = bbcz_i[ci].upper + shz;

            if (tz == 0)
            {
                d2z = 0;
            }
            else if (tz < 0)
            {
                d2z = gmx::square(bz1);
            }
            else
            {
                d2z = gmx::square(bz0 - box[ZZ][ZZ]);
            }

            d2z_cx = d2z + d2cx;

            if (d2z_cx >= rlist2)
            {
                continue;
            }

            bz1_frac = bz1/iGrid.numCellsInColumn(ci_xy);
            if (bz1_frac < 0)
            {
                bz1_frac = 0;
            }
            /* The check with bz1_frac close to or larger than 1 comes later */

            for (int ty = -shp[YY]; ty <= shp[YY]; ty++)
            {
                const real shy = ty*box[YY][YY] + tz*box[ZZ][YY];

                if (bSimple)
                {
                    by0 = bb_i[ci].lower.y + shy;
                    by1 = bb_i[ci].upper.y + shy;
                }
                else
                {
                    by0 = iGridDims.lowerCorner[YY] + (ci_y    )*iGridDims.cellSize[YY] + shy;
                    by1 = iGridDims.lowerCorner[YY] + (ci_y + 1)*iGridDims.cellSize[YY] + shy;
                }

                get_cell_range<YY>(by0, by1,
                                   jGridDims,
                                   d2z_cx, rlist,
                                   &cyf, &cyl);

                if (cyf > cyl)
                {
                    continue;
                }

                d2z_cy = d2z;
                if (by1 < jGridDims.lowerCorner[YY])
                {
                    d2z_cy += gmx::square(jGridDims.lowerCorner[YY] - by1);
                }
                else if (by0 > jGridDims.upperCorner[YY])
                {
                    d2z_cy += gmx::square(by0 - jGridDims.upperCorner[YY]);
                }

                for (int tx = -shp[XX]; tx <= shp[XX]; tx++)
                {
                    const int  shift              = XYZ2IS(tx, ty, tz);

                    const bool excludeSubDiagonal = (isIntraGridList && shift == CENTRAL);

                    if (c_pbcShiftBackward && isIntraGridList && shift > CENTRAL)
                    {
                        continue;
                    }

                    const real shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];

                    if (bSimple)
                    {
                        bx0 = bb_i[ci].lower.x + shx;
                        bx1 = bb_i[ci].upper.x + shx;
                    }
                    else
                    {
                        bx0 = iGridDims.lowerCorner[XX] + (ci_x    )*iGridDims.cellSize[XX] + shx;
                        bx1 = iGridDims.lowerCorner[XX] + (ci_x + 1)*iGridDims.cellSize[XX] + shx;
                    }

                    get_cell_range<XX>(bx0, bx1,
                                       jGridDims,
                                       d2z_cy, rlist,
                                       &cxf, &cxl);

                    if (cxf > cxl)
                    {
                        continue;
                    }

                    addNewIEntry(nbl, cell0_i + ci, shift, flags_i[ci]);

                    if ((!c_pbcShiftBackward || excludeSubDiagonal) &&
                        cxf < ci_x)
                    {
                        /* Leave the pairs with i > j.
                         * x is the major index, so skip half of it.
                         */
                        cxf = ci_x;
                    }

                    set_icell_bb(iGrid, ci, shx, shy, shz,
                                 nbl->work.get());

                    icell_set_x(cell0_i + ci, shx, shy, shz,
                                nbat->xstride, nbat->x().data(),
                                kernelType,
                                nbl->work.get());

                    for (int cx = cxf; cx <= cxl; cx++)
                    {
                        d2zx = d2z;
                        if (jGridDims.lowerCorner[XX] + cx*jGridDims.cellSize[XX] > bx1)
                        {
                            d2zx += gmx::square(jGridDims.lowerCorner[XX] + cx*jGridDims.cellSize[XX] - bx1);
                        }
                        else if (jGridDims.lowerCorner[XX] + (cx + 1)*jGridDims.cellSize[XX] < bx0)
                        {
                            d2zx += gmx::square(jGridDims.lowerCorner[XX] + (cx + 1)*jGridDims.cellSize[XX] - bx0);
                        }

                        if (isIntraGridList &&
                            cx == 0 &&
                            (!c_pbcShiftBackward || shift == CENTRAL) &&
                            cyf < ci_y)
                        {
                            /* Leave the pairs with i > j.
                             * Skip half of y when i and j have the same x.
                             */
                            cyf_x = ci_y;
                        }
                        else
                        {
                            cyf_x = cyf;
                        }

                        for (int cy = cyf_x; cy <= cyl; cy++)
                        {
                            const int columnStart = jGrid.firstCellInColumn(cx*jGridDims.numCells[YY] + cy);
                            const int columnEnd   = jGrid.firstCellInColumn(cx*jGridDims.numCells[YY] + cy + 1);

                            d2zxy = d2zx;
                            if (jGridDims.lowerCorner[YY] + cy*jGridDims.cellSize[YY] > by1)
                            {
                                d2zxy += gmx::square(jGridDims.lowerCorner[YY] + cy*jGridDims.cellSize[YY] - by1);
                            }
                            else if (jGridDims.lowerCorner[YY] + (cy + 1)*jGridDims.cellSize[YY] < by0)
                            {
                                d2zxy += gmx::square(jGridDims.lowerCorner[YY] + (cy + 1)*jGridDims.cellSize[YY] - by0);
                            }
                            if (columnStart < columnEnd && d2zxy < listRangeBBToJCell2)
                            {
                                /* To improve efficiency in the common case
                                 * of a homogeneous particle distribution,
                                 * we estimate the index of the middle cell
                                 * in range (midCell). We search down and up
                                 * starting from this index.
                                 *
                                 * Note that the bbcz_j array contains bounds
                                 * for i-clusters, thus for clusters of 4 atoms.
                                 * For the common case where the j-cluster size
                                 * is 8, we could step with a stride of 2,
                                 * but we do not do this because it would
                                 * complicate this code even more.
                                 */
                                int midCell = columnStart + static_cast<int>(bz1_frac*(columnEnd - columnStart));
                                if (midCell >= columnEnd)
                                {
                                    midCell = columnEnd - 1;
                                }

                                d2xy = d2zxy - d2z;

                                /* Find the lowest cell that can possibly
                                 * be within range.
                                 * Check if we hit the bottom of the grid,
                                 * if the j-cell is below the i-cell and if so,
                                 * if it is within range.
                                 */
                                int downTestCell = midCell;
                                while (downTestCell >= columnStart &&
                                       (bbcz_j[downTestCell].upper >= bz0 ||
                                        d2xy + gmx::square(bbcz_j[downTestCell].upper - bz0) < rlist2))
                                {
                                    downTestCell--;
                                }
                                int firstCell = downTestCell + 1;

                                /* Find the highest cell that can possibly
                                 * be within range.
                                 * Check if we hit the top of the grid,
                                 * if the j-cell is above the i-cell and if so,
                                 * if it is within range.
                                 */
                                int upTestCell = midCell + 1;
                                while (upTestCell < columnEnd &&
                                       (bbcz_j[upTestCell].lower <= bz1 ||
                                        d2xy + gmx::square(bbcz_j[upTestCell].lower - bz1) < rlist2))
                                {
                                    upTestCell++;
                                }
                                int lastCell = upTestCell - 1;

#define NBNXN_REFCODE 0
#if NBNXN_REFCODE
                                {
                                    /* Simple reference code, for debugging,
                                     * overrides the more complex code above.
                                     */
                                    firstCell = columnEnd;
                                    lastCell  = -1;
                                    for (int k = columnStart; k < columnEnd; k++)
                                    {
                                        if (d2xy + gmx::square(bbcz_j[k*NNBSBB_D + 1] - bz0) < rlist2 &&
                                            k < firstCell)
                                        {
                                            firstCell = k;
                                        }
                                        if (d2xy + gmx::square(bbcz_j[k*NNBSBB_D] - bz1) < rlist2 &&
                                            k > lastCell)
                                        {
                                            lastCell = k;
                                        }
                                    }
                                }
#endif

                                if (isIntraGridList)
                                {
                                    /* We want each atom/cell pair only once,
                                     * only use cj >= ci.
                                     */
                                    if (!c_pbcShiftBackward || shift == CENTRAL)
                                    {
                                        firstCell = std::max(firstCell, ci);
                                    }
                                }

                                if (firstCell <= lastCell)
                                {
                                    GMX_ASSERT(firstCell >= columnStart && lastCell < columnEnd, "The range should reside within the current grid column");

                                    /* For f buffer flags with simple lists */
                                    ncj_old_j = getNumSimpleJClustersInList(*nbl);

                                    makeClusterListWrapper(nbl,
                                                           iGrid, ci,
                                                           jGrid, firstCell, lastCell,
                                                           excludeSubDiagonal,
                                                           nbat,
                                                           rlist2, rbb2,
                                                           kernelType,
                                                           &numDistanceChecks);

                                    if (bFBufferFlag)
                                    {
                                        setBufferFlags(*nbl, ncj_old_j, gridj_flag_shift,
                                                       gridj_flag, th);
                                    }

                                    incrementNumSimpleJClustersInList(nbl, ncj_old_j);
                                }
                            }
                        }
                    }

                    /* Set the exclusions for this ci list */
                    setExclusionsForIEntry(gridSet,
                                           nbl,
                                           excludeSubDiagonal,
                                           na_cj_2log,
                                           *getOpenIEntry(nbl),
                                           exclusions);

                    if (haveFep)
                    {
                        make_fep_list(gridSet.atomIndices(), nbat, nbl,
                                      excludeSubDiagonal,
                                      getOpenIEntry(nbl),
                                      shx, shy, shz,
                                      rl_fep2,
                                      iGrid, jGrid, nbl_fep);
                    }

                    /* Close this ci list */
                    closeIEntry(nbl,
                                nsubpair_max,
                                progBal, nsubpair_tot_est,
                                th, nth);
                }
            }
        }

        if (bFBufferFlag && getNumSimpleJClustersInList(*nbl) > ncj_old_i)
        {
            bitmask_init_bit(&(work->buffer_flags.flag[(iGrid.cellOffset() + ci) >> gridi_flag_shift]), th);
        }
    }

    work->ndistc = numDistanceChecks;

    checkListSizeConsistency(*nbl, haveFep);

    if (debug)
    {
        fprintf(debug, "number of distance checks %d\n", numDistanceChecks);

        print_nblist_statistics(debug, *nbl, gridSet, rlist);

        if (haveFep)
        {
            fprintf(debug, "nbl FEP list pairs: %d\n", nbl_fep->nrj);
        }
    }
}

static void reduce_buffer_flags(gmx::ArrayRef<PairsearchWork>  searchWork,
                                int                            nsrc,
                                const nbnxn_buffer_flags_t    *dest)
{
    for (int s = 0; s < nsrc; s++)
    {
        gmx_bitmask_t *flag = searchWork[s].buffer_flags.flag;

        for (int b = 0; b < dest->nflag; b++)
        {
            bitmask_union(&(dest->flag[b]), flag[b]);
        }
    }
}

static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
{
    int           nelem, nkeep, ncopy, nred, out;
    gmx_bitmask_t mask_0;

    nelem = 0;
    nkeep = 0;
    ncopy = 0;
    nred  = 0;
    bitmask_init_bit(&mask_0, 0);
    for (int b = 0; b < flags->nflag; b++)
    {
        if (bitmask_is_equal(flags->flag[b], mask_0))
        {
            /* Only flag 0 is set, no copy or reduction required */
            nelem++;
            nkeep++;
        }
        else if (!bitmask_is_zero(flags->flag[b]))
        {
            int c = 0;
            for (out = 0; out < nout; out++)
            {
                if (bitmask_is_set(flags->flag[b], out))
                {
                    c++;
                }
            }
            nelem += c;
            if (c == 1)
            {
                ncopy++;
            }
            else
            {
                nred += c;
            }
        }
    }

    fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
            flags->nflag, nout,
            nelem/static_cast<double>(flags->nflag),
            nkeep/static_cast<double>(flags->nflag),
            ncopy/static_cast<double>(flags->nflag),
            nred/static_cast<double>(flags->nflag));
}

/* Copies the list entries from src to dest when cjStart <= *cjGlobal < cjEnd.
 * *cjGlobal is updated with the cj count in src.
 * When setFlags==true, flag bit t is set in flag for all i and j clusters.
 */
template<bool setFlags>
static void copySelectedListRange(const nbnxn_ci_t * gmx_restrict       srcCi,
                                  const NbnxnPairlistCpu * gmx_restrict src,
                                  NbnxnPairlistCpu * gmx_restrict       dest,
                                  gmx_bitmask_t                        *flag,
                                  int iFlagShift, int jFlagShift, int t)
{
    const int ncj = srcCi->cj_ind_end - srcCi->cj_ind_start;

    dest->ci.push_back(*srcCi);
    dest->ci.back().cj_ind_start = dest->cj.size();
    dest->ci.back().cj_ind_end   = dest->cj.size() + ncj;

    if (setFlags)
    {
        bitmask_init_bit(&flag[srcCi->ci >> iFlagShift], t);
    }

    for (int j = srcCi->cj_ind_start; j < srcCi->cj_ind_end; j++)
    {
        dest->cj.push_back(src->cj[j]);

        if (setFlags)
        {
            /* NOTE: This is relatively expensive, since this
             * operation is done for all elements in the list,
             * whereas at list generation this is done only
             * once for each flag entry.
             */
            bitmask_init_bit(&flag[src->cj[j].cj >> jFlagShift], t);
        }
    }
}

#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ == 7
/* Avoid gcc 7 avx512 loop vectorization bug (actually only needed with -mavx512f) */
#pragma GCC push_options
#pragma GCC optimize ("no-tree-vectorize")
#endif

/* Returns the number of cluster pairs that are in use summed over all lists */
static int countClusterpairs(gmx::ArrayRef<const NbnxnPairlistCpu> pairlists)
{
    /* gcc 7 with -mavx512f can miss the contributions of 16 consecutive
     * elements to the sum calculated in this loop. Above we have disabled
     * loop vectorization to avoid this bug.
     */
    int ncjTotal = 0;
    for (const auto &pairlist : pairlists)
    {
        ncjTotal += pairlist.ncjInUse;
    }
    return ncjTotal;
}

#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ == 7
#pragma GCC pop_options
#endif

/* This routine re-balances the pairlists such that all are nearly equally
 * sized. Only whole i-entries are moved between lists. These are moved
 * between the ends of the lists, such that the buffer reduction cost should
 * not change significantly.
 * Note that all original reduction flags are currently kept. This can lead
 * to reduction of parts of the force buffer that could be avoided. But since
 * the original lists are quite balanced, this will only give minor overhead.
 */
static void rebalanceSimpleLists(gmx::ArrayRef<const NbnxnPairlistCpu> srcSet,
                                 gmx::ArrayRef<NbnxnPairlistCpu>       destSet,
                                 gmx::ArrayRef<PairsearchWork>         searchWork)
{
    const int ncjTotal  = countClusterpairs(srcSet);
    const int numLists  = srcSet.ssize();
    const int ncjTarget = (ncjTotal + numLists - 1)/numLists;

#pragma omp parallel num_threads(numLists)
    {
        int t       = gmx_omp_get_thread_num();

        int cjStart = ncjTarget* t;
        int cjEnd   = ncjTarget*(t + 1);

        /* The destination pair-list for task/thread t */
        NbnxnPairlistCpu &dest = destSet[t];

        clear_pairlist(&dest);
        dest.na_cj = srcSet[0].na_cj;

        /* Note that the flags in the work struct (still) contain flags
         * for all entries that are present in srcSet->nbl[t].
         */
        gmx_bitmask_t *flag       = searchWork[t].buffer_flags.flag;

        int            iFlagShift = getBufferFlagShift(dest.na_ci);
        int            jFlagShift = getBufferFlagShift(dest.na_cj);

        int            cjGlobal   = 0;
        for (int s = 0; s < numLists && cjGlobal < cjEnd; s++)
        {
            const NbnxnPairlistCpu *src = &srcSet[s];

            if (cjGlobal + src->ncjInUse > cjStart)
            {
                for (gmx::index i = 0; i < gmx::ssize(src->ci) && cjGlobal < cjEnd; i++)
                {
                    const nbnxn_ci_t *srcCi = &src->ci[i];
                    int               ncj   = srcCi->cj_ind_end - srcCi->cj_ind_start;
                    if (cjGlobal >= cjStart)
                    {
                        /* If the source list is not our own, we need to set
                         * extra flags (the template bool parameter).
                         */
                        if (s != t)
                        {
                            copySelectedListRange
                            <true>
                                (srcCi, src, &dest,
                                flag, iFlagShift, jFlagShift, t);
                        }
                        else
                        {
                            copySelectedListRange
                            <false>
                                (srcCi, src,
                                &dest, flag, iFlagShift, jFlagShift, t);
                        }
                    }
                    cjGlobal += ncj;
                }
            }
            else
            {
                cjGlobal += src->ncjInUse;
            }
        }

        dest.ncjInUse = dest.cj.size();
    }

#ifndef NDEBUG
    const int ncjTotalNew = countClusterpairs(destSet);
    GMX_RELEASE_ASSERT(ncjTotalNew == ncjTotal, "The total size of the lists before and after rebalancing should match");
#endif
}

/* Returns whether the pairlists are so imbalanced that it is worth rebalancing. */
static bool checkRebalanceSimpleLists(gmx::ArrayRef<const NbnxnPairlistCpu> lists)
{
    int numLists = lists.ssize();
    int ncjMax   = 0;
    int ncjTotal = 0;
    for (int s = 0; s < numLists; s++)
    {
        ncjMax    = std::max(ncjMax, lists[s].ncjInUse);
        ncjTotal += lists[s].ncjInUse;
    }
    if (debug)
    {
        fprintf(debug, "Pair-list ncjMax %d ncjTotal %d\n", ncjMax, ncjTotal);
    }
    /* The rebalancing adds 3% extra time to the search. Heuristically we
     * determined that under common conditions the non-bonded kernel balance
     * improvement will outweigh this when the imbalance is more than 3%.
     * But this will, obviously, depend on search vs kernel time and nstlist.
     */
    const real rebalanceTolerance = 1.03;

    return numLists*ncjMax > ncjTotal*rebalanceTolerance;
}

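/* The 3% tolerance above means rebalancing is triggered only when the largest
 * list exceeds the ideal per-list share by more than the expected cost of the
 * rebalancing itself. Sketch of the same check on plain counts, for
 * illustration only; assumes <algorithm> and <vector> are available.
 */
gmx_unused static bool
illustrateNeedsRebalancing(const std::vector<int> &ncjInUsePerList)
{
    int ncjMax   = 0;
    int ncjTotal = 0;
    for (int ncj : ncjInUsePerList)
    {
        ncjMax    = std::max(ncjMax, ncj);
        ncjTotal += ncj;
    }

    const double rebalanceTolerance = 1.03;

    /* e.g. four lists of 980/1000/1020/1100 pairs: 4*1100 = 4400 > 4100*1.03 ~ 4223 -> rebalance */
    return ncjInUsePerList.size()*ncjMax > ncjTotal*rebalanceTolerance;
}
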
/* Perform a count (linear) sort to sort the smaller lists to the end.
 * This avoids load imbalance on the GPU, as large lists will be
 * scheduled and executed first and the smaller lists later.
 * Load balancing between multi-processors only happens at the end
 * and there smaller lists lead to more effective load balancing.
 * The sorting is done on the cj4 count, not on the actual pair counts.
 * Not only does this make the sort faster, but it also results in
 * better load balancing than using a list sorted on exact load.
 * This function swaps the pointers in the pair list to avoid a copy operation.
 */
static void sort_sci(NbnxnPairlistGpu *nbl)
{
    if (nbl->cj4.size() <= nbl->sci.size())
    {
        /* nsci = 0 or all sci have size 1, sorting won't change the order */
        return;
    }

    NbnxnPairlistGpuWork &work = *nbl->work;

    /* We will distinguish differences up to double the average */
    const int m = (2*nbl->cj4.size())/nbl->sci.size();

    /* Resize work.sci_sort so we can sort into it */
    work.sci_sort.resize(nbl->sci.size());

    std::vector<int> &sort = work.sortBuffer;
    /* Set up m + 1 entries in sort, initialized at 0 */
    sort.clear();
    sort.resize(m + 1, 0);
    /* Count the entries of each size */
    for (const nbnxn_sci_t &sci : nbl->sci)
    {
        int i = std::min(m, sci.numJClusterGroups());
        sort[i]++;
    }
    /* Calculate the offset for each count */
    int s0  = sort[m];
    sort[m] = 0;
    for (int i = m - 1; i >= 0; i--)
    {
        int s1  = sort[i];
        sort[i] = sort[i + 1] + s0;
        s0      = s1;
    }

    /* Sort entries directly into place */
    gmx::ArrayRef<nbnxn_sci_t> sci_sort = work.sci_sort;
    for (const nbnxn_sci_t &sci : nbl->sci)
    {
        int i               = std::min(m, sci.numJClusterGroups());
        sci_sort[sort[i]++] = sci;
    }

    /* Swap the sci pointers so we use the new, sorted list */
    std::swap(nbl->sci, work.sci_sort);
}

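/* sort_sci() above is a counting sort on the cj4 count, writing large
 * i-entries first so the GPU schedules the most expensive lists early.
 * The sketch below sorts plain sizes with the same bucket/offset scheme;
 * it assumes <algorithm> and <vector> are available and is illustration only.
 */
gmx_unused static std::vector<int>
illustrateCountingSortDescending(const std::vector<int> &sizes, int maxBucket)
{
    /* Count entries per (clamped) size */
    std::vector<int> bucketCount(maxBucket + 2, 0);
    for (int size : sizes)
    {
        bucketCount[std::min(size, maxBucket) + 1]++;
    }
    /* Exclusive prefix sum from the largest bucket down: big entries first */
    std::vector<int> offsets(maxBucket + 1, 0);
    int              offset = 0;
    for (int b = maxBucket; b >= 0; b--)
    {
        offsets[b]  = offset;
        offset     += bucketCount[b + 1];
    }
    /* Scatter into place, stable within each bucket */
    std::vector<int> sorted(sizes.size());
    for (int size : sizes)
    {
        const int b          = std::min(size, maxBucket);
        sorted[offsets[b]++] = size;
    }

    return sorted;
}
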
//! Prepares CPU lists produced by the search for dynamic pruning
static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists);

void
PairlistSet::constructPairlists(const Nbnxm::GridSet          &gridSet,
                                gmx::ArrayRef<PairsearchWork>  searchWork,
                                nbnxn_atomdata_t              *nbat,
                                const t_blocka                *excl,
                                const int                      minimumIlistCountForGpuBalancing,
                                t_nrnb                        *nrnb,
                                SearchCycleCounting           *searchCycleCounting)
{
    const real rlist = params_.rlistOuter;

    int        nsubpair_target;
    float      nsubpair_tot_est;
    int        ci_block;
    gmx_bool   progBal;
    int        np_tot, np_noq, np_hlj, nap;

    const int  numLists = (isCpuType_ ? cpuLists_.size() : gpuLists_.size());

    if (debug)
    {
        fprintf(debug, "ns making %d nblists\n", numLists);
    }

    nbat->bUseBufferFlags = (nbat->out.size() > 1);
    /* We should re-init the flags before making the first list */
    if (nbat->bUseBufferFlags && locality_ == InteractionLocality::Local)
    {
        init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms());
    }

    int nzi;
    if (locality_ == InteractionLocality::Local)
    {
        /* Only zone (grid) 0 vs 0 */
        nzi = 1;
    }
    else
    {
        nzi = gridSet.domainSetup().zones->nizone;
    }

    if (!isCpuType_ && minimumIlistCountForGpuBalancing > 0)
    {
        get_nsubpair_target(gridSet, locality_, rlist, minimumIlistCountForGpuBalancing,
                            &nsubpair_target, &nsubpair_tot_est);
    }
    else
    {
        nsubpair_target  = 0;
        nsubpair_tot_est = 0;
    }

    /* Clear all pair-lists */
    for (int th = 0; th < numLists; th++)
    {
        if (isCpuType_)
        {
            clear_pairlist(&cpuLists_[th]);
        }
        else
        {
            clear_pairlist(&gpuLists_[th]);
        }

        if (params_.haveFep)
        {
            clear_pairlist_fep(fepLists_[th].get());
        }
    }

    const gmx_domdec_zones_t *ddZones = gridSet.domainSetup().zones;

    for (int zi = 0; zi < nzi; zi++)
    {
        const Grid &iGrid = gridSet.grids()[zi];

        int zj0;
        int zj1;
        if (locality_ == InteractionLocality::Local)
        {
            zj0 = 0;
            zj1 = 1;
        }
        else
        {
            zj0 = ddZones->izone[zi].j0;
            zj1 = ddZones->izone[zi].j1;
            if (zi == 0)
            {
                zj0++;
            }
        }
        for (int zj = zj0; zj < zj1; zj++)
        {
            const Grid &jGrid = gridSet.grids()[zj];

            if (debug)
            {
                fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
            }

            searchCycleCounting->start(enbsCCsearch);

            ci_block = get_ci_block_size(iGrid, gridSet.domainSetup().haveMultipleDomains, numLists);

            /* With GPU: generate progressively smaller lists for
             * load balancing for local only or non-local with 2 zones.
             */
            progBal = (locality_ == InteractionLocality::Local || ddZones->n <= 2);

#pragma omp parallel for num_threads(numLists) schedule(static)
            for (int th = 0; th < numLists; th++)
            {
                try
                {
                    /* Re-init the thread-local work flag data before making
                     * the first list (not an elegant conditional).
                     */
                    if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0)))
                    {
                        init_buffer_flags(&searchWork[th].buffer_flags, nbat->numAtoms());
                    }

                    if (combineLists_ && th > 0)
                    {
                        GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");

                        clear_pairlist(&gpuLists_[th]);
                    }

                    PairsearchWork &work = searchWork[th];

                    work.cycleCounter.start();

                    t_nblist *fepListPtr = (fepLists_.empty() ? nullptr : fepLists_[th].get());

                    /* Divide the i cells equally over the pairlists */
                    if (isCpuType_)
                    {
                        nbnxn_make_pairlist_part(gridSet, iGrid, jGrid,
                                                 &work, nbat, *excl,
                                                 rlist,
                                                 params_.pairlistType,
                                                 ci_block,
                                                 nbat->bUseBufferFlags,
                                                 nsubpair_target,
                                                 progBal, nsubpair_tot_est,
                                                 th, numLists,
                                                 &cpuLists_[th],
                                                 fepListPtr);
                    }
                    else
                    {
                        nbnxn_make_pairlist_part(gridSet, iGrid, jGrid,
                                                 &work, nbat, *excl,
                                                 rlist,
                                                 params_.pairlistType,
                                                 ci_block,
                                                 nbat->bUseBufferFlags,
                                                 nsubpair_target,
                                                 progBal, nsubpair_tot_est,
                                                 th, numLists,
                                                 &gpuLists_[th],
                                                 fepListPtr);
                    }

                    work.cycleCounter.stop();
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }
            searchCycleCounting->stop(enbsCCsearch);

            np_tot = 0;
            np_noq = 0;
            np_hlj = 0;
            for (int th = 0; th < numLists; th++)
            {
                inc_nrnb(nrnb, eNR_NBNXN_DIST2, searchWork[th].ndistc);

                if (isCpuType_)
                {
                    const NbnxnPairlistCpu &nbl = cpuLists_[th];
                    np_tot += nbl.cj.size();
                    np_noq += nbl.work->ncj_noq;
                    np_hlj += nbl.work->ncj_hlj;
                }
                else
                {
                    const NbnxnPairlistGpu &nbl = gpuLists_[th];
                    /* This count ignores potential subsequent pair pruning */
                    np_tot += nbl.nci_tot;
                }
            }
            if (isCpuType_)
            {
                nap = cpuLists_[0].na_ci*cpuLists_[0].na_cj;
            }
            else
            {
                nap = gmx::square(gpuLists_[0].na_ci);
            }
            natpair_ljq_ = (np_tot - np_noq)*nap - np_hlj*nap/2;
            natpair_lj_  = np_noq*nap;
            natpair_q_   = np_hlj*nap/2;

            if (combineLists_ && numLists > 1)
            {
                GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");

                searchCycleCounting->start(enbsCCcombine);

                combine_nblists(gmx::constArrayRefFromArray(&gpuLists_[1], numLists - 1),
                                &gpuLists_[0]);

                searchCycleCounting->stop(enbsCCcombine);
            }
        }
    }

    if (isCpuType_)
    {
        if (numLists > 1 && checkRebalanceSimpleLists(cpuLists_))
        {
            rebalanceSimpleLists(cpuLists_, cpuListsWork_, searchWork);

            /* Swap the sets of pair lists */
            cpuLists_.swap(cpuListsWork_);
        }
    }
    else
    {
        /* Sort the entries on size, large ones first */
        if (combineLists_ || gpuLists_.size() == 1)
        {
            sort_sci(&gpuLists_[0]);
        }
        else
        {
#pragma omp parallel for num_threads(numLists) schedule(static)
            for (int th = 0; th < numLists; th++)
            {
                try
                {
                    sort_sci(&gpuLists_[th]);
                }
                GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
            }
        }
    }

    if (nbat->bUseBufferFlags)
    {
        reduce_buffer_flags(searchWork, numLists, &nbat->buffer_flags);
    }

    if (gridSet.haveFep())
    {
        /* Balance the free-energy lists over all the threads */
        balance_fep_lists(fepLists_, searchWork);
    }

    if (isCpuType_)
    {
        /* This is a fresh list, so not pruned, stored using ci.
         * ciOuter is invalid at this point.
         */
        GMX_ASSERT(cpuLists_[0].ciOuter.empty(), "ciOuter is invalid so it should be empty");
    }

    /* If we have more than one list, they either got rebalanced (CPU)
     * or combined (GPU), so we should dump the final result to debug.
     */
    if (debug)
    {
        if (isCpuType_ && cpuLists_.size() > 1)
        {
            for (auto &cpuList : cpuLists_)
            {
                print_nblist_statistics(debug, cpuList, gridSet, rlist);
            }
        }
        else if (!isCpuType_ && gpuLists_.size() > 1)
        {
            print_nblist_statistics(debug, gpuLists_[0], gridSet, rlist);
        }
    }

    if (debug)
    {
        if (gmx_debug_at)
        {
            if (isCpuType_)
            {
                for (auto &cpuList : cpuLists_)
                {
                    print_nblist_ci_cj(debug, cpuList);
                }
            }
            else
            {
                print_nblist_sci_cj(debug, gpuLists_[0]);
            }
        }

        if (nbat->bUseBufferFlags)
        {
            print_reduction_cost(&nbat->buffer_flags, numLists);
        }
    }

    if (params_.useDynamicPruning && isCpuType_)
    {
        prepareListsForDynamicPruning(cpuLists_);
    }
}

void
PairlistSets::construct(const InteractionLocality  iLocality,
                        PairSearch                *pairSearch,
                        nbnxn_atomdata_t          *nbat,
                        const t_blocka            *excl,
                        const int64_t              step,
                        t_nrnb                    *nrnb)
{
    pairlistSet(iLocality).constructPairlists(pairSearch->gridSet(), pairSearch->work(),
                                              nbat, excl, minimumIlistCountForGpuBalancing_,
                                              nrnb, &pairSearch->cycleCounting_);

    if (iLocality == Nbnxm::InteractionLocality::Local)
    {
        outerListCreationStep_ = step;
    }
    else
    {
        GMX_RELEASE_ASSERT(outerListCreationStep_ == step,
                           "Outer list should be created at the same step as the inner list");
    }

    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
    if (iLocality == InteractionLocality::Local)
    {
        pairSearch->cycleCounting_.searchCount_++;
    }
    if (pairSearch->cycleCounting_.recordCycles_ &&
        (!pairSearch->gridSet().domainSetup().haveMultipleDomains || iLocality == InteractionLocality::NonLocal) &&
        pairSearch->cycleCounting_.searchCount_ % 100 == 0)
    {
        pairSearch->cycleCounting_.printCycles(stderr, pairSearch->work());
    }
}

void
nonbonded_verlet_t::constructPairlist(const Nbnxm::InteractionLocality  iLocality,
                                      const t_blocka                   *excl,
                                      int64_t                           step,
                                      t_nrnb                           *nrnb)
{
    pairlistSets_->construct(iLocality, pairSearch_.get(), nbat.get(), excl,
                             step, nrnb);

    if (useGpu())
    {
        /* Launch the transfer of the pairlist to the GPU.
         *
         * NOTE: The launch overhead is currently not timed separately
         */
        Nbnxm::gpu_init_pairlist(gpu_nbv,
                                 pairlistSets().pairlistSet(iLocality).gpuList(),
                                 iLocality);
    }
}

static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists)
{
    /* TODO: Restructure the lists so we have actual outer and inner
     *       list objects so we can set a single pointer instead of
     *       swapping several pointers.
     */

    for (auto &list : lists)
    {
        /* The search produced a list in ci/cj.
         * Swap the list pointers so that the outer list ends up in
         * ciOuter,cjOuter and we can prune that to get an inner list in ci/cj.
         */
        GMX_RELEASE_ASSERT(list.ciOuter.empty() && list.cjOuter.empty(),
                           "The outer lists should be empty before preparation");

        std::swap(list.ci, list.ciOuter);
        std::swap(list.cj, list.cjOuter);
    }
}