/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
38 #include "nbnxn_atomdata.h"
47 #include "thread_mpi/atomic.h"
49 #include "gromacs/math/functions.h"
50 #include "gromacs/math/utilities.h"
51 #include "gromacs/math/vec.h"
52 #include "gromacs/mdlib/gmx_omp_nthreads.h"
53 #include "gromacs/mdlib/nb_verlet.h"
54 #include "gromacs/mdlib/nbnxn_consts.h"
55 #include "gromacs/mdlib/nbnxn_internal.h"
56 #include "gromacs/mdlib/nbnxn_search.h"
57 #include "gromacs/mdlib/nbnxn_util.h"
58 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
59 #include "gromacs/mdtypes/mdatom.h"
60 #include "gromacs/pbcutil/ishift.h"
61 #include "gromacs/simd/simd.h"
62 #include "gromacs/timing/wallcycle.h"
63 #include "gromacs/utility/exceptions.h"
64 #include "gromacs/utility/fatalerror.h"
65 #include "gromacs/utility/gmxomp.h"
66 #include "gromacs/utility/logger.h"
67 #include "gromacs/utility/smalloc.h"
68 #include "gromacs/utility/strconvert.h"
69 #include "gromacs/utility/stringutil.h"
using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
/* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */
void nbnxn_alloc_aligned(void **ptr, size_t nbytes)
{
    *ptr = save_malloc_aligned("ptr", __FILE__, __LINE__, nbytes, 1, NBNXN_MEM_ALIGN);
}

/* Free function for memory allocated with nbnxn_alloc_aligned */
void nbnxn_free_aligned(void *ptr)
{
    sfree_aligned(ptr);
}

/* Reallocation wrapper function for nbnxn data structures */
void nbnxn_realloc_void(void **ptr,
                        size_t nbytes_copy, size_t nbytes_new,
                        nbnxn_alloc_t *ma,
                        nbnxn_free_t  *mf)
{
    void *ptr_new;

    ma(&ptr_new, nbytes_new);

    if (nbytes_new > 0 && ptr_new == nullptr)
    {
        gmx_fatal(FARGS, "Allocation of %zu bytes failed", nbytes_new);
    }

    if (nbytes_copy > 0)
    {
        if (nbytes_new < nbytes_copy)
        {
            gmx_incons("In nbnxn_realloc_void: new size less than copy size");
        }
        memcpy(ptr_new, *ptr, nbytes_copy);
    }
    if (*ptr != nullptr)
    {
        mf(*ptr);
    }
    *ptr = ptr_new;
}

/* Reallocate the nbnxn_atomdata_t for a size of n atoms */
void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat, int n)
{
    GMX_ASSERT(nbat->nalloc >= nbat->natoms, "We should have at least as many elements allocated as there are set");

    nbnxn_realloc_void(reinterpret_cast<void **>(&nbat->type),
                       nbat->natoms*sizeof(*nbat->type),
                       n*sizeof(*nbat->type),
                       nbat->alloc, nbat->free);
    nbnxn_realloc_void(reinterpret_cast<void **>(&nbat->lj_comb),
                       nbat->natoms*2*sizeof(*nbat->lj_comb),
                       n*2*sizeof(*nbat->lj_comb),
                       nbat->alloc, nbat->free);
    if (nbat->XFormat != nbatXYZQ)
    {
        nbnxn_realloc_void(reinterpret_cast<void **>(&nbat->q),
                           nbat->natoms*sizeof(*nbat->q),
                           n*sizeof(*nbat->q),
                           nbat->alloc, nbat->free);
    }
    if (nbat->nenergrp > 1)
    {
        nbnxn_realloc_void(reinterpret_cast<void **>(&nbat->energrp),
                           nbat->natoms/nbat->na_c*sizeof(*nbat->energrp),
                           n/nbat->na_c*sizeof(*nbat->energrp),
                           nbat->alloc, nbat->free);
    }
    nbnxn_realloc_void(reinterpret_cast<void **>(&nbat->x),
                       nbat->natoms*nbat->xstride*sizeof(*nbat->x),
                       n*nbat->xstride*sizeof(*nbat->x),
                       nbat->alloc, nbat->free);
    for (int t = 0; t < nbat->nout; t++)
    {
        /* Allocate one element extra for possible signaling with GPUs */
        nbnxn_realloc_void(reinterpret_cast<void **>(&nbat->out[t].f),
                           nbat->natoms*nbat->fstride*sizeof(*nbat->out[t].f),
                           n*nbat->fstride*sizeof(*nbat->out[t].f),
                           nbat->alloc, nbat->free);
    }
    nbat->nalloc = n;
}

/* Initializes an nbnxn_atomdata_output_t data structure */
static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out,
                                       int nb_kernel_type,
                                       int nenergrp, int stride,
                                       nbnxn_alloc_t *ma)
{
    ma(reinterpret_cast<void **>(&out->fshift), SHIFTS*DIM*sizeof(*out->fshift));
    out->nV = nenergrp*nenergrp;
    ma(reinterpret_cast<void **>(&out->Vvdw), out->nV*sizeof(*out->Vvdw));
    ma(reinterpret_cast<void **>(&out->Vc), out->nV*sizeof(*out->Vc));

    if (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
        nb_kernel_type == nbnxnk4xN_SIMD_2xNN)
    {
        int cj_size = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
        out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size;
        ma(reinterpret_cast<void **>(&out->VSvdw), out->nVS*sizeof(*out->VSvdw));
        ma(reinterpret_cast<void **>(&out->VSc), out->nVS*sizeof(*out->VSc));
    }
    else
    {
        out->nVS = 0;
    }
}

static void copy_int_to_nbat_int(const int *a, int na, int na_round,
                                 const int *in, int fill, int *innb)
{
    int i, j;

    j = 0;
    for (i = 0; i < na; i++)
    {
        innb[j++] = in[a[i]];
    }
    /* Complete the partially filled last cell with fill */
    for (; i < na_round; i++)
    {
        innb[j++] = fill;
    }
}

void copy_rvec_to_nbat_real(const int *a, int na, int na_round,
                            const rvec *x, int nbatFormat,
                            real *xnb, int a0)
{
    /* We complete partially filled cells, can only be the last one in each
     * column, with coordinates farAway. The actual coordinate value does
     * not influence the results, since these filler particles do not interact.
     * Clusters with normal atoms + fillers have a bounding box based only
     * on the coordinates of the atoms. Clusters with only fillers have as
     * the bounding box the coordinates of the first filler. Such clusters
     * are not considered as i-entries, but they are considered as j-entries.
     * So for performance it is better to have their bounding boxes far away,
     * such that filler only clusters don't end up in the pair list.
     */
    const real farAway = -1000000;
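    /* Editor's note (concrete example, not part of the original source): with
     * a cluster size of 4, a column whose last cluster holds e.g. 2 real atoms
     * gets 2 filler entries placed at (farAway, farAway, farAway), so a
     * filler-only cluster can never fall within the pair-list cut-off of a
     * real cluster. */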

    int i, j, c;

    switch (nbatFormat)
    {
        case nbatXYZ:
            j = a0*STRIDE_XYZ;
            for (i = 0; i < na; i++)
            {
                xnb[j++] = x[a[i]][XX];
                xnb[j++] = x[a[i]][YY];
                xnb[j++] = x[a[i]][ZZ];
            }
            /* Complete the partially filled last cell with farAway elements */
            for (; i < na_round; i++)
            {
                xnb[j++] = farAway;
                xnb[j++] = farAway;
                xnb[j++] = farAway;
            }
            break;
        case nbatXYZQ:
            j = a0*STRIDE_XYZQ;
            for (i = 0; i < na; i++)
            {
                xnb[j++] = x[a[i]][XX];
                xnb[j++] = x[a[i]][YY];
                xnb[j++] = x[a[i]][ZZ];
                j++;
            }
            /* Complete the partially filled last cell with farAway elements */
            for (; i < na_round; i++)
            {
                xnb[j++] = farAway;
                xnb[j++] = farAway;
                xnb[j++] = farAway;
                j++;
            }
            break;
        case nbatX4:
            j = atom_to_x_index<c_packX4>(a0);
            c = a0 & (c_packX4-1);
            for (i = 0; i < na; i++)
            {
                xnb[j+XX*c_packX4] = x[a[i]][XX];
                xnb[j+YY*c_packX4] = x[a[i]][YY];
                xnb[j+ZZ*c_packX4] = x[a[i]][ZZ];
                j++;
                c++;
                if (c == c_packX4)
                {
                    j += (DIM-1)*c_packX4;
                    c  = 0;
                }
            }
            /* Complete the partially filled last cell with farAway elements */
            for (; i < na_round; i++)
            {
                xnb[j+XX*c_packX4] = farAway;
                xnb[j+YY*c_packX4] = farAway;
                xnb[j+ZZ*c_packX4] = farAway;
                j++;
                c++;
                if (c == c_packX4)
                {
                    j += (DIM-1)*c_packX4;
                    c  = 0;
                }
            }
            break;
        case nbatX8:
            j = atom_to_x_index<c_packX8>(a0);
            c = a0 & (c_packX8 - 1);
            for (i = 0; i < na; i++)
            {
                xnb[j+XX*c_packX8] = x[a[i]][XX];
                xnb[j+YY*c_packX8] = x[a[i]][YY];
                xnb[j+ZZ*c_packX8] = x[a[i]][ZZ];
                j++;
                c++;
                if (c == c_packX8)
                {
                    j += (DIM-1)*c_packX8;
                    c  = 0;
                }
            }
            /* Complete the partially filled last cell with farAway elements */
            for (; i < na_round; i++)
            {
                xnb[j+XX*c_packX8] = farAway;
                xnb[j+YY*c_packX8] = farAway;
                xnb[j+ZZ*c_packX8] = farAway;
                j++;
                c++;
                if (c == c_packX8)
                {
                    j += (DIM-1)*c_packX8;
                    c  = 0;
                }
            }
            break;
        default:
            gmx_incons("Unsupported nbnxn_atomdata_t format");
    }
}

/* Stores the LJ parameter data in a format convenient for different kernels */
static void set_lj_parameter_data(nbnxn_atomdata_t *nbat, gmx_bool bSIMD)
{
    real c6, c12;

    int  nt = nbat->ntype;

    if (bSIMD)
    {
        /* nbfp_aligned stores two parameters using the stride most suitable
         * for the present SIMD architecture, as specified by the constant
         * c_simdBestPairAlignment from the SIMD header.
         * There's a slight inefficiency in allocating and initializing nbfp_aligned
         * when it might not be used, but introducing the conditional code is not
         * really worth it.
         */
        nbat->alloc(reinterpret_cast<void **>(&nbat->nbfp_aligned),
                    nt*nt*c_simdBestPairAlignment*sizeof(*nbat->nbfp_aligned));
        for (int i = 0; i < nt; i++)
        {
            for (int j = 0; j < nt; j++)
            {
                nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+0] = nbat->nbfp[(i*nt+j)*2+0];
                nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+1] = nbat->nbfp[(i*nt+j)*2+1];
                if (c_simdBestPairAlignment > 2)
                {
                    nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+2] = 0;
                    nbat->nbfp_aligned[(i*nt+j)*c_simdBestPairAlignment+3] = 0;
                }
            }
        }
    }

    /* We use combination rule data for SIMD combination rule kernels
     * and with LJ-PME kernels. We then only need parameters per atom type,
     * not per pair of atom types.
     */
    switch (nbat->comb_rule)
    {
        case ljcrGEOM:
            nbat->comb_rule = ljcrGEOM;

            for (int i = 0; i < nt; i++)
            {
                /* Store the sqrt of the diagonal from the nbfp matrix */
                nbat->nbfp_comb[i*2  ] = std::sqrt(nbat->nbfp[(i*nt+i)*2  ]);
                nbat->nbfp_comb[i*2+1] = std::sqrt(nbat->nbfp[(i*nt+i)*2+1]);
            }
            break;
        case ljcrLB:
            for (int i = 0; i < nt; i++)
            {
                /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */
                c6  = nbat->nbfp[(i*nt+i)*2  ];
                c12 = nbat->nbfp[(i*nt+i)*2+1];
                if (c6 > 0 && c12 > 0)
                {
                    /* We store 0.5*2^1/6*sigma and sqrt(4*3*eps),
                     * so we get 6*C6 and 12*C12 after combining.
                     */
                    nbat->nbfp_comb[i*2  ] = 0.5*gmx::sixthroot(c12/c6);
                    nbat->nbfp_comb[i*2+1] = std::sqrt(c6*c6/c12);
                }
                else
                {
                    nbat->nbfp_comb[i*2  ] = 0;
                    nbat->nbfp_comb[i*2+1] = 0;
                }
            }
            break;
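            /* Editor's note (worked check, not in the original source): with
             * s = 0.5*2^(1/6)*sigma and e = sqrt(12*eps) stored above,
             *   s_i + s_j             = 2^(1/6)*sigma_ij  (LB arithmetic mean)
             *   e_i * e_j             = 12*eps_ij         (LB geometric mean)
             *   (s_i+s_j)^6  * e_i*e_j = 24*eps*sigma^6   = 6*C6_ij
             *   (s_i+s_j)^12 * e_i*e_j = 48*eps*sigma^12  = 12*C12_ij,
             * matching the 6*C6/12*C12 convention used in nbat->nbfp. */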
        case ljcrNONE:
            /* We always store the full matrix (see code above) */
            break;
        default:
            gmx_incons("Unknown combination rule");
    }
}

static void
nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat)
{
    const int simd_width = GMX_SIMD_REAL_WIDTH;
    int       simd_excl_size;
    /* Set the diagonal cluster pair exclusion mask setup data.
     * In the kernel we check 0 < j - i to generate the masks.
     * Here we store j - i for generating the mask for the first i,
     * we subtract 0.5 to avoid rounding issues.
     * In the kernel we can subtract 1 to generate the subsequent mask.
     */
    int       simd_4xn_diag_size;

    simd_4xn_diag_size = std::max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
    snew_aligned(nbat->simd_4xn_diagonal_j_minus_i, simd_4xn_diag_size, NBNXN_MEM_ALIGN);
    for (int j = 0; j < simd_4xn_diag_size; j++)
    {
        nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5;
    }
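    /* Editor's illustration (not in the original source, assuming
     * GMX_SIMD_REAL_WIDTH == 4): the stored values are {-0.5, 0.5, 1.5, 2.5},
     * so for the first i the kernel's "0 < j - i" test excludes only j == 0,
     * and subtracting 1 per subsequent i shifts the diagonal accordingly. */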

    snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i, simd_width, NBNXN_MEM_ALIGN);
    for (int j = 0; j < simd_width/2; j++)
    {
        /* The j-cluster size is half the SIMD width */
        nbat->simd_2xnn_diagonal_j_minus_i[j]              = j - 0.5;
        /* The next half of the SIMD width is for i + 1 */
        nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5;
    }

    /* We use up to 32 bits for exclusion masking.
     * The same masks are used for the 4xN and 2x(N+N) kernels.
     * The masks are read either into integer SIMD registers or into
     * real SIMD registers (together with a cast).
     * In single precision this means the real and integer SIMD registers
     * are of equal size.
     */
    simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width;
#if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL
    snew_aligned(nbat->simd_exclusion_filter64, simd_excl_size, NBNXN_MEM_ALIGN);
#else
    snew_aligned(nbat->simd_exclusion_filter, simd_excl_size, NBNXN_MEM_ALIGN);
#endif

    for (int j = 0; j < simd_excl_size; j++)
    {
        /* Set the consecutive bits for masking pair exclusions */
#if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL
        nbat->simd_exclusion_filter64[j] = (1U << j);
#else
        nbat->simd_exclusion_filter[j]   = (1U << j);
#endif
    }

#if !GMX_SIMD_HAVE_LOGICAL && !GMX_SIMD_HAVE_INT32_LOGICAL
    // If the SIMD implementation has no bitwise logical operation support
    // whatsoever we cannot use the normal masking. Instead,
    // we generate a vector of all 2^4 possible ways an i atom
    // interacts with its 4 j atoms. Each array entry contains
    // GMX_SIMD_REAL_WIDTH values that are read with a single aligned SIMD load.
    // Since there is no logical value representation in this case, we use
    // any nonzero value to indicate 'true', while zero means 'false'.
    // This can then be converted to a SIMD boolean internally in the SIMD
    // module by comparing to zero.
    // Each array entry encodes how this i atom will interact with the 4 j atoms.
    // Matching code exists in set_ci_top_excls() to generate indices into this array.
    // Those indices are used in the kernels.

    simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
    const real simdFalse = 0.0;
    const real simdTrue  = 1.0;
    real      *simd_interaction_array;

    snew_aligned(simd_interaction_array, simd_excl_size * GMX_SIMD_REAL_WIDTH, NBNXN_MEM_ALIGN);
    for (int j = 0; j < simd_excl_size; j++)
    {
        int index = j * GMX_SIMD_REAL_WIDTH;
        for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
        {
            simd_interaction_array[index + i] = (j & (1 << i)) ? simdTrue : simdFalse;
        }
    }
    nbat->simd_interaction_array = simd_interaction_array;
#endif
}

/* Initializes an nbnxn_atomdata_t data structure */
void nbnxn_atomdata_init(const gmx::MDLogger &mdlog,
                         nbnxn_atomdata_t *nbat,
                         int nb_kernel_type,
                         int enbnxninitcombrule,
                         int ntype, const real *nbfp,
                         int n_energygroups,
                         int nout,
                         nbnxn_alloc_t *alloc,
                         nbnxn_free_t  *free)
{
    int      nth;
    real     c6, c12, tol;
    char    *ptr;
    gmx_bool simple, bCombGeom, bCombLB, bSIMD;

    if (alloc == nullptr)
    {
        nbat->alloc = nbnxn_alloc_aligned;
    }
    else
    {
        nbat->alloc = alloc;
    }
    if (free == nullptr)
    {
        nbat->free = nbnxn_free_aligned;
    }
    else
    {
        nbat->free = free;
    }

    if (debug)
    {
        fprintf(debug, "There are %d atom types in the system, adding one for nbnxn_atomdata_t\n", ntype);
    }
    nbat->ntype = ntype + 1;
    nbat->alloc(reinterpret_cast<void **>(&nbat->nbfp),
                nbat->ntype*nbat->ntype*2*sizeof(*nbat->nbfp));
    nbat->alloc(reinterpret_cast<void **>(&nbat->nbfp_comb), nbat->ntype*2*sizeof(*nbat->nbfp_comb));

    /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed)
     * force-field floating point parameters.
     */
    tol = 1e-5;
    ptr = getenv("GMX_LJCOMB_TOL");
    if (ptr != nullptr)
    {
        double dbl;

        sscanf(ptr, "%lf", &dbl);
        tol = dbl;
    }
    bCombGeom = TRUE;
    bCombLB   = TRUE;

    /* Temporarily fill nbat->nbfp_comb with sigma and epsilon
     * to check for the LB rule.
     */
    for (int i = 0; i < ntype; i++)
    {
        c6  = nbfp[(i*ntype+i)*2  ]/6.0;
        c12 = nbfp[(i*ntype+i)*2+1]/12.0;
        if (c6 > 0 && c12 > 0)
        {
            nbat->nbfp_comb[i*2  ] = gmx::sixthroot(c12/c6);
            nbat->nbfp_comb[i*2+1] = 0.25*c6*c6/c12;
        }
        else if (c6 == 0 && c12 == 0)
        {
            nbat->nbfp_comb[i*2  ] = 0;
            nbat->nbfp_comb[i*2+1] = 0;
        }
        else
        {
            /* Can not use LB rule with only dispersion or repulsion */
            bCombLB = FALSE;
        }
    }
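    /* Editor's note (worked check, not in the original source): the nbfp
     * diagonal stores 6*C6 and 12*C12, so with C6 = 4*eps*sigma^6 and
     * C12 = 4*eps*sigma^12 the expressions above recover
     *   sixthroot(c12/c6) = sixthroot(sigma^6) = sigma   and
     *   0.25*c6*c6/c12    = 0.25*(4*eps*sigma^6)^2/(4*eps*sigma^12) = eps,
     * which is what the Lorentz-Berthelot check below compares against. */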

    for (int i = 0; i < nbat->ntype; i++)
    {
        for (int j = 0; j < nbat->ntype; j++)
        {
            if (i < ntype && j < ntype)
            {
                /* fr->nbfp has been updated, so that array too now stores c6/c12 including
                 * the 6.0/12.0 prefactors to save 2 flops in the most common case (force-only).
                 */
                c6  = nbfp[(i*ntype+j)*2  ];
                c12 = nbfp[(i*ntype+j)*2+1];
                nbat->nbfp[(i*nbat->ntype+j)*2  ] = c6;
                nbat->nbfp[(i*nbat->ntype+j)*2+1] = c12;

                /* Compare 6*C6 and 12*C12 for geometric combination rule */
                bCombGeom = bCombGeom &&
                    gmx_within_tol(c6*c6, nbfp[(i*ntype+i)*2  ]*nbfp[(j*ntype+j)*2  ], tol) &&
                    gmx_within_tol(c12*c12, nbfp[(i*ntype+i)*2+1]*nbfp[(j*ntype+j)*2+1], tol);

                /* Compare C6 and C12 for Lorentz-Berthelot combination rule */
                c6  /= 6.0;
                c12 /= 12.0;
                bCombLB = bCombLB &&
                    ((c6 == 0 && c12 == 0 &&
                      (nbat->nbfp_comb[i*2+1] == 0 || nbat->nbfp_comb[j*2+1] == 0)) ||
                     (c6 > 0 && c12 > 0 &&
                      gmx_within_tol(gmx::sixthroot(c12/c6),
                                     0.5*(nbat->nbfp_comb[i*2]+nbat->nbfp_comb[j*2]), tol) &&
                      gmx_within_tol(0.25*c6*c6/c12, std::sqrt(nbat->nbfp_comb[i*2+1]*nbat->nbfp_comb[j*2+1]), tol)));
            }
            else
            {
                /* Add zero parameters for the additional dummy atom type */
                nbat->nbfp[(i*nbat->ntype+j)*2  ] = 0;
                nbat->nbfp[(i*nbat->ntype+j)*2+1] = 0;
            }
        }
    }
    if (debug)
    {
        fprintf(debug, "Combination rules: geometric %s Lorentz-Berthelot %s\n",
                gmx::boolToString(bCombGeom), gmx::boolToString(bCombLB));
    }

    simple = nbnxn_kernel_pairlist_simple(nb_kernel_type);

    switch (enbnxninitcombrule)
    {
        case enbnxninitcombruleDETECT:
            /* We prefer the geometric combination rule,
             * as that gives a slightly faster kernel than the LB rule.
             */
            if (bCombGeom)
            {
                nbat->comb_rule = ljcrGEOM;
            }
            else if (bCombLB)
            {
                nbat->comb_rule = ljcrLB;
            }
            else
            {
                nbat->comb_rule = ljcrNONE;

                nbat->free(nbat->nbfp_comb);
            }

            {
                std::string mesg;
                if (nbat->comb_rule == ljcrNONE)
                {
                    mesg = "Using full Lennard-Jones parameter combination matrix";
                }
                else
                {
                    mesg = gmx::formatString("Using %s Lennard-Jones combination rule",
                                             nbat->comb_rule == ljcrGEOM ? "geometric" : "Lorentz-Berthelot");
                }
                GMX_LOG(mdlog.info).asParagraph().appendText(mesg);
            }
            break;
        case enbnxninitcombruleGEOM:
            nbat->comb_rule = ljcrGEOM;
            break;
        case enbnxninitcombruleLB:
            nbat->comb_rule = ljcrLB;
            break;
        case enbnxninitcombruleNONE:
            nbat->comb_rule = ljcrNONE;

            nbat->free(nbat->nbfp_comb);
            break;
        default:
            gmx_incons("Unknown enbnxninitcombrule");
    }

    bSIMD = (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
             nb_kernel_type == nbnxnk4xN_SIMD_2xNN);

    set_lj_parameter_data(nbat, bSIMD);

    nbat->natoms  = 0;
    nbat->type    = nullptr;
    nbat->lj_comb = nullptr;
    if (simple)
    {
        int pack_x;

        if (bSIMD)
        {
            pack_x = std::max(NBNXN_CPU_CLUSTER_I_SIZE,
                              nbnxn_kernel_to_cluster_j_size(nb_kernel_type));
            switch (pack_x)
            {
                case 4:
                    nbat->XFormat = nbatX4;
                    break;
                case 8:
                    nbat->XFormat = nbatX8;
                    break;
                default:
                    gmx_incons("Unsupported packing width");
            }
        }
        else
        {
            nbat->XFormat = nbatXYZ;
        }

        nbat->FFormat = nbat->XFormat;
    }
    else
    {
        nbat->XFormat = nbatXYZQ;
        nbat->FFormat = nbatXYZ;
    }
    nbat->q        = nullptr;
    nbat->nenergrp = n_energygroups;
    if (!simple)
    {
        // We now check for energy groups already when starting mdrun
        GMX_RELEASE_ASSERT(n_energygroups == 1, "GPU kernels do not support energy groups");
    }
    /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
    if (nbat->nenergrp > 64)
    {
        gmx_fatal(FARGS, "With NxN kernels not more than 64 energy groups are supported\n");
    }
    nbat->neg_2log = 1;
    while (nbat->nenergrp > (1<<nbat->neg_2log))
    {
        nbat->neg_2log++;
    }
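    /* Editor's note (not in the original source): the loop above rounds the
     * group count up to the next power of two; e.g. nenergrp = 3 gives
     * neg_2log = 2, i.e. 2 bits per group index in the packed energrp ints. */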
    nbat->energrp = nullptr;
    nbat->alloc(reinterpret_cast<void **>(&nbat->shift_vec), SHIFTS*sizeof(*nbat->shift_vec));
    nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
    nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);

    nbnxn_atomdata_init_simple_exclusion_masks(nbat);

    /* Initialize the output data structures */
    nbat->nout   = nout;
    snew(nbat->out, nbat->nout);
    nbat->nalloc = 0;
    for (int i = 0; i < nbat->nout; i++)
    {
        nbnxn_atomdata_output_init(&nbat->out[i],
                                   nb_kernel_type,
                                   nbat->nenergrp, 1<<nbat->neg_2log,
                                   nbat->alloc);
    }
    nbat->buffer_flags.flag        = nullptr;
    nbat->buffer_flags.flag_nalloc = 0;

    nth = gmx_omp_nthreads_get(emntNonbonded);

    ptr = getenv("GMX_USE_TREEREDUCE");
    if (ptr != nullptr)
    {
        nbat->bUseTreeReduce = (strtol(ptr, nullptr, 10) != 0);
    }
    else if (nth > 8) /* on the CPU we currently don't benefit even at 32 */
    {
        nbat->bUseTreeReduce = 1;
    }
    else
    {
        nbat->bUseTreeReduce = false;
    }
    if (nbat->bUseTreeReduce)
    {
        GMX_LOG(mdlog.info).asParagraph().appendText("Using tree force reduction");

        nbat->syncStep = new tMPI_Atomic[nth];
    }
}

template<int packSize>
static void copy_lj_to_nbat_lj_comb(const real *ljparam_type,
                                    const int *type, int na,
                                    real *ljparam_at)
{
    /* The LJ params follow the combination rule:
     * copy the params for the type array to the atom array.
     */
    for (int is = 0; is < na; is += packSize)
    {
        for (int k = 0; k < packSize; k++)
        {
            int i = is + k;

            ljparam_at[is*2 + k]            = ljparam_type[type[i]*2    ];
            ljparam_at[is*2 + packSize + k] = ljparam_type[type[i]*2 + 1];
        }
    }
}

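/* Editor's note, illustrating the indexing above (not in the original source,
 * assuming packSize == 4): parameter 0 of atoms 0-3 is stored first, then
 * parameter 1 of atoms 0-3, i.e. { p0[a0..a3], p1[a0..a3], p0[a4..a7], ... },
 * so the two per-atom parameters end up pack-interleaved rather than
 * atom-interleaved. */
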
/* Sets the atom type in nbnxn_atomdata_t */
static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t    *nbat,
                                         const nbnxn_search  *nbs,
                                         const int           *type)
{
    for (const nbnxn_grid_t &grid : nbs->grid)
    {
        /* Loop over all columns and copy and fill */
        for (int i = 0; i < grid.numCells[XX]*grid.numCells[YY]; i++)
        {
            int ncz = grid.cxy_ind[i+1] - grid.cxy_ind[i];
            int ash = (grid.cell0 + grid.cxy_ind[i])*grid.na_sc;

            copy_int_to_nbat_int(nbs->a.data() + ash, grid.cxy_na[i], ncz*grid.na_sc,
                                 type, nbat->ntype-1, nbat->type+ash);
        }
    }
}

/* Sets the LJ combination rule parameters in nbnxn_atomdata_t */
static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t    *nbat,
                                            const nbnxn_search  *nbs)
{
    if (nbat->comb_rule != ljcrNONE)
    {
        for (const nbnxn_grid_t &grid : nbs->grid)
        {
            /* Loop over all columns and copy and fill */
            for (int i = 0; i < grid.numCells[XX]*grid.numCells[YY]; i++)
            {
                int ncz = grid.cxy_ind[i+1] - grid.cxy_ind[i];
                int ash = (grid.cell0 + grid.cxy_ind[i])*grid.na_sc;

                if (nbat->XFormat == nbatX4)
                {
                    copy_lj_to_nbat_lj_comb<c_packX4>(nbat->nbfp_comb,
                                                      nbat->type + ash,
                                                      ncz*grid.na_sc,
                                                      nbat->lj_comb + ash*2);
                }
                else if (nbat->XFormat == nbatX8)
                {
                    copy_lj_to_nbat_lj_comb<c_packX8>(nbat->nbfp_comb,
                                                      nbat->type + ash,
                                                      ncz*grid.na_sc,
                                                      nbat->lj_comb + ash*2);
                }
                else if (nbat->XFormat == nbatXYZQ)
                {
                    copy_lj_to_nbat_lj_comb<1>(nbat->nbfp_comb,
                                               nbat->type + ash,
                                               ncz*grid.na_sc,
                                               nbat->lj_comb + ash*2);
                }
            }
        }
    }
}

/* Sets the charges in nbnxn_atomdata_t *nbat */
static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t    *nbat,
                                       const nbnxn_search  *nbs,
                                       const real          *charge)
{
    for (const nbnxn_grid_t &grid : nbs->grid)
    {
        /* Loop over all columns and copy and fill */
        for (int cxy = 0; cxy < grid.numCells[XX]*grid.numCells[YY]; cxy++)
        {
            int ash      = (grid.cell0 + grid.cxy_ind[cxy])*grid.na_sc;
            int na       = grid.cxy_na[cxy];
            int na_round = (grid.cxy_ind[cxy+1] - grid.cxy_ind[cxy])*grid.na_sc;

            if (nbat->XFormat == nbatXYZQ)
            {
                real *q = nbat->x + ash*STRIDE_XYZQ + ZZ + 1;
                int   i;
                for (i = 0; i < na; i++)
                {
                    *q = charge[nbs->a[ash+i]];
                    q += STRIDE_XYZQ;
                }
                /* Complete the partially filled last cell with zeros */
                for (; i < na_round; i++)
                {
                    *q = 0;
                    q += STRIDE_XYZQ;
                }
            }
            else
            {
                real *q = nbat->q + ash;
                int   i;
                for (i = 0; i < na; i++)
                {
                    *q = charge[nbs->a[ash+i]];
                    q++;
                }
                /* Complete the partially filled last cell with zeros */
                for (; i < na_round; i++)
                {
                    *q = 0;
                    q++;
                }
            }
        }
    }
}

/* Set the charges of perturbed atoms in nbnxn_atomdata_t to 0.
 * This is to automatically remove the RF/PME self term in the nbnxn kernels.
 * Part of the zero interactions are still calculated in the normal kernels.
 * All perturbed interactions are calculated in the free energy kernel,
 * using the original charge and LJ data, not nbnxn_atomdata_t.
 */
static void nbnxn_atomdata_mask_fep(nbnxn_atomdata_t    *nbat,
                                    const nbnxn_search  *nbs)
{
    real *q;
    int   stride_q;

    if (nbat->XFormat == nbatXYZQ)
    {
        q        = nbat->x + ZZ + 1;
        stride_q = STRIDE_XYZQ;
    }
    else
    {
        q        = nbat->q;
        stride_q = 1;
    }

    for (const nbnxn_grid_t &grid : nbs->grid)
    {
        int nsubc;
        if (grid.bSimple)
        {
            nsubc = 1;
        }
        else
        {
            nsubc = c_gpuNumClusterPerCell;
        }

        int c_offset = grid.cell0*grid.na_sc;

        /* Loop over all columns and copy and fill */
        for (int c = 0; c < grid.nc*nsubc; c++)
        {
            /* Does this cluster contain perturbed particles? */
            if (grid.fep[c] != 0)
            {
                for (int i = 0; i < grid.na_c; i++)
                {
                    /* Is this a perturbed particle? */
                    if (grid.fep[c] & (1 << i))
                    {
                        int ind = c_offset + c*grid.na_c + i;
                        /* Set atom type and charge to non-interacting */
                        nbat->type[ind] = nbat->ntype - 1;
                        q[ind*stride_q] = 0;
                    }
                }
            }
        }
    }
}

/* Copies the energy group indices to a reordered and packed array */
static void copy_egp_to_nbat_egps(const int *a, int na, int na_round,
                                  int na_c, int bit_shift,
                                  const int *in, int *innb)
{
    int i = 0, j = 0;

    for (; i < na; i += na_c)
    {
        /* Store na_c energy group numbers into one int */
        int comb = 0;
        for (int sa = 0; sa < na_c; sa++)
        {
            int at = a[i + sa];
            if (at >= 0)
            {
                comb |= (GET_CGINFO_GID(in[at]) << (sa*bit_shift));
            }
        }
        innb[j++] = comb;
    }
    /* Complete the partially filled last cell with fill */
    for (; i < na_round; i += na_c)
    {
        innb[j++] = 0;
    }
}

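/* Editor's note, illustrating the packing above (not in the original source,
 * assuming na_c == 4 and bit_shift == 2, i.e. up to 4 energy groups): the
 * group indices g0..g3 of a 4-atom cluster end up in a single int as
 * g0 | (g1 << 2) | (g2 << 4) | (g3 << 6). */
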
/* Set the energy group indices for atoms in nbnxn_atomdata_t */
static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t    *nbat,
                                            const nbnxn_search  *nbs,
                                            const int           *atinfo)
{
    if (nbat->nenergrp == 1)
    {
        return;
    }

    for (const nbnxn_grid_t &grid : nbs->grid)
    {
        /* Loop over all columns and copy and fill */
        for (int i = 0; i < grid.numCells[XX]*grid.numCells[YY]; i++)
        {
            int ncz = grid.cxy_ind[i+1] - grid.cxy_ind[i];
            int ash = (grid.cell0 + grid.cxy_ind[i])*grid.na_sc;

            copy_egp_to_nbat_egps(nbs->a.data() + ash, grid.cxy_na[i], ncz*grid.na_sc,
                                  nbat->na_c, nbat->neg_2log,
                                  atinfo, nbat->energrp+(ash>>grid.na_c_2log));
        }
    }
}

/* Sets all required atom parameter data in nbnxn_atomdata_t */
void nbnxn_atomdata_set(nbnxn_atomdata_t    *nbat,
                        const nbnxn_search  *nbs,
                        const t_mdatoms     *mdatoms,
                        const int           *atinfo)
{
    nbnxn_atomdata_set_atomtypes(nbat, nbs, mdatoms->typeA);

    nbnxn_atomdata_set_charges(nbat, nbs, mdatoms->chargeA);

    if (nbs->bFEP)
    {
        nbnxn_atomdata_mask_fep(nbat, nbs);
    }

    /* This must be done after masking types for FEP */
    nbnxn_atomdata_set_ljcombparams(nbat, nbs);

    nbnxn_atomdata_set_energygroups(nbat, nbs, atinfo);
}

/* Copies the shift vector array to nbnxn_atomdata_t */
void nbnxn_atomdata_copy_shiftvec(gmx_bool          bDynamicBox,
                                  rvec             *shift_vec,
                                  nbnxn_atomdata_t *nbat)
{
    int i;

    nbat->bDynamicBox = bDynamicBox;
    for (i = 0; i < SHIFTS; i++)
    {
        copy_rvec(shift_vec[i], nbat->shift_vec[i]);
    }
}

/* Copies (and reorders) the coordinates to nbnxn_atomdata_t */
void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs,
                                     int                 locality,
                                     gmx_bool            FillLocal,
                                     rvec               *x,
                                     nbnxn_atomdata_t   *nbat,
                                     gmx_wallcycle      *wcycle)
{
    wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
    wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);

    int g0 = 0, g1 = 0;
    int nth, th;

    switch (locality)
    {
        case eatAll:
            g0 = 0;
            g1 = nbs->grid.size();
            break;
        case eatLocal:
            g0 = 0;
            g1 = 1;
            break;
        case eatNonlocal:
            g0 = 1;
            g1 = nbs->grid.size();
            break;
    }

    if (FillLocal)
    {
        nbat->natoms_local = nbs->grid[0].nc*nbs->grid[0].na_sc;
    }

    nth = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nth) schedule(static)
    for (th = 0; th < nth; th++)
    {
        try
        {
            for (int g = g0; g < g1; g++)
            {
                const nbnxn_grid_t &grid       = nbs->grid[g];
                const int           numCellsXY = grid.numCells[XX]*grid.numCells[YY];

                const int           cxy0 = (numCellsXY* th      + nth - 1)/nth;
                const int           cxy1 = (numCellsXY*(th + 1) + nth - 1)/nth;

                for (int cxy = cxy0; cxy < cxy1; cxy++)
                {
                    int na, ash, na_fill;

                    na  = grid.cxy_na[cxy];
                    ash = (grid.cell0 + grid.cxy_ind[cxy])*grid.na_sc;

                    if (g == 0 && FillLocal)
                    {
                        na_fill =
                            (grid.cxy_ind[cxy+1] - grid.cxy_ind[cxy])*grid.na_sc;
                    }
                    else
                    {
                        /* We fill only the real particle locations.
                         * We assume the filling entries at the end have been
                         * properly set before during pair-list generation.
                         */
                        na_fill = na;
                    }
                    copy_rvec_to_nbat_real(nbs->a.data() + ash, na, na_fill, x,
                                           nbat->XFormat, nbat->x, ash);
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
    wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}

static void
nbnxn_atomdata_clear_reals(real * gmx_restrict dest,
                           int i0, int i1)
{
    for (int i = i0; i < i1; i++)
    {
        dest[i] = 0;
    }
}

gmx_unused static void
nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
                            gmx_bool bDestSet,
                            real ** gmx_restrict src,
                            int nsrc,
                            int i0, int i1)
{
    if (bDestSet)
    {
        /* The destination buffer contains data, add to it */
        for (int i = i0; i < i1; i++)
        {
            for (int s = 0; s < nsrc; s++)
            {
                dest[i] += src[s][i];
            }
        }
    }
    else
    {
        /* The destination buffer is uninitialized, set it first */
        for (int i = i0; i < i1; i++)
        {
            dest[i] = src[0][i];
            for (int s = 1; s < nsrc; s++)
            {
                dest[i] += src[s][i];
            }
        }
    }
}

gmx_unused static void
nbnxn_atomdata_reduce_reals_simd(real gmx_unused * gmx_restrict dest,
                                 gmx_bool gmx_unused bDestSet,
                                 real gmx_unused ** gmx_restrict src,
                                 int gmx_unused nsrc,
                                 int gmx_unused i0, int gmx_unused i1)
{
#if GMX_SIMD
    /* The SIMD width here is actually independent of that in the kernels,
     * but we use the same width for simplicity (usually optimal anyhow).
     */
    SimdReal dest_SSE, src_SSE;

    if (bDestSet)
    {
        for (int i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH)
        {
            dest_SSE = load<SimdReal>(dest+i);
            for (int s = 0; s < nsrc; s++)
            {
                src_SSE  = load<SimdReal>(src[s]+i);
                dest_SSE = dest_SSE + src_SSE;
            }
            store(dest+i, dest_SSE);
        }
    }
    else
    {
        for (int i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH)
        {
            dest_SSE = load<SimdReal>(src[0]+i);
            for (int s = 1; s < nsrc; s++)
            {
                src_SSE  = load<SimdReal>(src[s]+i);
                dest_SSE = dest_SSE + src_SSE;
            }
            store(dest+i, dest_SSE);
        }
    }
#endif
}

/* Add part of the force array(s) from nbnxn_atomdata_t to f */
static void
nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search *nbs,
                                    const nbnxn_atomdata_t *nbat,
                                    nbnxn_atomdata_output_t *out,
                                    int nfa,
                                    int a0, int a1,
                                    rvec *f)
{
    gmx::ArrayRef<const int> cell = nbs->cell;

    /* Loop over all columns and copy and fill */
    switch (nbat->FFormat)
    {
        case nbatXYZ:
        case nbatXYZQ:
            if (nfa == 1)
            {
                const real *fnb = out[0].f;

                for (int a = a0; a < a1; a++)
                {
                    int i = cell[a]*nbat->fstride;

                    f[a][XX] += fnb[i];
                    f[a][YY] += fnb[i+1];
                    f[a][ZZ] += fnb[i+2];
                }
            }
            else
            {
                for (int a = a0; a < a1; a++)
                {
                    int i = cell[a]*nbat->fstride;

                    for (int fa = 0; fa < nfa; fa++)
                    {
                        f[a][XX] += out[fa].f[i];
                        f[a][YY] += out[fa].f[i+1];
                        f[a][ZZ] += out[fa].f[i+2];
                    }
                }
            }
            break;
        case nbatX4:
            if (nfa == 1)
            {
                const real *fnb = out[0].f;

                for (int a = a0; a < a1; a++)
                {
                    int i = atom_to_x_index<c_packX4>(cell[a]);

                    f[a][XX] += fnb[i+XX*c_packX4];
                    f[a][YY] += fnb[i+YY*c_packX4];
                    f[a][ZZ] += fnb[i+ZZ*c_packX4];
                }
            }
            else
            {
                for (int a = a0; a < a1; a++)
                {
                    int i = atom_to_x_index<c_packX4>(cell[a]);

                    for (int fa = 0; fa < nfa; fa++)
                    {
                        f[a][XX] += out[fa].f[i+XX*c_packX4];
                        f[a][YY] += out[fa].f[i+YY*c_packX4];
                        f[a][ZZ] += out[fa].f[i+ZZ*c_packX4];
                    }
                }
            }
            break;
        case nbatX8:
            if (nfa == 1)
            {
                const real *fnb = out[0].f;

                for (int a = a0; a < a1; a++)
                {
                    int i = atom_to_x_index<c_packX8>(cell[a]);

                    f[a][XX] += fnb[i+XX*c_packX8];
                    f[a][YY] += fnb[i+YY*c_packX8];
                    f[a][ZZ] += fnb[i+ZZ*c_packX8];
                }
            }
            else
            {
                for (int a = a0; a < a1; a++)
                {
                    int i = atom_to_x_index<c_packX8>(cell[a]);

                    for (int fa = 0; fa < nfa; fa++)
                    {
                        f[a][XX] += out[fa].f[i+XX*c_packX8];
                        f[a][YY] += out[fa].f[i+YY*c_packX8];
                        f[a][ZZ] += out[fa].f[i+ZZ*c_packX8];
                    }
                }
            }
            break;
        default:
            gmx_incons("Unsupported nbnxn_atomdata_t format");
    }
}

static inline unsigned char reverse_bits(unsigned char b)
{
    /* http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv */
    return (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;
}

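/* Editor's note (not in the original source): the expression above is the
 * classic "reverse byte with 64-bit multiply and modulus" bit hack; for
 * example it maps 0b00000001 to 0b10000000 and 0b00101100 to 0b00110100.
 * It is used below to permute thread positions within a reduction group. */
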
static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(const nbnxn_atomdata_t *nbat,
                                                      int                     nth)
{
    const nbnxn_buffer_flags_t *flags = &nbat->buffer_flags;

    int next_pow2 = 1<<(gmx::log2I(nth-1)+1);

    assert(nbat->nout == nth); /* tree-reduce currently only works for nout==nth */

    memset(nbat->syncStep, 0, sizeof(*(nbat->syncStep))*nth);

#pragma omp parallel num_threads(nth)
    {
        try
        {
            int b0, b1, b;
            int i0, i1;
            int group_size, th;

            th = gmx_omp_get_thread_num();

            for (group_size = 2; group_size < 2*next_pow2; group_size *= 2)
            {
                int index[2], group_pos, partner_pos, wu;
                int partner_th = th ^ (group_size/2);

                if (group_size > 2)
                {
#ifdef TMPI_ATOMICS
                    /* wait on partner thread - replaces full barrier */
                    int sync_th, sync_group_size;

                    tMPI_Atomic_memory_barrier();                         /* guarantee data is saved before marking work as done */
                    tMPI_Atomic_set(&(nbat->syncStep[th]), group_size/2); /* mark previous step as completed */

                    /* find thread to sync with. Equal to partner_th unless nth is not a power of two. */
                    for (sync_th = partner_th, sync_group_size = group_size; sync_th >= nth && sync_group_size > 2; sync_group_size /= 2)
                    {
                        sync_th &= ~(sync_group_size/4);
                    }
                    if (sync_th < nth) /* otherwise nothing to sync index[1] will be >=nout */
                    {
                        /* wait on the thread which computed input data in previous step */
                        while (tMPI_Atomic_get(static_cast<volatile tMPI_Atomic_t *>(&(nbat->syncStep[sync_th]))) < group_size/2)
                        {
                        }
                        /* guarantee that no later load happens before wait loop is finished */
                        tMPI_Atomic_memory_barrier();
                    }
#else           /* TMPI_ATOMICS */
#pragma omp barrier
#endif
                }

                /* Calculate buffers to sum (result goes into first buffer) */
                group_pos = th % group_size;
                index[0]  = th - group_pos;
                index[1]  = index[0] + group_size/2;

                /* If no second buffer, nothing to do */
                if (index[1] >= nbat->nout && group_size > 2)
                {
                    continue;
                }

#if NBNXN_BUFFERFLAG_MAX_THREADS > 256
#error reverse_bits assumes max 256 threads
#endif
                /* Position is permuted so that one of the 2 vectors being added was computed on the same thread in the previous step.
                   This improves locality and enables to sync with just a single thread between steps (=the levels in the btree).
                   The permutation which allows this corresponds to reversing the bits of the group position.
                 */
                group_pos = reverse_bits(group_pos)/(256/group_size);
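                /* Editor's note (worked example, not in the original source):
                 * with group_size == 4 the divisor is 256/4 = 64, so group
                 * positions {0,1,2,3} map through reverse_bits to
                 * {0,128,64,192} and, after the division, to {0,2,1,3}. */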

                partner_pos = group_pos ^ 1;

                /* loop over two work-units (own and partner) */
                for (wu = 0; wu < 2; wu++)
                {
                    if (wu == 1)
                    {
                        if (partner_th < nth)
                        {
                            break; /* partner exists we don't have to do his work */
                        }
                        else
                        {
                            group_pos = partner_pos;
                        }
                    }

                    /* Calculate the cell-block range for our thread */
                    b0 = (flags->nflag* group_pos   )/group_size;
                    b1 = (flags->nflag*(group_pos+1))/group_size;

                    for (b = b0; b < b1; b++)
                    {
                        i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
                        i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;

                        if (bitmask_is_set(flags->flag[b], index[1]) || group_size > 2)
                        {
#if GMX_SIMD
                            nbnxn_atomdata_reduce_reals_simd
#else
                            nbnxn_atomdata_reduce_reals
#endif
                                (nbat->out[index[0]].f,
                                 bitmask_is_set(flags->flag[b], index[0]) || group_size > 2,
                                 &(nbat->out[index[1]].f), 1, i0, i1);
                        }
                        else if (!bitmask_is_set(flags->flag[b], index[0]))
                        {
                            nbnxn_atomdata_clear_reals(nbat->out[index[0]].f,
                                                       i0, i1);
                        }
                    }
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}

static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(const nbnxn_atomdata_t *nbat,
                                                     int                     nth)
{
#pragma omp parallel for num_threads(nth) schedule(static)
    for (int th = 0; th < nth; th++)
    {
        try
        {
            const nbnxn_buffer_flags_t *flags;
            int   nfptr;
            real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS];

            flags = &nbat->buffer_flags;

            /* Calculate the cell-block range for our thread */
            int b0 = (flags->nflag* th   )/nth;
            int b1 = (flags->nflag*(th+1))/nth;

            for (int b = b0; b < b1; b++)
            {
                int i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
                int i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;

                nfptr = 0;
                for (int out = 1; out < nbat->nout; out++)
                {
                    if (bitmask_is_set(flags->flag[b], out))
                    {
                        fptr[nfptr++] = nbat->out[out].f;
                    }
                }
                if (nfptr > 0)
                {
#if GMX_SIMD
                    nbnxn_atomdata_reduce_reals_simd
#else
                    nbnxn_atomdata_reduce_reals
#endif
                        (nbat->out[0].f,
                         bitmask_is_set(flags->flag[b], 0),
                         fptr, nfptr,
                         i0, i1);
                }
                else if (!bitmask_is_set(flags->flag[b], 0))
                {
                    nbnxn_atomdata_clear_reals(nbat->out[0].f,
                                               i0, i1);
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}

/* Add the force array(s) from nbnxn_atomdata_t to f */
void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search           *nbs,
                                    int                     locality,
                                    const nbnxn_atomdata_t *nbat,
                                    rvec                   *f,
                                    gmx_wallcycle          *wcycle)
{
    wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
    wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);

    int a0 = 0, na = 0;

    nbs_cycle_start(&nbs->cc[enbsCCreducef]);

    switch (locality)
    {
        case eatAll:
            a0 = 0;
            na = nbs->natoms_nonlocal;
            break;
        case eatLocal:
            a0 = 0;
            na = nbs->natoms_local;
            break;
        case eatNonlocal:
            a0 = nbs->natoms_local;
            na = nbs->natoms_nonlocal - nbs->natoms_local;
            break;
    }

    int nth = gmx_omp_nthreads_get(emntNonbonded);

    if (nbat->nout > 1)
    {
        if (locality != eatAll)
        {
            gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
        }

        /* Reduce the force thread output buffers into buffer 0, before adding
         * them to the, differently ordered, "real" force buffer.
         */
        if (nbat->bUseTreeReduce)
        {
            nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
        }
        else
        {
            nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
        }
    }
#pragma omp parallel for num_threads(nth) schedule(static)
    for (int th = 0; th < nth; th++)
    {
        try
        {
            nbnxn_atomdata_add_nbat_f_to_f_part(nbs, nbat,
                                                nbat->out,
                                                1,
                                                a0+((th+0)*na)/nth,
                                                a0+((th+1)*na)/nth,
                                                f);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    nbs_cycle_stop(&nbs->cc[enbsCCreducef]);

    wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
    wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}

/* Adds the shift forces from nbnxn_atomdata_t to fshift */
void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
                                              rvec                   *fshift)
{
    const nbnxn_atomdata_output_t *out = nbat->out;

    for (int s = 0; s < SHIFTS; s++)
    {
        rvec sum;
        clear_rvec(sum);
        for (int th = 0; th < nbat->nout; th++)
        {
            sum[XX] += out[th].fshift[s*DIM+XX];
            sum[YY] += out[th].fshift[s*DIM+YY];
            sum[ZZ] += out[th].fshift[s*DIM+ZZ];
        }
        rvec_inc(fshift[s], sum);
    }
}