/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief This file defines functions for managing threading of listed
 * interactions.
 *
 * \author Mark Abraham <mark.j.abraham@gmail.com>
 * \ingroup module_listed-forces
 */
#include "gmxpre.h"

#include "manage-threading.h"

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#include <string>

#include "gromacs/gmxlib/gmx_omp_nthreads.h"
#include "gromacs/legacyheaders/types/ifunc.h"
#include "gromacs/listed-forces/listed-forces.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"

#include "listed-internal.h"
/*! \brief struct for passing all data required for a function type */
typedef struct {
    int      ftype; /**< the function type index */
    t_ilist *il;    /**< pointer to t_ilist entry corresponding to ftype */
    int      nat;   /**< nr of atoms involved in a single ftype interaction */
} ilist_data_t;
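/* Example of how an ilist_data_t entry relates to the t_ilist layout: for
 * angles nat is 3, so il->iatoms is a sequence of groups of nat + 1 ints,
 * [paramtype, ai, aj, ak][paramtype, ai, aj, ak]..., and il->nr is the
 * total number of such ints.
 */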
/*! \brief Divides listed interactions over threads
 *
 * This routine attempts to divide all interactions of the ntype bonded
 * types stored in ild over the threads such that each thread has roughly
 * equal load and different threads avoid touching the same atoms as much
 * as possible.
 */
static void divide_bondeds_by_locality(int                 ntype,
                                       const ilist_data_t *ild,
                                       int                 nthread,
                                       t_idef             *idef)
{
    int nat_tot, nat_sum;
    int ind[F_NRE];    /* index into the ild[].il->iatoms */
    int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
    int f, t;

    assert(ntype <= F_NRE);
    nat_tot = 0;
    for (f = 0; f < ntype; f++)
    {
        /* Sum #bondeds*#atoms_per_bond over all bonded types */
        nat_tot += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;

        /* The start bound for thread 0 is 0 for all interactions */
        ind[f] = 0;

        /* Initialize the next atom index array */
        assert(ild[f].il->nr > 0);
        at_ind[f] = ild[f].il->iatoms[1];
    }
    nat_sum = 0;
    /* Loop over the end bounds of the nthread threads to determine
     * which interactions threads 0 to nthread shall calculate.
     *
     * NOTE: The cost of these combined loops is #interactions*ntype.
     * This code is running single threaded (difficult to parallelize
     * over threads). So the relative cost of this function increases
     * linearly with the number of threads. Since the inner-most loop
     * is cheap and this is done only at DD repartitioning, the cost should
     * be negligible. At high thread count many other parts of the code
     * scale the same way, so it's (currently) not worth improving this.
     */
    for (t = 1; t <= nthread; t++)
    {
        int nat_thread;

        /* Here we assume that the computational cost is proportional
         * to the number of atoms in the interaction. This is a rough
         * measure, but roughly correct. Usually there are very few
         * interactions anyhow and they are distributed relatively
         * uniformly. Proper and RB dihedrals are often distributed
         * non-uniformly, but their cost is roughly equal.
         */
        nat_thread = (nat_tot*t)/nthread;
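        /* Example: with nat_tot = 100 atoms worth of bonded work and
         * nthread = 4, the cumulative targets nat_thread are 25, 50, 75
         * and 100, so each thread is assigned roughly 25 atoms of work.
         */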
        while (nat_sum < nat_thread)
        {
            /* To divide bonds based on atom order, we compare
             * the index of the first atom in the bonded interaction.
             * This works well, since the domain decomposition generates
             * bondeds in order of the atoms by looking up interactions
             * which are linked to the first atom in each interaction.
             * It usually also works well without DD, since then the atoms
             * in bonded interactions are usually in increasing order.
             * If they are not assigned in increasing order, the balancing
             * is still good, but the memory access and reduction cost will
             * be higher.
             */
            int f_min;

            /* Find out which of the types has the lowest atom index */
            f_min = 0;
            for (f = 1; f < ntype; f++)
            {
                if (at_ind[f] < at_ind[f_min])
                {
                    f_min = f;
                }
            }
            assert(f_min >= 0 && f_min < ntype);

            /* Assign the interaction with the lowest atom index (of type
             * index f_min) to thread t-1 by increasing ind.
             */
            ind[f_min] += ild[f_min].nat + 1;
            nat_sum    += ild[f_min].nat;

            /* Update the first unassigned atom index for this type */
            if (ind[f_min] < ild[f_min].il->nr)
            {
                at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
            }
            else
            {
                /* We have assigned all interactions of this type.
                 * Setting at_ind to INT_MAX ensures this type will not be
                 * chosen in the for loop above during next iterations.
                 */
                at_ind[f_min] = INT_MAX;
            }
        }
        /* Store the bonded end boundaries (at index t) for thread t-1 */
        for (f = 0; f < ntype; f++)
        {
            idef->il_thread_division[ild[f].ftype*(nthread + 1) + t] = ind[f];
        }
    }

    for (f = 0; f < ntype; f++)
    {
        assert(ind[f] == ild[f].il->nr);
    }
}
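/* Illustration of the division stored above: for function type f,
 * il_thread_division holds nthread + 1 boundaries into il[f].iatoms, and
 * thread t processes the half-open range
 *   [ il_thread_division[f*(nthread + 1) + t],
 *     il_thread_division[f*(nthread + 1) + t + 1] ).
 */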
//! Divides bonded interactions over threads
static void divide_bondeds_over_threads(t_idef *idef,
                                        int     nthread,
                                        int     max_nthread_uniform)
{
    ilist_data_t ild[F_NRE];
    int          ntype;
    int          f;

    idef->nthreads = nthread;

    if (F_NRE*(nthread + 1) > idef->il_thread_division_nalloc)
    {
        idef->il_thread_division_nalloc = F_NRE*(nthread + 1);
        snew(idef->il_thread_division, idef->il_thread_division_nalloc);
    }
    ntype = 0;
    for (f = 0; f < F_NRE; f++)
    {
        if (!ftype_is_bonded_potential(f))
        {
            continue;
        }

        if (idef->il[f].nr == 0)
        {
            /* No interactions, avoid all the integer math below */
            int t;

            for (t = 0; t <= nthread; t++)
            {
                idef->il_thread_division[f*(nthread + 1) + t] = 0;
            }
        }
        else if (nthread <= max_nthread_uniform || f == F_DISRES)
        {
            /* On up to 4 threads, load balancing the bonded work
             * is more important than minimizing the reduction cost.
             */
            int nat1, t;

            /* nat1 = 1 + #atoms(ftype), which is the stride used for iatoms */
            nat1 = 1 + NRAL(f);

            for (t = 0; t <= nthread; t++)
            {
                int nr_t;

                /* Divide equally over the threads */
                nr_t = (((idef->il[f].nr/nat1)*t)/nthread)*nat1;
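                /* Example: 6 angle interactions (nat1 = 4, il->nr = 24)
                 * over nthread = 3 give boundaries nr_t = 0, 8, 16, 24,
                 * i.e. 2 interactions per thread.
                 */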
                if (f == F_DISRES)
                {
                    /* Ensure that distance restraint pairs with the same label
                     * end up on the same thread.
                     */
                    while (nr_t > 0 && nr_t < idef->il[f].nr &&
                           idef->iparams[idef->il[f].iatoms[nr_t]].disres.label ==
                           idef->iparams[idef->il[f].iatoms[nr_t - nat1]].disres.label)
                    {
                        nr_t += nat1;
                    }
                }

                idef->il_thread_division[f*(nthread + 1) + t] = nr_t;
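                /* The loop above moves a boundary forward past any restraint
                 * that shares its label with the preceding entry; e.g. with
                 * labels 0, 0, 1, 1 the only interior split point left is
                 * between the second and third restraint.
                 */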
            }
        }
        else
        {
            /* Add this ftype to the list to be distributed */
            int nat;

            nat              = NRAL(f);
            ild[ntype].ftype = f;
            ild[ntype].il    = &idef->il[f];
            ild[ntype].nat   = nat;

            /* The first index for the thread division is always 0 */
            idef->il_thread_division[f*(nthread + 1)] = 0;
            ntype++;
        }
    }

    if (ntype > 0)
    {
        divide_bondeds_by_locality(ntype, ild, nthread, idef);
    }
    if (debug)
    {
        fprintf(debug, "Division of bondeds over threads:\n");
        for (f = 0; f < F_NRE; f++)
        {
            if (ftype_is_bonded_potential(f) && idef->il[f].nr > 0)
            {
                int t;

                fprintf(debug, "%16s", interaction_function[f].name);
                for (t = 0; t < nthread; t++)
                {
                    fprintf(debug, " %4d",
                            (idef->il_thread_division[f*(nthread + 1) + t + 1] -
                             idef->il_thread_division[f*(nthread + 1) + t])/
                            (1 + NRAL(f)));
                }
                fprintf(debug, "\n");
            }
        }
    }
}
//! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
static void
calc_bonded_reduction_mask(int            natoms,
                           f_thread_t    *f_thread,
                           const t_idef  *idef,
                           int            thread,
                           int            nthread)
{
    assert(nthread <= BITMASK_SIZE);

    int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;

    if (nblock > f_thread->block_nalloc)
    {
        f_thread->block_nalloc = over_alloc_large(nblock);
        srenew(f_thread->mask,        f_thread->block_nalloc);
        srenew(f_thread->block_index, f_thread->block_nalloc);
        srenew(f_thread->f,           f_thread->block_nalloc*reduction_block_size);
    }
    gmx_bitmask_t *mask = f_thread->mask;

    for (int b = 0; b < nblock; b++)
    {
        bitmask_clear(&mask[b]);
    }

    for (int ftype = 0; ftype < F_NRE; ftype++)
    {
        if (ftype_is_bonded_potential(ftype))
        {
            int nb = idef->il[ftype].nr;
            if (nb > 0)
            {
                int nat1 = interaction_function[ftype].nratoms + 1;

                int nb0 = idef->il_thread_division[ftype*(nthread + 1) + thread];
                int nb1 = idef->il_thread_division[ftype*(nthread + 1) + thread + 1];

                for (int i = nb0; i < nb1; i += nat1)
                {
                    for (int a = 1; a < nat1; a++)
                    {
                        bitmask_set_bit(&mask[idef->il[ftype].iatoms[i + a] >> reduction_block_bits], thread);
                    }
                }
            }
        }
    }
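    /* Each atom index maps to force block atom >> reduction_block_bits, so
     * if, say, reduction_block_size is 32 (reduction_block_bits = 5), an
     * angle involving atoms 30, 31 and 32 sets this thread's bit in the
     * masks of blocks 0 and 1.
     */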
    /* Make an index of the blocks our thread touches, so we can do fast
     * force buffer clearing.
     */
    f_thread->nblock_used = 0;
    for (int b = 0; b < nblock; b++)
    {
        if (bitmask_is_set(mask[b], thread))
        {
            f_thread->block_index[f_thread->nblock_used++] = b;
        }
    }
}
void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
{
    bonded_threading_t *bt   = fr->bonded_threading;
    int                 ctot = 0;

    assert(bt->nthreads >= 1);

    /* Divide the bonded interactions over the threads */
    divide_bondeds_over_threads(idef,
                                bt->nthreads,
                                bt->bonded_max_nthread_uniform);

    if (bt->nthreads == 1)
    {
        bt->nblock_used = 0;

        return;
    }
    /* Determine to which blocks each thread's bonded force calculation
     * contributes. Store this as a mask for each thread.
     */
#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
    for (int t = 0; t < bt->nthreads; t++)
    {
        try
        {
            calc_bonded_reduction_mask(fr->natoms_force, &bt->f_t[t],
                                       idef, t, bt->nthreads);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
    /* Reduce the masks over the threads and determine which blocks
     * we need to reduce over.
     */
    int nblock_tot = (fr->natoms_force + reduction_block_size - 1) >> reduction_block_bits;
    if (nblock_tot > bt->block_nalloc)
    {
        bt->block_nalloc = over_alloc_large(nblock_tot);
        srenew(bt->block_index, bt->block_nalloc);
        srenew(bt->mask,        bt->block_nalloc);
    }
    bt->nblock_used = 0;
    for (int b = 0; b < nblock_tot; b++)
    {
        gmx_bitmask_t *mask = &bt->mask[b];

        /* Generate the union over the threads of the bitmask */
        bitmask_clear(mask);
        bitmask_union(mask, bt->f_t[0].mask[b]);
        for (int t = 1; t < bt->nthreads; t++)
        {
            bitmask_union(mask, bt->f_t[t].mask[b]);
        }
        if (!bitmask_is_zero(*mask))
        {
            bt->block_index[bt->nblock_used++] = b;
        }
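        /* If, for example, only threads 0 and 2 touch block b, the union
         * mask for b has bits 0 and 2 set and b is appended to block_index,
         * so the force reduction later visits only blocks that were
         * actually written.
         */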
        if (debug)
        {
            int c = 0;
            for (int t = 0; t < bt->nthreads; t++)
            {
                if (bitmask_is_set(*mask, t))
                {
                    c++;
                }
            }
            ctot += c;

            if (gmx_debug_at)
            {
#if BITMASK_SIZE <= 64 //move into bitmask when it is C++
                std::string flags = gmx::formatString("%x", *mask);
#else
                std::string flags = gmx::formatAndJoin(*mask,
                                                       *mask+BITMASK_ALEN,
                                                       "", gmx::StringFormatter("%x"));
#endif
                fprintf(debug, "block %d flags %s count %d\n",
                        b, flags.c_str(), c);
            }
        }
    }
    if (debug)
    {
        fprintf(debug, "Number of %d atom blocks to reduce: %d\n",
                reduction_block_size, bt->nblock_used);
        fprintf(debug, "Reduction density %.2f for touched blocks only %.2f\n",
                ctot*reduction_block_size/(double)fr->natoms_force,
                ctot/(double)bt->nblock_used);
    }
}
void init_bonded_threading(FILE *fplog, int nenergrp,
                           struct bonded_threading_t **bt_ptr)
{
    bonded_threading_t *bt;

    snew(bt, 1);

    /* These thread-local data structures are used for bondeds only */
    bt->nthreads = gmx_omp_nthreads_get(emntBonded);

    if (bt->nthreads > 1)
    {
        int t;

        snew(bt->f_t, bt->nthreads);
#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
        for (t = 0; t < bt->nthreads; t++)
        {
            try
            {
                /* Thread 0 uses the global force and energy arrays */
                if (t > 0)
                {
                    int i;

                    bt->f_t[t].f        = NULL;
                    bt->f_t[t].f_nalloc = 0;
                    snew(bt->f_t[t].fshift, SHIFTS);
                    bt->f_t[t].grpp.nener = nenergrp*nenergrp;
                    for (i = 0; i < egNR; i++)
                    {
                        snew(bt->f_t[t].grpp.ener[i], bt->f_t[t].grpp.nener);
                    }
                }
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
        }
    }
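    /* Each extra thread gets its own energy-group-pair accumulators; with,
     * say, 3 energy groups, grpp.nener = 9 entries are allocated for every
     * term in egNR.
     */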
    bt->block_index  = NULL;
    bt->mask         = NULL;
    bt->block_nalloc = 0;

    /* The optimal value after which to switch from uniform to localized
     * bonded interaction distribution is 3, 4 or 5 depending on the system
     * and hardware.
     */
    const int max_nthread_uniform = 4;
    char     *ptr;
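    /* The GMX_BONDED_NTHREAD_UNIFORM environment variable checked below
     * overrides this default; e.g. setting it to 8 keeps the uniform
     * division up to 8 bonded OpenMP threads.
     */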
    if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != NULL)
    {
        sscanf(ptr, "%d", &bt->bonded_max_nthread_uniform);
        if (fplog != NULL)
        {
            fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
                    bt->bonded_max_nthread_uniform);
        }
    }
    else
    {
        bt->bonded_max_nthread_uniform = max_nthread_uniform;
    }

    *bt_ptr = bt;
}