Optimized the bonded thread force reduction
[gromacs.git] / src / gromacs / listed-forces / manage-threading.cpp
blob: d2a39f8b3bb61eea049806c7ba164f0356941180
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief This file defines functions for managing threading of listed
 * interactions.
 *
 * \author Mark Abraham <mark.j.abraham@gmail.com>
 * \ingroup module_listed-forces
 */
#include "gmxpre.h"

#include "manage-threading.h"

#include <assert.h>
#include <limits.h>
#include <stdlib.h>

#include <algorithm>

#include "gromacs/gmxlib/gmx_omp_nthreads.h"
#include "gromacs/legacyheaders/types/ifunc.h"
#include "gromacs/listed-forces/listed-forces.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"

#include "listed-internal.h"
/*! \brief struct for passing all data required for a function type */
typedef struct {
    int       ftype; /**< the function type index */
    t_ilist  *il;    /**< pointer to t_ilist entry corresponding to ftype */
    int       nat;   /**< nr of atoms involved in a single ftype interaction */
} ilist_data_t;
/*! \brief Divides listed interactions over threads
 *
 * This routine attempts to divide all interactions of the ntype bonded
 * types stored in ild over the threads such that each thread has roughly
 * equal load and different threads avoid touching the same atoms as much
 * as possible.
 */
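/* The resulting division is stored in idef->il_thread_division: for each
 * ftype that array holds nthread+1 boundaries into il->iatoms, so thread t
 * computes the interactions in the half-open range
 * [il_thread_division[ftype*(nthread+1) + t],
 *  il_thread_division[ftype*(nthread+1) + t + 1]).
 */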
static void divide_bondeds_by_locality(int                 ntype,
                                       const ilist_data_t *ild,
                                       int                 nthread,
                                       t_idef             *idef)
{
    int nat_tot, nat_sum;
    int ind[F_NRE];    /* index into the ild[].il->iatoms */
    int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
    int f, t;

    assert(ntype <= F_NRE);

    nat_tot = 0;
    for (f = 0; f < ntype; f++)
    {
        /* Sum #bondeds*#atoms_per_bond over all bonded types */
        nat_tot  += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;
        /* The start bound for thread 0 is 0 for all interactions */
        ind[f]    = 0;
        /* Initialize the next atom index array */
        assert(ild[f].il->nr > 0);
        at_ind[f] = ild[f].il->iatoms[1];
    }

    nat_sum = 0;
    /* Loop over the end bounds of the nthread threads to determine
     * which interactions threads 0 to nthread shall calculate.
     *
     * NOTE: The cost of these combined loops is #interactions*ntype.
     * This code runs single threaded (it is difficult to parallelize
     * over threads), so its relative cost increases linearly with the
     * number of threads. Since the inner-most loop is cheap and this is
     * done only at DD repartitioning, the cost should be negligible.
     * At high thread count many other parts of the code scale the same
     * way, so it's (currently) not worth improving this.
     */
    for (t = 1; t <= nthread; t++)
    {
        int nat_thread;

        /* Here we assume that the computational cost is proportional
         * to the number of atoms in the interaction. This is a rough
         * measure, but roughly correct. Usually there are very few
         * interactions anyhow and they are distributed relatively
         * uniformly. Proper and RB dihedrals are often distributed
         * non-uniformly, but their cost is roughly equal.
         */
        nat_thread = (nat_tot*t)/nthread;

        while (nat_sum < nat_thread)
        {
            /* To divide bonds based on atom order, we compare
             * the index of the first atom in the bonded interaction.
             * This works well, since the domain decomposition generates
             * bondeds in order of the atoms by looking up interactions
             * which are linked to the first atom in each interaction.
             * It usually also works well without DD, since then the atoms
             * in bonded interactions are usually in increasing order.
             * If they are not assigned in increasing order, the balancing
             * is still good, but the memory access and reduction cost will
             * be higher.
             */
            int f_min;

            /* Find out which of the types has the lowest atom index */
            f_min = 0;
            for (f = 1; f < ntype; f++)
            {
                if (at_ind[f] < at_ind[f_min])
                {
                    f_min = f;
                }
            }
            assert(f_min >= 0 && f_min < ntype);

            /* Assign the interaction with the lowest atom index (of type
             * index f_min) to thread t-1 by increasing ind.
             */
            ind[f_min] += ild[f_min].nat + 1;
            nat_sum    += ild[f_min].nat;

            /* Update the first unassigned atom index for this type */
            if (ind[f_min] < ild[f_min].il->nr)
            {
                at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
            }
            else
            {
                /* We have assigned all interactions of this type.
                 * Setting at_ind to INT_MAX ensures this type will not be
                 * chosen in the for loop above during next iterations.
                 */
                at_ind[f_min] = INT_MAX;
            }
        }

        /* Store the bonded end boundaries (at index t) for thread t-1 */
        for (f = 0; f < ntype; f++)
        {
            idef->il_thread_division[ild[f].ftype*(nthread + 1) + t] = ind[f];
        }
    }

    for (f = 0; f < ntype; f++)
    {
        assert(ind[f] == ild[f].il->nr);
    }
}

//! Divides bonded interactions over threads
static void divide_bondeds_over_threads(t_idef *idef,
                                        int     nthread,
                                        int     max_nthread_uniform)
{
    ilist_data_t ild[F_NRE];
    int          ntype;
    int          f;

    assert(nthread > 0);

    idef->nthreads = nthread;

    if (F_NRE*(nthread + 1) > idef->il_thread_division_nalloc)
    {
        idef->il_thread_division_nalloc = F_NRE*(nthread + 1);
        snew(idef->il_thread_division, idef->il_thread_division_nalloc);
    }

    ntype = 0;
    for (f = 0; f < F_NRE; f++)
    {
        if (!ftype_is_bonded_potential(f))
        {
            continue;
        }

        if (idef->il[f].nr == 0)
        {
            /* No interactions, avoid all the integer math below */
            int t;

            for (t = 0; t <= nthread; t++)
            {
                idef->il_thread_division[f*(nthread + 1) + t] = 0;
            }
        }
        else if (nthread <= max_nthread_uniform || f == F_DISRES)
        {
            /* With few threads (at most max_nthread_uniform), load
             * balancing the bonded work is more important than
             * minimizing the reduction cost.
             */
            int nat1, t;

            /* nat1 = 1 + #atoms(ftype), which is the stride used for iatoms */
            nat1 = 1 + NRAL(f);

            for (t = 0; t <= nthread; t++)
            {
                int nr_t;

                /* Divide equally over the threads */
                nr_t = (((idef->il[f].nr/nat1)*t)/nthread)*nat1;
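                /* Example: 10 angles give il->nr = 40 with nat1 = 4, so on
                 * 4 threads the boundaries become 0, 8, 20, 28 and 40 iatoms
                 * entries, i.e. 2, 3, 2 and 3 interactions per thread.
                 */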

                if (f == F_DISRES)
                {
                    /* Ensure that distance restraint pairs with the same
                     * label end up on the same thread: pairs sharing a label
                     * belong to a single restraint and are evaluated together.
                     */
                    while (nr_t > 0 && nr_t < idef->il[f].nr &&
                           idef->iparams[idef->il[f].iatoms[nr_t]].disres.label ==
                           idef->iparams[idef->il[f].iatoms[nr_t-nat1]].disres.label)
                    {
                        nr_t += nat1;
                    }
                }

                idef->il_thread_division[f*(nthread + 1) + t] = nr_t;
            }
        }
        else
        {
            /* Add this ftype to the list to be distributed */
            int nat;

            nat              = NRAL(f);
            ild[ntype].ftype = f;
            ild[ntype].il    = &idef->il[f];
            ild[ntype].nat   = nat;

            /* The first index for the thread division is always 0 */
            idef->il_thread_division[f*(nthread + 1)] = 0;

            ntype++;
        }
    }

    if (ntype > 0)
    {
        divide_bondeds_by_locality(ntype, ild, nthread, idef);
    }

    if (debug)
    {
        int f;

        fprintf(debug, "Division of bondeds over threads:\n");
        for (f = 0; f < F_NRE; f++)
        {
            if (ftype_is_bonded_potential(f) && idef->il[f].nr > 0)
            {
                int t;

                fprintf(debug, "%16s", interaction_function[f].name);
                for (t = 0; t < nthread; t++)
                {
                    fprintf(debug, " %4d",
                            (idef->il_thread_division[f*(nthread + 1) + t + 1] -
                             idef->il_thread_division[f*(nthread + 1) + t])/
                            (1 + NRAL(f)));
                }
                fprintf(debug, "\n");
            }
        }
    }
}
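
/* Note: the locality-based division above keeps each thread's atoms
 * clustered, which is what keeps the per-thread reduction masks computed
 * below sparse and the thread force reduction cheap.
 */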

//! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
static void
calc_bonded_reduction_mask(int natoms,
                           f_thread_t *f_thread,
                           const t_idef *idef,
                           int thread, int nthread)
{
    assert(nthread <= BITMASK_SIZE);
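
    /* For every block of reduction_block_size atoms, mask[b] gets bit
     * 'thread' set when any interaction assigned to this thread writes a
     * force to an atom in block b. The block_index list then records which
     * blocks this thread touches at all.
     */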

    int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;

    if (nblock > f_thread->block_nalloc)
    {
        f_thread->block_nalloc = over_alloc_large(nblock);
        srenew(f_thread->mask,        f_thread->block_nalloc);
        srenew(f_thread->block_index, f_thread->block_nalloc);
        srenew(f_thread->f,           f_thread->block_nalloc*reduction_block_size);
    }

    gmx_bitmask_t *mask = f_thread->mask;

    for (int b = 0; b < nblock; b++)
    {
        bitmask_clear(&mask[b]);
    }

    for (int ftype = 0; ftype < F_NRE; ftype++)
    {
        if (ftype_is_bonded_potential(ftype))
        {
            int nb = idef->il[ftype].nr;
            if (nb > 0)
            {
                int nat1 = interaction_function[ftype].nratoms + 1;
                int nb0  = idef->il_thread_division[ftype*(nthread + 1) + thread];
                int nb1  = idef->il_thread_division[ftype*(nthread + 1) + thread + 1];

                for (int i = nb0; i < nb1; i += nat1)
                {
                    for (int a = 1; a < nat1; a++)
                    {
                        bitmask_set_bit(&mask[idef->il[ftype].iatoms[i+a] >> reduction_block_bits], thread);
                    }
                }
            }
        }
    }

    /* Make an index of the blocks our thread touches, so we can do fast
     * force buffer clearing.
     */
    f_thread->nblock_used = 0;
    for (int b = 0; b < nblock; b++)
    {
        if (bitmask_is_set(mask[b], thread))
        {
            f_thread->block_index[f_thread->nblock_used++] = b;
        }
    }
}
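
/* With these masks the force reduction over threads only needs to visit the
 * blocks that at least one thread touched and, within each block, only the
 * threads whose bit is set (see the union built in setup_bonded_threading()).
 */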

void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
{
    bonded_threading_t *bt   = fr->bonded_threading;
    int                 ctot = 0;

    assert(bt->nthreads >= 1);

    /* Divide the bonded interaction over the threads */
    divide_bondeds_over_threads(idef,
                                bt->nthreads,
                                bt->bonded_max_nthread_uniform);

    if (bt->nthreads == 1)
    {
        bt->nblock_used = 0;

        return;
    }

    /* Determine to which blocks each thread's bonded force calculation
     * contributes. Store this as a mask for each thread.
     */
#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
    for (int t = 0; t < bt->nthreads; t++)
    {
        try
        {
            /* Thread 0 writes directly into the main force array,
             * so it does not need a reduction mask.
             */
            if (t > 0)
            {
                calc_bonded_reduction_mask(fr->natoms_force, &bt->f_t[t],
                                           idef, t, bt->nthreads);
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    /* Reduce the masks over the threads and determine which blocks
     * we need to reduce over.
     */
    int nblock_tot = (fr->natoms_force + reduction_block_size - 1) >> reduction_block_bits;
    if (nblock_tot > bt->block_nalloc)
    {
        bt->block_nalloc = over_alloc_large(nblock_tot);
        srenew(bt->block_index, bt->block_nalloc);
        srenew(bt->mask,        bt->block_nalloc);
    }
    bt->nblock_used = 0;
    for (int b = 0; b < nblock_tot; b++)
    {
        gmx_bitmask_t *mask = &bt->mask[b];

        /* Generate the union over the threads of the bitmask */
        bitmask_clear(mask);
        for (int t = 1; t < bt->nthreads; t++)
        {
            bitmask_union(mask, bt->f_t[t].mask[b]);
        }
        if (!bitmask_is_zero(*mask))
        {
            bt->block_index[bt->nblock_used++] = b;
        }

        if (debug)
        {
            int c = 0;
            for (int t = 0; t < bt->nthreads; t++)
            {
                if (bitmask_is_set(*mask, t))
                {
                    c++;
                }
            }
            ctot += c;

            if (gmx_debug_at)
            {
#if BITMASK_SIZE <= 64 // move into bitmask when it is C++
                std::string flags = gmx::formatString("%x", *mask);
#else
                std::string flags = gmx::formatAndJoin(*mask,
                                                       *mask+BITMASK_ALEN,
                                                       "", gmx::StringFormatter("%x"));
#endif
                fprintf(debug, "block %d flags %s count %d\n",
                        b, flags.c_str(), c);
            }
        }
    }
    if (debug)
    {
        fprintf(debug, "Number of %d atom blocks to reduce: %d\n",
                reduction_block_size, bt->nblock_used);
        fprintf(debug, "Reduction density %.2f for touched blocks only %.2f\n",
                ctot*reduction_block_size/(double)fr->natoms_force,
                ctot/(double)bt->nblock_used);
    }
}

void init_bonded_threading(FILE *fplog, int nenergrp,
                           struct bonded_threading_t **bt_ptr)
{
    bonded_threading_t *bt;

    snew(bt, 1);

    /* These thread local data structures are used for bondeds only */
    bt->nthreads = gmx_omp_nthreads_get(emntBonded);

    if (bt->nthreads > 1)
    {
        int t;

        snew(bt->f_t, bt->nthreads);
#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
        for (t = 0; t < bt->nthreads; t++)
        {
            try
            {
                /* Thread 0 uses the global force and energy arrays */
                if (t > 0)
                {
                    int i;

                    bt->f_t[t].f          = NULL;
                    bt->f_t[t].f_nalloc   = 0;
                    snew(bt->f_t[t].fshift, SHIFTS);
                    bt->f_t[t].grpp.nener = nenergrp*nenergrp;
                    for (i = 0; i < egNR; i++)
                    {
                        snew(bt->f_t[t].grpp.ener[i], bt->f_t[t].grpp.nener);
                    }
                }
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
        }

        bt->nblock_used  = 0;
        bt->block_index  = NULL;
        bt->mask         = NULL;
        bt->block_nalloc = 0;
    }

    /* The optimal value after which to switch from uniform to localized
     * bonded interaction distribution is 3, 4 or 5 depending on the system
     * and hardware.
     */
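    /* Note: the getenv() below means this crossover can be tuned at run time,
     * e.g. launching mdrun with GMX_BONDED_NTHREAD_UNIFORM=8 set keeps the
     * uniform division on up to 8 OpenMP threads per rank.
     */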
    const int max_nthread_uniform = 4;
    char     *ptr;

    if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != NULL)
    {
        sscanf(ptr, "%d", &bt->bonded_max_nthread_uniform);
        if (fplog != NULL)
        {
            fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
                    bt->bonded_max_nthread_uniform);
        }
    }
    else
    {
        bt->bonded_max_nthread_uniform = max_nthread_uniform;
    }

    *bt_ptr = bt;
}