Optimized the bonded thread force reduction
[gromacs.git] / src / gromacs / listed-forces / manage-threading.cpp
blob: d2a39f8b3bb61eea049806c7ba164f0356941180
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief This file defines functions for managing threading of listed
 * interactions.
 *
 * \author Mark Abraham <mark.j.abraham@gmail.com>
 * \ingroup module_listed-forces
 */
#include "gmxpre.h"

#include "manage-threading.h"

#include <assert.h>
#include <limits.h>
#include <stdlib.h>

#include <algorithm>

#include "gromacs/gmxlib/gmx_omp_nthreads.h"
#include "gromacs/legacyheaders/types/ifunc.h"
#include "gromacs/listed-forces/listed-forces.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"

#include "listed-internal.h"
/*! \brief struct for passing all data required for a function type */
typedef struct {
    int       ftype; /**< the function type index */
    t_ilist  *il;    /**< pointer to t_ilist entry corresponding to ftype */
    int       nat;   /**< nr of atoms involved in a single ftype interaction */
} ilist_data_t;
/*! \brief Divides listed interactions over threads
 *
 * This routine attempts to divide all interactions of the ntype bonded
 * types stored in ild over the threads such that each thread has roughly
 * equal load and different threads avoid touching the same atoms as much
 * as possible.
 */
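/* The resulting division is stored in idef->il_thread_division: for each
 * ftype that array holds nthread+1 boundaries into il->iatoms, so thread t
 * computes the interactions in the half-open range
 * [il_thread_division[ftype*(nthread+1) + t],
 *  il_thread_division[ftype*(nthread+1) + t + 1]).
 */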
static void divide_bondeds_by_locality(int                 ntype,
                                       const ilist_data_t *ild,
                                       int                 nthread,
                                       t_idef             *idef)
{
    int nat_tot, nat_sum;
    int ind[F_NRE];    /* index into the ild[].il->iatoms */
    int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
    int f, t;

    assert(ntype <= F_NRE);

    nat_tot = 0;
    for (f = 0; f < ntype; f++)
    {
        /* Sum #bondeds*#atoms_per_bond over all bonded types */
        nat_tot  += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;
        /* The start bound for thread 0 is 0 for all interactions */
        ind[f]    = 0;
        /* Initialize the next atom index array */
        assert(ild[f].il->nr > 0);
        at_ind[f] = ild[f].il->iatoms[1];
    }

    nat_sum = 0;
    /* Loop over the end bounds of the nthread threads to determine
     * which interactions threads 0 to nthread shall calculate.
     *
     * NOTE: The cost of these combined loops is #interactions*ntype.
     * This code runs single threaded (it is difficult to parallelize
     * over threads), so its relative cost increases linearly with the
     * number of threads. Since the inner-most loop is cheap and this is
     * done only at DD repartitioning, the cost should be negligible.
     * At high thread count many other parts of the code scale the same
     * way, so it's (currently) not worth improving this.
     */
    for (t = 1; t <= nthread; t++)
    {
        int nat_thread;

        /* Here we assume that the computational cost is proportional
         * to the number of atoms in the interaction. This is a rough
         * measure, but roughly correct. Usually there are very few
         * interactions anyhow and they are distributed relatively
         * uniformly. Proper and RB dihedrals are often distributed
         * non-uniformly, but their cost is roughly equal.
         */
        nat_thread = (nat_tot*t)/nthread;

        while (nat_sum < nat_thread)
        {
            /* To divide bonds based on atom order, we compare
             * the index of the first atom in the bonded interaction.
             * This works well, since the domain decomposition generates
             * bondeds in order of the atoms by looking up interactions
             * which are linked to the first atom in each interaction.
             * It usually also works well without DD, since then the atoms
             * in bonded interactions are usually in increasing order.
             * If they are not assigned in increasing order, the balancing
             * is still good, but the memory access and reduction cost will
             * be higher.
             */
            int f_min;

            /* Find out which of the types has the lowest atom index */
            f_min = 0;
            for (f = 1; f < ntype; f++)
            {
                if (at_ind[f] < at_ind[f_min])
                {
                    f_min = f;
                }
            }
            assert(f_min >= 0 && f_min < ntype);

            /* Assign the interaction with the lowest atom index (of type
             * index f_min) to thread t-1 by increasing ind.
             */
            ind[f_min] += ild[f_min].nat + 1;
            nat_sum    += ild[f_min].nat;

            /* Update the first unassigned atom index for this type */
            if (ind[f_min] < ild[f_min].il->nr)
            {
                at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
            }
            else
            {
                /* We have assigned all interactions of this type.
                 * Setting at_ind to INT_MAX ensures this type will not be
                 * chosen in the for loop above during next iterations.
                 */
                at_ind[f_min] = INT_MAX;
            }
        }

        /* Store the bonded end boundaries (at index t) for thread t-1 */
        for (f = 0; f < ntype; f++)
        {
            idef->il_thread_division[ild[f].ftype*(nthread + 1) + t] = ind[f];
        }
    }

    for (f = 0; f < ntype; f++)
    {
        assert(ind[f] == ild[f].il->nr);
    }
}

//! Divides bonded interactions over threads
static void divide_bondeds_over_threads(t_idef *idef,
                                        int     nthread,
                                        int     max_nthread_uniform)
{
    ilist_data_t ild[F_NRE];
    int          ntype;
    int          f;

    assert(nthread > 0);

    idef->nthreads = nthread;

    if (F_NRE*(nthread + 1) > idef->il_thread_division_nalloc)
    {
        idef->il_thread_division_nalloc = F_NRE*(nthread + 1);
        snew(idef->il_thread_division, idef->il_thread_division_nalloc);
    }

    ntype = 0;
    for (f = 0; f < F_NRE; f++)
    {
        if (!ftype_is_bonded_potential(f))
        {
            continue;
        }

        if (idef->il[f].nr == 0)
        {
            /* No interactions, avoid all the integer math below */
            int t;

            for (t = 0; t <= nthread; t++)
            {
                idef->il_thread_division[f*(nthread + 1) + t] = 0;
            }
        }
        else if (nthread <= max_nthread_uniform || f == F_DISRES)
        {
            /* With few threads (at most max_nthread_uniform), load
             * balancing the bonded work is more important than
             * minimizing the reduction cost.
             */
            int nat1, t;

            /* nat1 = 1 + #atoms(ftype), which is the stride used for iatoms */
            nat1 = 1 + NRAL(f);

            for (t = 0; t <= nthread; t++)
            {
                int nr_t;

                /* Divide equally over the threads */
                nr_t = (((idef->il[f].nr/nat1)*t)/nthread)*nat1;
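                /* Example: 10 angles give il->nr = 40 with nat1 = 4, so on
                 * 4 threads the boundaries become 0, 8, 20, 28 and 40 iatoms
                 * entries, i.e. 2, 3, 2 and 3 interactions per thread.
                 */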

                if (f == F_DISRES)
                {
                    /* Ensure that distance restraint pairs with the same
                     * label end up on the same thread: pairs sharing a label
                     * belong to a single restraint and are evaluated together.
                     */
                    while (nr_t > 0 && nr_t < idef->il[f].nr &&
                           idef->iparams[idef->il[f].iatoms[nr_t]].disres.label ==
                           idef->iparams[idef->il[f].iatoms[nr_t-nat1]].disres.label)
                    {
                        nr_t += nat1;
                    }
                }

                idef->il_thread_division[f*(nthread + 1) + t] = nr_t;
            }
        }
        else
        {
            /* Add this ftype to the list to be distributed */
            int nat;

            nat              = NRAL(f);
            ild[ntype].ftype = f;
            ild[ntype].il    = &idef->il[f];
            ild[ntype].nat   = nat;

            /* The first index for the thread division is always 0 */
            idef->il_thread_division[f*(nthread + 1)] = 0;

            ntype++;
        }
    }

    if (ntype > 0)
    {
        divide_bondeds_by_locality(ntype, ild, nthread, idef);
    }

    if (debug)
    {
        int f;

        fprintf(debug, "Division of bondeds over threads:\n");
        for (f = 0; f < F_NRE; f++)
        {
            if (ftype_is_bonded_potential(f) && idef->il[f].nr > 0)
            {
                int t;

                fprintf(debug, "%16s", interaction_function[f].name);
                for (t = 0; t < nthread; t++)
                {
                    fprintf(debug, " %4d",
                            (idef->il_thread_division[f*(nthread + 1) + t + 1] -
                             idef->il_thread_division[f*(nthread + 1) + t])/
                            (1 + NRAL(f)));
                }
                fprintf(debug, "\n");
            }
        }
    }
}
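
/* Note: the locality-based division above keeps each thread's atoms
 * clustered, which is what keeps the per-thread reduction masks computed
 * below sparse and the thread force reduction cheap.
 */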

//! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
static void
calc_bonded_reduction_mask(int natoms,
                           f_thread_t *f_thread,
                           const t_idef *idef,
                           int thread, int nthread)
{
    assert(nthread <= BITMASK_SIZE);
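
    /* For every block of reduction_block_size atoms, mask[b] gets bit
     * 'thread' set when any interaction assigned to this thread writes a
     * force to an atom in block b. The block_index list then records which
     * blocks this thread touches at all.
     */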

    int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;

    if (nblock > f_thread->block_nalloc)
    {
        f_thread->block_nalloc = over_alloc_large(nblock);
        srenew(f_thread->mask,        f_thread->block_nalloc);
        srenew(f_thread->block_index, f_thread->block_nalloc);
        srenew(f_thread->f,           f_thread->block_nalloc*reduction_block_size);
    }

    gmx_bitmask_t *mask = f_thread->mask;

    for (int b = 0; b < nblock; b++)
    {
        bitmask_clear(&mask[b]);
    }

    for (int ftype = 0; ftype < F_NRE; ftype++)
    {
        if (ftype_is_bonded_potential(ftype))
        {
            int nb = idef->il[ftype].nr;
            if (nb > 0)
            {
                int nat1 = interaction_function[ftype].nratoms + 1;
                int nb0  = idef->il_thread_division[ftype*(nthread + 1) + thread];
                int nb1  = idef->il_thread_division[ftype*(nthread + 1) + thread + 1];

                for (int i = nb0; i < nb1; i += nat1)
                {
                    for (int a = 1; a < nat1; a++)
                    {
                        bitmask_set_bit(&mask[idef->il[ftype].iatoms[i+a] >> reduction_block_bits], thread);
                    }
                }
            }
        }
    }

    /* Make an index of the blocks our thread touches, so we can do fast
     * force buffer clearing.
     */
    f_thread->nblock_used = 0;
    for (int b = 0; b < nblock; b++)
    {
        if (bitmask_is_set(mask[b], thread))
        {
            f_thread->block_index[f_thread->nblock_used++] = b;
        }
    }
}
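
/* With these masks the force reduction over threads only needs to visit the
 * blocks that at least one thread touched and, within each block, only the
 * threads whose bit is set (see the union built in setup_bonded_threading()).
 */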

void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
{
    bonded_threading_t *bt   = fr->bonded_threading;
    int                 ctot = 0;

    assert(bt->nthreads >= 1);

    /* Divide the bonded interaction over the threads */
    divide_bondeds_over_threads(idef,
                                bt->nthreads,
                                bt->bonded_max_nthread_uniform);

    if (bt->nthreads == 1)
    {
        bt->nblock_used = 0;

        return;
    }

    /* Determine to which blocks each thread's bonded force calculation
     * contributes. Store this as a mask for each thread.
     */
#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
    for (int t = 0; t < bt->nthreads; t++)
    {
        try
        {
            /* Thread 0 writes directly into the main force array,
             * so it does not need a reduction mask.
             */
            if (t > 0)
            {
                calc_bonded_reduction_mask(fr->natoms_force, &bt->f_t[t],
                                           idef, t, bt->nthreads);
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    /* Reduce the masks over the threads and determine which blocks
     * we need to reduce over.
     */
    int nblock_tot = (fr->natoms_force + reduction_block_size - 1) >> reduction_block_bits;
    if (nblock_tot > bt->block_nalloc)
    {
        bt->block_nalloc = over_alloc_large(nblock_tot);
        srenew(bt->block_index, bt->block_nalloc);
        srenew(bt->mask,        bt->block_nalloc);
    }
    bt->nblock_used = 0;
    for (int b = 0; b < nblock_tot; b++)
    {
        gmx_bitmask_t *mask = &bt->mask[b];

        /* Generate the union over the threads of the bitmask */
        bitmask_clear(mask);
        for (int t = 1; t < bt->nthreads; t++)
        {
            bitmask_union(mask, bt->f_t[t].mask[b]);
        }
        if (!bitmask_is_zero(*mask))
        {
            bt->block_index[bt->nblock_used++] = b;
        }

        if (debug)
        {
            int c = 0;
            for (int t = 0; t < bt->nthreads; t++)
            {
                if (bitmask_is_set(*mask, t))
                {
                    c++;
                }
            }
            ctot += c;

            if (gmx_debug_at)
            {
#if BITMASK_SIZE <= 64 // move into bitmask when it is C++
                std::string flags = gmx::formatString("%x", *mask);
#else
                std::string flags = gmx::formatAndJoin(*mask,
                                                       *mask+BITMASK_ALEN,
                                                       "", gmx::StringFormatter("%x"));
#endif
                fprintf(debug, "block %d flags %s count %d\n",
                        b, flags.c_str(), c);
            }
        }
    }
    if (debug)
    {
        fprintf(debug, "Number of %d atom blocks to reduce: %d\n",
                reduction_block_size, bt->nblock_used);
        fprintf(debug, "Reduction density %.2f for touched blocks only %.2f\n",
                ctot*reduction_block_size/(double)fr->natoms_force,
                ctot/(double)bt->nblock_used);
    }
}

void init_bonded_threading(FILE *fplog, int nenergrp,
                           struct bonded_threading_t **bt_ptr)
{
    bonded_threading_t *bt;

    snew(bt, 1);

    /* These thread local data structures are used for bondeds only */
    bt->nthreads = gmx_omp_nthreads_get(emntBonded);

    if (bt->nthreads > 1)
    {
        int t;

        snew(bt->f_t, bt->nthreads);
#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
        for (t = 0; t < bt->nthreads; t++)
        {
            try
            {
                /* Thread 0 uses the global force and energy arrays */
                if (t > 0)
                {
                    int i;

                    bt->f_t[t].f          = NULL;
                    bt->f_t[t].f_nalloc   = 0;
                    snew(bt->f_t[t].fshift, SHIFTS);
                    bt->f_t[t].grpp.nener = nenergrp*nenergrp;
                    for (i = 0; i < egNR; i++)
                    {
                        snew(bt->f_t[t].grpp.ener[i], bt->f_t[t].grpp.nener);
                    }
                }
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
        }

        bt->nblock_used  = 0;
        bt->block_index  = NULL;
        bt->mask         = NULL;
        bt->block_nalloc = 0;
    }

    /* The optimal value after which to switch from uniform to localized
     * bonded interaction distribution is 3, 4 or 5 depending on the system
     * and hardware.
     */
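    /* Note: the getenv() below means this crossover can be tuned at run time,
     * e.g. launching mdrun with GMX_BONDED_NTHREAD_UNIFORM=8 set keeps the
     * uniform division on up to 8 OpenMP threads per rank.
     */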
    const int max_nthread_uniform = 4;
    char     *ptr;

    if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != NULL)
    {
        sscanf(ptr, "%d", &bt->bonded_max_nthread_uniform);
        if (fplog != NULL)
        {
            fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
                    bt->bonded_max_nthread_uniform);
        }
    }
    else
    {
        bt->bonded_max_nthread_uniform = max_nthread_uniform;
    }

    *bt_ptr = bt;
}