Remove group scheme search code
[gromacs.git] / src / gromacs / domdec / partition.cpp
blob 6c38a422361874f32e808d4c83cb31023a6e4dc6
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 /*! \internal \file
37 * \brief This file defines functions for mdrun to call to make a new
38 * domain decomposition, and check it.
40 * \author Berk Hess <hess@kth.se>
41 * \ingroup module_domdec
44 #include "gmxpre.h"
46 #include "partition.h"
48 #include "config.h"
50 #include <cassert>
51 #include <cstdio>
53 #include <algorithm>
55 #include "gromacs/domdec/collect.h"
56 #include "gromacs/domdec/dlb.h"
57 #include "gromacs/domdec/dlbtiming.h"
58 #include "gromacs/domdec/domdec.h"
59 #include "gromacs/domdec/domdec_network.h"
60 #include "gromacs/domdec/ga2la.h"
61 #include "gromacs/domdec/localatomsetmanager.h"
62 #include "gromacs/domdec/mdsetup.h"
63 #include "gromacs/ewald/pme.h"
64 #include "gromacs/gmxlib/chargegroup.h"
65 #include "gromacs/gmxlib/network.h"
66 #include "gromacs/gmxlib/nrnb.h"
67 #include "gromacs/imd/imd.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/vec.h"
70 #include "gromacs/mdlib/forcerec.h"
71 #include "gromacs/mdlib/gmx_omp_nthreads.h"
72 #include "gromacs/mdlib/mdatoms.h"
73 #include "gromacs/mdlib/nsgrid.h"
74 #include "gromacs/mdlib/vsite.h"
75 #include "gromacs/mdtypes/commrec.h"
76 #include "gromacs/mdtypes/forcerec.h"
77 #include "gromacs/mdtypes/inputrec.h"
78 #include "gromacs/mdtypes/md_enums.h"
79 #include "gromacs/mdtypes/nblist.h"
80 #include "gromacs/mdtypes/state.h"
81 #include "gromacs/nbnxm/nbnxm.h"
82 #include "gromacs/pulling/pull.h"
83 #include "gromacs/timing/wallcycle.h"
84 #include "gromacs/topology/mtop_util.h"
85 #include "gromacs/topology/topology.h"
86 #include "gromacs/utility/cstringutil.h"
87 #include "gromacs/utility/fatalerror.h"
88 #include "gromacs/utility/logger.h"
89 #include "gromacs/utility/real.h"
90 #include "gromacs/utility/smalloc.h"
91 #include "gromacs/utility/strconvert.h"
92 #include "gromacs/utility/stringstream.h"
93 #include "gromacs/utility/stringutil.h"
94 #include "gromacs/utility/textwriter.h"
96 #include "box.h"
97 #include "cellsizes.h"
98 #include "distribute.h"
99 #include "domdec_constraints.h"
100 #include "domdec_internal.h"
101 #include "domdec_vsite.h"
102 #include "dump.h"
103 #include "redistribute.h"
104 #include "utility.h"
106 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
108 * There is a bit of overhead with DLB and it's difficult to achieve
109 * a load imbalance of less than 2% with DLB.
111 #define DD_PERF_LOSS_DLB_ON 0.02
113 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
114 #define DD_PERF_LOSS_WARN 0.05
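/* Illustration (hypothetical numbers, not from the source): with 4 PP ranks,
 * a summed force load of 4.0, a maximum per-rank force load of 1.05 and a
 * step load of 1.25 per rank, dd_force_imb_perf_loss() below estimates a loss
 * of (1.05*4 - 4.0)/(1.25*4) = 0.04, i.e. 4% of the run time: enough to
 * consider turning DLB on (> 0.02) but below the warning threshold (0.05).
 */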
117 //! Debug helper printing a DD zone
118 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
120 fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
121 d, i, j,
122 zone->min0, zone->max1,
123 zone->mch0, zone->mch1,
124 zone->p1_0, zone->p1_1);
127 /*! \brief Takes the home grid size as input in cell_ns_x0 and cell_ns_x1,
128 * determines the extremes over all home and remote zones in the halo
129 * and returns the results in cell_ns_x0 and cell_ns_x1.
130 * Note: only used with the group cut-off scheme.
132 static void dd_move_cellx(gmx_domdec_t *dd,
133 const gmx_ddbox_t *ddbox,
134 rvec cell_ns_x0,
135 rvec cell_ns_x1)
137 constexpr int c_ddZoneCommMaxNumZones = 5;
138 gmx_ddzone_t buf_s[c_ddZoneCommMaxNumZones];
139 gmx_ddzone_t buf_r[c_ddZoneCommMaxNumZones];
140 gmx_ddzone_t buf_e[c_ddZoneCommMaxNumZones];
141 gmx_domdec_comm_t *comm = dd->comm;
143 rvec extr_s[2];
144 rvec extr_r[2];
145 for (int d = 1; d < dd->ndim; d++)
147 int dim = dd->dim[d];
148 gmx_ddzone_t &zp = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
150 /* Copy the base sizes of the home zone */
151 zp.min0 = cell_ns_x0[dim];
152 zp.max1 = cell_ns_x1[dim];
153 zp.min1 = cell_ns_x1[dim];
154 zp.mch0 = cell_ns_x0[dim];
155 zp.mch1 = cell_ns_x1[dim];
156 zp.p1_0 = cell_ns_x0[dim];
157 zp.p1_1 = cell_ns_x1[dim];
158 zp.dataSet = 1;
161 gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
163 /* Loop backward over the dimensions and aggregate the extremes
164 * of the cell sizes.
166 for (int d = dd->ndim - 2; d >= 0; d--)
168 const int dim = dd->dim[d];
169 const bool applyPbc = (dim < ddbox->npbcdim);
171 /* Use an rvec to store three reals */
172 extr_s[d][0] = cellsizes[d + 1].fracLower;
173 extr_s[d][1] = cellsizes[d + 1].fracUpper;
174 extr_s[d][2] = cellsizes[d + 1].fracUpper;
176 int pos = 0;
177 GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
178 /* Store the extremes in the backward sending buffer,
179 * so they get updated separately from the forward communication.
181 for (int d1 = d; d1 < dd->ndim-1; d1++)
183 gmx_ddzone_t &buf = buf_s[pos];
185 /* We invert the order to be able to use the same loop for buf_e */
186 buf.min0 = extr_s[d1][1];
187 buf.max1 = extr_s[d1][0];
188 buf.min1 = extr_s[d1][2];
189 buf.mch0 = 0;
190 buf.mch1 = 0;
191 /* Store the cell corner of the dimension we communicate along */
192 buf.p1_0 = comm->cell_x0[dim];
193 buf.p1_1 = 0;
194 buf.dataSet = 1;
195 pos++;
198 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
199 pos++;
201 if (dd->ndim == 3 && d == 0)
203 buf_s[pos] = comm->zone_d2[0][1];
204 pos++;
205 buf_s[pos] = comm->zone_d1[0];
206 pos++;
209 /* We only need to communicate the extremes
210 * in the forward direction
212 int numPulses = comm->cd[d].numPulses();
213 int numPulsesMin;
214 if (applyPbc)
216 /* Take the minimum to avoid double communication */
217 numPulsesMin = std::min(numPulses, dd->nc[dim] - 1 - numPulses);
219 else
221 /* Without PBC we should really not communicate over
222 * the boundaries, but implementing that complicates
223 * the communication setup and therefore we simply
224 * do all communication, but ignore some data.
226 numPulsesMin = numPulses;
228 for (int pulse = 0; pulse < numPulsesMin; pulse++)
230 /* Communicate the extremes forward */
231 bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
233 int numElements = dd->ndim - d - 1;
234 ddSendrecv(dd, d, dddirForward,
235 extr_s + d, numElements,
236 extr_r + d, numElements);
238 if (receiveValidData)
240 for (int d1 = d; d1 < dd->ndim - 1; d1++)
242 extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
243 extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
244 extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
249 const int numElementsInBuffer = pos;
250 for (int pulse = 0; pulse < numPulses; pulse++)
252 /* Communicate all the zone information backward */
253 bool receiveValidData = (applyPbc || dd->ci[dim] < dd->nc[dim] - 1);
255 static_assert(sizeof(gmx_ddzone_t) == c_ddzoneNumReals*sizeof(real), "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
257 int numReals = numElementsInBuffer*c_ddzoneNumReals;
258 ddSendrecv(dd, d, dddirBackward,
259 gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
260 gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
262 rvec dh = { 0 };
263 if (pulse > 0)
265 for (int d1 = d + 1; d1 < dd->ndim; d1++)
267 /* Determine the decrease of maximum required
268 * communication height along d1 due to the distance along d,
269 * this avoids a lot of useless atom communication.
271 real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
273 real c;
274 if (ddbox->tric_dir[dim])
276 /* c is the off-diagonal coupling between the cell planes
277 * along directions d and d1.
279 c = ddbox->v[dim][dd->dim[d1]][dim];
281 else
283 c = 0;
285 real det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
286 if (det > 0)
288 dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
290 else
292 /* A negative value signals out of range */
293 dh[d1] = -1;
298 /* Accumulate the extremes over all pulses */
299 for (int i = 0; i < numElementsInBuffer; i++)
301 if (pulse == 0)
303 buf_e[i] = buf_r[i];
305 else
307 if (receiveValidData)
309 buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
310 buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
311 buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
314 int d1;
315 if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
317 d1 = 1;
319 else
321 d1 = d + 1;
323 if (receiveValidData && dh[d1] >= 0)
325 buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
326 buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
329 /* Copy the received buffer to the send buffer,
330 * to pass the data through with the next pulse.
332 buf_s[i] = buf_r[i];
334 if (((applyPbc || dd->ci[dim] + numPulses < dd->nc[dim]) && pulse == numPulses - 1) ||
335 (!applyPbc && dd->ci[dim] + 1 + pulse == dd->nc[dim] - 1))
337 /* Store the extremes */
338 int pos = 0;
340 for (int d1 = d; d1 < dd->ndim-1; d1++)
342 extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
343 extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
344 extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
345 pos++;
348 if (d == 1 || (d == 0 && dd->ndim == 3))
350 for (int i = d; i < 2; i++)
352 comm->zone_d2[1-d][i] = buf_e[pos];
353 pos++;
356 if (d == 0)
358 comm->zone_d1[1] = buf_e[pos];
359 pos++;
362 else
364 if (d == 1 || (d == 0 && dd->ndim == 3))
366 for (int i = d; i < 2; i++)
368 comm->zone_d2[1 - d][i].dataSet = 0;
371 if (d == 0)
373 comm->zone_d1[1].dataSet = 0;
379 if (dd->ndim >= 2)
381 int dim = dd->dim[1];
382 for (int i = 0; i < 2; i++)
384 if (comm->zone_d1[i].dataSet != 0)
386 if (debug)
388 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
390 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
391 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
395 if (dd->ndim >= 3)
397 int dim = dd->dim[2];
398 for (int i = 0; i < 2; i++)
400 for (int j = 0; j < 2; j++)
402 if (comm->zone_d2[i][j].dataSet != 0)
404 if (debug)
406 print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
408 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
409 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
414 for (int d = 1; d < dd->ndim; d++)
416 cellsizes[d].fracLowerMax = extr_s[d-1][0];
417 cellsizes[d].fracUpperMin = extr_s[d-1][1];
418 if (debug)
420 fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
421 d, cellsizes[d].fracLowerMax, cellsizes[d].fracUpperMin);
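/* Note on the dh[d1] reduction computed above (reconstructed from the code;
 * not part of the original comments): with off-diagonal coupling c and a
 * distance dist_d already covered along dim, the largest communication height
 * h along d1 that can still reach atoms within the cut-off is the positive
 * root of (1 + c^2)*h^2 - 2*c*dist_d*h + dist_d^2 = cutoff^2, i.e.
 *   h = (c*dist_d + sqrt((1 + c^2)*cutoff^2 - dist_d^2))/(1 + c^2),
 * and dh = cutoff - h is how much less height needs to be communicated.
 * A negative determinant means no height along d1 is within range, which the
 * code signals with dh[d1] = -1.
 */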
426 //! Sets the charge-group zones to be equal to the home zone.
427 static void set_zones_ncg_home(gmx_domdec_t *dd)
429 gmx_domdec_zones_t *zones;
430 int i;
432 zones = &dd->comm->zones;
434 zones->cg_range[0] = 0;
435 for (i = 1; i < zones->n+1; i++)
437 zones->cg_range[i] = dd->ncg_home;
439 /* zone_ncg1[0] should always be equal to ncg_home */
440 dd->comm->zone_ncg1[0] = dd->ncg_home;
443 //! Restore atom groups for the charge groups.
444 static void restoreAtomGroups(gmx_domdec_t *dd,
445 const int *gcgs_index, const t_state *state)
447 gmx::ArrayRef<const int> atomGroupsState = state->cg_gl;
449 std::vector<int> &globalAtomGroupIndices = dd->globalAtomGroupIndices;
450 gmx::RangePartitioning &atomGrouping = dd->atomGrouping_;
452 globalAtomGroupIndices.resize(atomGroupsState.size());
453 atomGrouping.clear();
455 /* Copy back the global charge group indices from state
456 * and rebuild the local charge group to atom index.
458 for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
460 const int atomGroupGlobal = atomGroupsState[i];
461 const int groupSize = gcgs_index[atomGroupGlobal + 1] - gcgs_index[atomGroupGlobal];
462 globalAtomGroupIndices[i] = atomGroupGlobal;
463 atomGrouping.appendBlock(groupSize);
466 dd->ncg_home = atomGroupsState.size();
467 dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGrouping.fullRange().end());
469 set_zones_ncg_home(dd);
472 //! Sets the cginfo structures.
473 static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1,
474 t_forcerec *fr, char *bLocalCG)
476 cginfo_mb_t *cginfo_mb;
477 int *cginfo;
478 int cg;
480 if (fr != nullptr)
482 cginfo_mb = fr->cginfo_mb;
483 cginfo = fr->cginfo;
485 for (cg = cg0; cg < cg1; cg++)
487 cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
491 if (bLocalCG != nullptr)
493 for (cg = cg0; cg < cg1; cg++)
495 bLocalCG[index_gl[cg]] = TRUE;
500 //! Makes the mappings between global and local atom indices during DD repartitioning.
501 static void make_dd_indices(gmx_domdec_t *dd,
502 const int *gcgs_index, int cg_start)
504 const int numZones = dd->comm->zones.n;
505 const int *zone2cg = dd->comm->zones.cg_range;
506 const int *zone_ncg1 = dd->comm->zone_ncg1;
507 gmx::ArrayRef<const int> globalAtomGroupIndices = dd->globalAtomGroupIndices;
508 const gmx_bool bCGs = dd->comm->bCGs;
510 std::vector<int> &globalAtomIndices = dd->globalAtomIndices;
511 gmx_ga2la_t &ga2la = *dd->ga2la;
513 if (zone2cg[1] != dd->ncg_home)
515 gmx_incons("dd->ncg_zone is not up to date");
518 /* Make the local to global and global to local atom index */
519 int a = dd->atomGrouping().subRange(cg_start, cg_start).begin();
520 globalAtomIndices.resize(a);
521 for (int zone = 0; zone < numZones; zone++)
523 int cg0;
524 if (zone == 0)
526 cg0 = cg_start;
528 else
530 cg0 = zone2cg[zone];
532 int cg1 = zone2cg[zone+1];
533 int cg1_p1 = cg0 + zone_ncg1[zone];
535 for (int cg = cg0; cg < cg1; cg++)
537 int zone1 = zone;
538 if (cg >= cg1_p1)
540 /* Signal that this cg is from more than one pulse away */
541 zone1 += numZones;
543 int cg_gl = globalAtomGroupIndices[cg];
544 if (bCGs)
546 for (int a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
548 globalAtomIndices.push_back(a_gl);
549 ga2la.insert(a_gl, {a, zone1});
550 a++;
553 else
555 globalAtomIndices.push_back(cg_gl);
556 ga2la.insert(cg_gl, {a, zone1});
557 a++;
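/* Illustration (hypothetical values): with numZones = 8 (3D decomposition),
 * an atom group in zone 3 that arrived after the first communication pulse is
 * stored in ga2la with zone value 3 + 8 = 11; the zone1 += numZones above is
 * how "more than one pulse away" is encoded in the global-to-local lookup.
 */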
563 //! Checks the charge-group assignments.
564 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
565 const char *where)
567 int nerr = 0;
568 if (bLocalCG == nullptr)
570 return nerr;
572 for (size_t i = 0; i < dd->globalAtomGroupIndices.size(); i++)
574 if (!bLocalCG[dd->globalAtomGroupIndices[i]])
576 fprintf(stderr,
577 "DD rank %d, %s: atom group %zu, global atom group %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i + 1, dd->globalAtomGroupIndices[i] + 1, dd->ncg_home);
578 nerr++;
581 size_t ngl = 0;
582 for (int i = 0; i < ncg_sys; i++)
584 if (bLocalCG[i])
586 ngl++;
589 if (ngl != dd->globalAtomGroupIndices.size())
591 fprintf(stderr, "DD rank %d, %s: In bLocalCG %zu atom groups are marked as local, whereas there are %zu\n", dd->rank, where, ngl, dd->globalAtomGroupIndices.size());
592 nerr++;
595 return nerr;
598 //! Checks whether global and local atom indices are consistent.
599 static void check_index_consistency(gmx_domdec_t *dd,
600 int natoms_sys, int ncg_sys,
601 const char *where)
603 int nerr = 0;
605 const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
607 if (dd->comm->DD_debug > 1)
609 std::vector<int> have(natoms_sys);
610 for (int a = 0; a < numAtomsInZones; a++)
612 int globalAtomIndex = dd->globalAtomIndices[a];
613 if (have[globalAtomIndex] > 0)
615 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, globalAtomIndex + 1, have[globalAtomIndex], a+1);
617 else
619 have[globalAtomIndex] = a + 1;
624 std::vector<int> have(numAtomsInZones);
626 int ngl = 0;
627 for (int i = 0; i < natoms_sys; i++)
629 if (const auto entry = dd->ga2la->find(i))
631 const int a = entry->la;
632 if (a >= numAtomsInZones)
634 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, numAtomsInZones);
635 nerr++;
637 else
639 have[a] = 1;
640 if (dd->globalAtomIndices[a] != i)
642 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->globalAtomIndices[a]+1);
643 nerr++;
646 ngl++;
649 if (ngl != numAtomsInZones)
651 fprintf(stderr,
652 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
653 dd->rank, where, ngl, numAtomsInZones);
655 for (int a = 0; a < numAtomsInZones; a++)
657 if (have[a] == 0)
659 fprintf(stderr,
660 "DD rank %d, %s: local atom %d, global %d has no global index\n",
661 dd->rank, where, a + 1, dd->globalAtomIndices[a] + 1);
665 nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
667 if (nerr > 0)
669 gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies",
670 dd->rank, where, nerr);
674 //! Clear all DD global state indices, starting from \p atomGroupStart and \p atomStart
675 static void clearDDStateIndices(gmx_domdec_t *dd,
676 int atomGroupStart,
677 int atomStart)
679 gmx_ga2la_t &ga2la = *dd->ga2la;
681 if (atomStart == 0)
683 /* Clear the whole list without the overhead of searching */
684 ga2la.clear();
686 else
688 const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
689 for (int i = 0; i < numAtomsInZones; i++)
691 ga2la.erase(dd->globalAtomIndices[i]);
695 char *bLocalCG = dd->comm->bLocalCG;
696 if (bLocalCG)
698 for (size_t atomGroup = atomGroupStart; atomGroup < dd->globalAtomGroupIndices.size(); atomGroup++)
700 bLocalCG[dd->globalAtomGroupIndices[atomGroup]] = FALSE;
704 dd_clear_local_vsite_indices(dd);
706 if (dd->constraints)
708 dd_clear_local_constraint_indices(dd);
712 bool check_grid_jump(int64_t step,
713 const gmx_domdec_t *dd,
714 real cutoff,
715 const gmx_ddbox_t *ddbox,
716 gmx_bool bFatal)
718 gmx_domdec_comm_t *comm = dd->comm;
719 bool invalid = false;
721 for (int d = 1; d < dd->ndim; d++)
723 const DDCellsizesWithDlb &cellsizes = comm->cellsizesWithDlb[d];
724 const int dim = dd->dim[d];
725 const real limit = grid_jump_limit(comm, cutoff, d);
726 real bfac = ddbox->box_size[dim];
727 if (ddbox->tric_dir[dim])
729 bfac *= ddbox->skew_fac[dim];
731 if ((cellsizes.fracUpper - cellsizes.fracLowerMax)*bfac < limit ||
732 (cellsizes.fracLower - cellsizes.fracUpperMin)*bfac > -limit)
734 invalid = true;
736 if (bFatal)
738 char buf[22];
740 /* This error should never be triggered under normal
741 * circumstances, but you never know ...
743 gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
744 gmx_step_str(step, buf),
745 dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
750 return invalid;
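/* Illustration (hypothetical values): with box_size[dim] = 10 nm and no
 * triclinic skew, fracUpper = 0.52 for this cell and fracLowerMax = 0.45 for
 * the staggered neighboring cells leaves an overlap margin of
 * (0.52 - 0.45)*10 nm = 0.7 nm; the check above only reports (or, with
 * bFatal, aborts on) a grid jump when such a margin drops below the limit
 * returned by grid_jump_limit(), which is not shown in this excerpt.
 */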
752 //! Return the duration of force calculations on this rank.
753 static float dd_force_load(gmx_domdec_comm_t *comm)
755 float load;
757 if (comm->eFlop)
759 load = comm->flop;
760 if (comm->eFlop > 1)
762 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
765 else
767 load = comm->cycl[ddCyclF];
768 if (comm->cycl_n[ddCyclF] > 1)
770 /* Subtract the maximum of the last n cycle counts
771 * to get rid of possible high counts due to other sources,
772 * for instance system activity, that would otherwise
773 * affect the dynamic load balancing.
775 load -= comm->cycl_max[ddCyclF];
778 #if GMX_MPI
779 if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
781 float gpu_wait, gpu_wait_sum;
783 gpu_wait = comm->cycl[ddCyclWaitGPU];
784 if (comm->cycl_n[ddCyclF] > 1)
786 /* We should remove the WaitGPU time of the same MD step
787 * as the one with the maximum F time, since the F time
788 * and the wait time are not independent.
789 * Furthermore, the step for the max F time should be chosen
790 * the same on all ranks that share the same GPU.
791 * But to keep the code simple, we remove the average instead.
792 * The main reason for artificially long times at some steps
793 * is spurious CPU activity or MPI time, so we don't expect
794 * that changes in the GPU wait time matter a lot here.
796 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/static_cast<float>(comm->cycl_n[ddCyclF]);
798 /* Sum the wait times over the ranks that share the same GPU */
799 MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
800 comm->mpi_comm_gpu_shared);
801 /* Replace the wait time by the average over the ranks */
802 load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
804 #endif
807 return load;
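/* Illustration of the GPU-wait correction above (hypothetical cycle counts):
 * with cycl_n[ddCyclF] = 10 and a summed wait of 100 cycles, the wait is
 * first scaled by (10 - 1)/10 to 90 to remove roughly one step's worth (the
 * average, as explained in the comment above). If two ranks share the GPU and
 * report 90 and 30, gpu_wait_sum = 120 and each rank adds the average 60
 * instead of its own value, so this rank's load changes by -90 + 60 = -30.
 */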
810 //! Runs cell size checks and communicates the boundaries.
811 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
812 gmx_ddbox_t *ddbox,
813 rvec cell_ns_x0, rvec cell_ns_x1,
814 int64_t step)
816 gmx_domdec_comm_t *comm;
817 int dim_ind, dim;
819 comm = dd->comm;
821 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
823 dim = dd->dim[dim_ind];
825 /* Without PBC we don't have restrictions on the outer cells */
826 if (!(dim >= ddbox->npbcdim &&
827 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
828 isDlbOn(comm) &&
829 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
830 comm->cellsize_min[dim])
832 char buf[22];
833 gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
834 gmx_step_str(step, buf), dim2char(dim),
835 comm->cell_x1[dim] - comm->cell_x0[dim],
836 ddbox->skew_fac[dim],
837 dd->comm->cellsize_min[dim],
838 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
842 if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
844 /* Communicate the boundaries and update cell_ns_x0/1 */
845 dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
846 if (isDlbOn(dd->comm) && dd->ndim > 1)
848 check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
853 //! Computes and communicates the measured loads to determine the load distribution across the PP ranks.
854 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
856 gmx_domdec_comm_t *comm;
857 domdec_load_t *load;
858 float cell_frac = 0, sbuf[DD_NLOAD_MAX];
859 gmx_bool bSepPME;
861 if (debug)
863 fprintf(debug, "get_load_distribution start\n");
866 wallcycle_start(wcycle, ewcDDCOMMLOAD);
868 comm = dd->comm;
870 bSepPME = (dd->pme_nodeid >= 0);
872 if (dd->ndim == 0 && bSepPME)
874 /* Without decomposition, but with PME nodes, we need the load */
875 comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
876 comm->load[0].pme = comm->cycl[ddCyclPME];
879 for (int d = dd->ndim - 1; d >= 0; d--)
881 const DDCellsizesWithDlb *cellsizes = (isDlbOn(dd->comm) ? &comm->cellsizesWithDlb[d] : nullptr);
882 const int dim = dd->dim[d];
883 /* Check if we participate in the communication in this dimension */
884 if (d == dd->ndim-1 ||
885 (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
887 load = &comm->load[d];
888 if (isDlbOn(dd->comm))
890 cell_frac = cellsizes->fracUpper - cellsizes->fracLower;
892 int pos = 0;
893 if (d == dd->ndim-1)
895 sbuf[pos++] = dd_force_load(comm);
896 sbuf[pos++] = sbuf[0];
897 if (isDlbOn(dd->comm))
899 sbuf[pos++] = sbuf[0];
900 sbuf[pos++] = cell_frac;
901 if (d > 0)
903 sbuf[pos++] = cellsizes->fracLowerMax;
904 sbuf[pos++] = cellsizes->fracUpperMin;
907 if (bSepPME)
909 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
910 sbuf[pos++] = comm->cycl[ddCyclPME];
913 else
915 sbuf[pos++] = comm->load[d+1].sum;
916 sbuf[pos++] = comm->load[d+1].max;
917 if (isDlbOn(dd->comm))
919 sbuf[pos++] = comm->load[d+1].sum_m;
920 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
921 sbuf[pos++] = comm->load[d+1].flags;
922 if (d > 0)
924 sbuf[pos++] = cellsizes->fracLowerMax;
925 sbuf[pos++] = cellsizes->fracUpperMin;
928 if (bSepPME)
930 sbuf[pos++] = comm->load[d+1].mdf;
931 sbuf[pos++] = comm->load[d+1].pme;
934 load->nload = pos;
935 /* Communicate a row in DD direction d.
936 * The communicators are set up such that the root always has rank 0.
938 #if GMX_MPI
939 MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
940 load->load, load->nload*sizeof(float), MPI_BYTE,
941 0, comm->mpi_comm_load[d]);
942 #endif
943 if (dd->ci[dim] == dd->master_ci[dim])
945 /* We are the master along this row, process this row */
946 RowMaster *rowMaster = nullptr;
948 if (isDlbOn(comm))
950 rowMaster = cellsizes->rowMaster.get();
952 load->sum = 0;
953 load->max = 0;
954 load->sum_m = 0;
955 load->cvol_min = 1;
956 load->flags = 0;
957 load->mdf = 0;
958 load->pme = 0;
959 int pos = 0;
960 for (int i = 0; i < dd->nc[dim]; i++)
962 load->sum += load->load[pos++];
963 load->max = std::max(load->max, load->load[pos]);
964 pos++;
965 if (isDlbOn(dd->comm))
967 if (rowMaster->dlbIsLimited)
969 /* This direction could not be load balanced properly,
970 * therefore we need to use the maximum instead of the average load.
972 load->sum_m = std::max(load->sum_m, load->load[pos]);
974 else
976 load->sum_m += load->load[pos];
978 pos++;
979 load->cvol_min = std::min(load->cvol_min, load->load[pos]);
980 pos++;
981 if (d < dd->ndim-1)
983 load->flags = gmx::roundToInt(load->load[pos++]);
985 if (d > 0)
987 rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
988 rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
991 if (bSepPME)
993 load->mdf = std::max(load->mdf, load->load[pos]);
994 pos++;
995 load->pme = std::max(load->pme, load->load[pos]);
996 pos++;
999 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
1001 load->sum_m *= dd->nc[dim];
1002 load->flags |= (1<<d);
1008 if (DDMASTER(dd))
1010 comm->nload += dd_load_count(comm);
1011 comm->load_step += comm->cycl[ddCyclStep];
1012 comm->load_sum += comm->load[0].sum;
1013 comm->load_max += comm->load[0].max;
1014 if (isDlbOn(comm))
1016 for (int d = 0; d < dd->ndim; d++)
1018 if (comm->load[0].flags & (1<<d))
1020 comm->load_lim[d]++;
1024 if (bSepPME)
1026 comm->load_mdf += comm->load[0].mdf;
1027 comm->load_pme += comm->load[0].pme;
1031 wallcycle_stop(wcycle, ewcDDCOMMLOAD);
1033 if (debug)
1035 fprintf(debug, "get_load_distribution finished\n");
1039 /*! \brief Return the average fraction of the MD step time spent on the
1040 * balanceable force-calculation load. */
1041 static float dd_force_load_fraction(gmx_domdec_t *dd)
1043 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
1045 return dd->comm->load_sum/(dd->comm->load_step*dd->nnodes);
1047 else
1049 return 0;
1053 /*! \brief Return the relative performance loss on the total run time
1054 * due to the force calculation load imbalance. */
1055 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
1057 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
1059 return
1060 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
1061 (dd->comm->load_step*dd->nnodes);
1063 else
1065 return 0;
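/* Relation between the two functions above (not stated in the original
 * comments): dd_force_imb_perf_loss() equals dd_force_load_fraction() times
 * the relative force imbalance (load_max*nnodes/load_sum - 1), since
 *   (load_max*nnodes - load_sum)/(load_step*nnodes)
 *     = [load_sum/(load_step*nnodes)] * [load_max*nnodes/load_sum - 1].
 * E.g. a 20% balanceable fraction with a 20% force imbalance gives an
 * estimated 4% loss of total run time.
 */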
1069 //! Print load-balance report e.g. at the end of a run.
1070 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
1072 gmx_domdec_comm_t *comm = dd->comm;
1074 /* Only the master rank prints loads and only if we measured loads */
1075 if (!DDMASTER(dd) || comm->nload == 0)
1077 return;
1080 char buf[STRLEN];
1081 int numPpRanks = dd->nnodes;
1082 int numPmeRanks = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
1083 int numRanks = numPpRanks + numPmeRanks;
1084 float lossFraction = 0;
1086 /* Print the average load imbalance and performance loss */
1087 if (dd->nnodes > 1 && comm->load_sum > 0)
1089 float imbalance = comm->load_max*numPpRanks/comm->load_sum - 1;
1090 lossFraction = dd_force_imb_perf_loss(dd);
1092 std::string msg = "\nDynamic load balancing report:\n";
1093 std::string dlbStateStr;
1095 switch (dd->comm->dlbState)
1097 case DlbState::offUser:
1098 dlbStateStr = "DLB was off during the run per user request.";
1099 break;
1100 case DlbState::offForever:
1101 * Currently this can happen due to performance loss observed, cell size
1102 * limitations or incompatibility with other settings observed during
1103 * determineInitialDlbState(). */
1104 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1105 break;
1106 case DlbState::offCanTurnOn:
1107 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1108 break;
1109 case DlbState::offTemporarilyLocked:
1110 dlbStateStr = "DLB was locked at the end of the run due to unfinished PP-PME balancing.";
1111 break;
1112 case DlbState::onCanTurnOff:
1113 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1114 break;
1115 case DlbState::onUser:
1116 dlbStateStr = "DLB was permanently on during the run per user request.";
1117 break;
1118 default:
1119 GMX_ASSERT(false, "Undocumented DLB state");
1122 msg += " " + dlbStateStr + "\n";
1123 msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance*100);
1124 msg += gmx::formatString(" The balanceable part of the MD step is %d%%, load imbalance is computed from this.\n",
1125 gmx::roundToInt(dd_force_load_fraction(dd)*100));
1126 msg += gmx::formatString(" Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1127 lossFraction*100);
1128 fprintf(fplog, "%s", msg.c_str());
1129 fprintf(stderr, "\n%s", msg.c_str());
1132 /* Print during what percentage of steps the load balancing was limited */
1133 bool dlbWasLimited = false;
1134 if (isDlbOn(comm))
1136 sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1137 for (int d = 0; d < dd->ndim; d++)
1139 int limitPercentage = (200*comm->load_lim[d] + 1)/(2*comm->nload);
1140 sprintf(buf+strlen(buf), " %c %d %%",
1141 dim2char(dd->dim[d]), limitPercentage);
1142 if (limitPercentage >= 50)
1144 dlbWasLimited = true;
1147 sprintf(buf + strlen(buf), "\n");
1148 fprintf(fplog, "%s", buf);
1149 fprintf(stderr, "%s", buf);
1152 /* Print the performance loss due to separate PME - PP rank imbalance */
1153 float lossFractionPme = 0;
1154 if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1156 float pmeForceRatio = comm->load_pme/comm->load_mdf;
1157 lossFractionPme = (comm->load_pme - comm->load_mdf)/comm->load_step;
1158 if (lossFractionPme <= 0)
1160 lossFractionPme *= numPmeRanks/static_cast<float>(numRanks);
1162 else
1164 lossFractionPme *= numPpRanks/static_cast<float>(numRanks);
1166 sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1167 fprintf(fplog, "%s", buf);
1168 fprintf(stderr, "%s", buf);
1169 sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", std::fabs(lossFractionPme)*100);
1170 fprintf(fplog, "%s", buf);
1171 fprintf(stderr, "%s", buf);
1173 fprintf(fplog, "\n");
1174 fprintf(stderr, "\n");
1176 if (lossFraction >= DD_PERF_LOSS_WARN)
1178 std::string message = gmx::formatString(
1179 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1180 " in the domain decomposition.\n", lossFraction*100);
1182 bool hadSuggestion = false;
1183 if (!isDlbOn(comm))
1185 message += " You might want to use dynamic load balancing (option -dlb).\n";
1186 hadSuggestion = true;
1188 else if (dlbWasLimited)
1190 message += " You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n";
1191 hadSuggestion = true;
1193 message += gmx::formatString(
1194 " You can %sconsider manually changing the decomposition (option -dd);\n"
1195 " e.g. by using fewer domains along the box dimension in which there is\n"
1196 " considerable inhomogeneity in the simulated system.",
1197 hadSuggestion ? "also " : "");
1200 fprintf(fplog, "%s\n", message.c_str());
1201 fprintf(stderr, "%s\n", message.c_str());
1203 if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1205 sprintf(buf,
1206 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1207 " had %s work to do than the PP ranks.\n"
1208 " You might want to %s the number of PME ranks\n"
1209 " or %s the cut-off and the grid spacing.\n",
1210 std::fabs(lossFractionPme*100),
1211 (lossFractionPme < 0) ? "less" : "more",
1212 (lossFractionPme < 0) ? "decrease" : "increase",
1213 (lossFractionPme < 0) ? "decrease" : "increase");
1214 fprintf(fplog, "%s\n", buf);
1215 fprintf(stderr, "%s\n", buf);
1219 //! Return the minimum communication volume.
1220 static float dd_vol_min(gmx_domdec_t *dd)
1222 return dd->comm->load[0].cvol_min*dd->nnodes;
1225 //! Return the DD load flags.
1226 static int dd_load_flags(gmx_domdec_t *dd)
1228 return dd->comm->load[0].flags;
1231 //! Return the reported load imbalance in force calculations.
1232 static float dd_f_imbal(gmx_domdec_t *dd)
1234 if (dd->comm->load[0].sum > 0)
1236 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
1238 else
1240 /* Something is wrong in the cycle counting, report no load imbalance */
1241 return 0.0f;
1245 //! Returns DD load balance report.
1246 static std::string
1247 dd_print_load(gmx_domdec_t *dd,
1248 int64_t step)
1250 gmx::StringOutputStream stream;
1251 gmx::TextWriter log(&stream);
1253 int flags = dd_load_flags(dd);
1254 if (flags)
1256 log.writeString("DD load balancing is limited by minimum cell size in dimension");
1257 for (int d = 0; d < dd->ndim; d++)
1259 if (flags & (1<<d))
1261 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1264 log.ensureLineBreak();
1266 log.writeString("DD step " + gmx::toString(step));
1267 if (isDlbOn(dd->comm))
1269 log.writeStringFormatted(" vol min/aver %5.3f%c",
1270 dd_vol_min(dd), flags ? '!' : ' ');
1272 if (dd->nnodes > 1)
1274 log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
1276 if (dd->comm->cycl_n[ddCyclPME])
1278 log.writeStringFormatted(" pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1280 log.ensureLineBreak();
1281 return stream.toString();
1284 //! Prints DD load balance report in mdrun verbose mode.
1285 static void dd_print_load_verbose(gmx_domdec_t *dd)
1287 if (isDlbOn(dd->comm))
1289 fprintf(stderr, "vol %4.2f%c ",
1290 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1292 if (dd->nnodes > 1)
1294 fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd)*100));
1296 if (dd->comm->cycl_n[ddCyclPME])
1298 fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1302 //! Turns on dynamic load balancing if possible and needed.
1303 static void turn_on_dlb(const gmx::MDLogger &mdlog,
1304 gmx_domdec_t *dd,
1305 int64_t step)
1307 gmx_domdec_comm_t *comm = dd->comm;
1309 real cellsize_min = comm->cellsize_min[dd->dim[0]];
1310 for (int d = 1; d < dd->ndim; d++)
1312 cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1315 /* Turn off DLB if we're too close to the cell size limit. */
1316 if (cellsize_min < comm->cellsize_limit*1.05)
1318 GMX_LOG(mdlog.info).appendTextFormatted(
1319 "step %s Measured %.1f %% performance loss due to load imbalance, "
1320 "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1321 "Will no longer try dynamic load balancing.",
1322 gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);
1324 comm->dlbState = DlbState::offForever;
1325 return;
1328 GMX_LOG(mdlog.info).appendTextFormatted(
1329 "step %s Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.",
1330 gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);
1331 comm->dlbState = DlbState::onCanTurnOff;
1333 /* Store the non-DLB performance, so we can check if DLB actually
1334 * improves performance.
1336 GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
1337 comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
1339 set_dlb_limits(dd);
1341 /* We can set the required cell size info here,
1342 * so we do not need to communicate this.
1343 * The grid is completely uniform.
1345 for (int d = 0; d < dd->ndim; d++)
1347 RowMaster *rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1349 if (rowMaster)
1351 comm->load[d].sum_m = comm->load[d].sum;
1353 int nc = dd->nc[dd->dim[d]];
1354 for (int i = 0; i < nc; i++)
1356 rowMaster->cellFrac[i] = i/static_cast<real>(nc);
1357 if (d > 0)
1359 rowMaster->bounds[i].cellFracLowerMax = i /static_cast<real>(nc);
1360 rowMaster->bounds[i].cellFracUpperMin = (i + 1)/static_cast<real>(nc);
1363 rowMaster->cellFrac[nc] = 1.0;
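/* Illustration (hypothetical nc = 4): the uniform initialization above gives
 * cellFrac = { 0.0, 0.25, 0.5, 0.75, 1.0 } and, for d > 0,
 * bounds[i].cellFracLowerMax = i/4 and bounds[i].cellFracUpperMin = (i+1)/4,
 * i.e. exactly the uniform cell boundaries, so no communication is needed to
 * seed the DLB state.
 */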
1368 //! Turns off dynamic load balancing (but leaves it able to turn back on).
1369 static void turn_off_dlb(const gmx::MDLogger &mdlog,
1370 gmx_domdec_t *dd,
1371 int64_t step)
1373 GMX_LOG(mdlog.info).appendText(
1374 "step " + gmx::toString(step) + " Turning off dynamic load balancing, because it is degrading performance.");
1375 dd->comm->dlbState = DlbState::offCanTurnOn;
1376 dd->comm->haveTurnedOffDlb = true;
1377 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1380 //! Turns off dynamic load balancing permanently.
1381 static void turn_off_dlb_forever(const gmx::MDLogger &mdlog,
1382 gmx_domdec_t *dd,
1383 int64_t step)
1385 GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
1386 GMX_LOG(mdlog.info).appendText(
1387 "step " + gmx::toString(step) + " Will no longer try dynamic load balancing, as it degraded performance.");
1388 dd->comm->dlbState = DlbState::offForever;
1391 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
1393 gmx_domdec_comm_t *comm;
1395 comm = cr->dd->comm;
1397 /* Turn on the DLB limiting (might have been on already) */
1398 comm->bPMELoadBalDLBLimits = TRUE;
1400 /* Change the cut-off limit */
1401 comm->PMELoadBal_max_cutoff = cutoff;
1403 if (debug)
1405 fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
1406 comm->PMELoadBal_max_cutoff);
1410 //! Merges charge-group buffers.
1411 static void merge_cg_buffers(int ncell,
1412 gmx_domdec_comm_dim_t *cd, int pulse,
1413 int *ncg_cell,
1414 gmx::ArrayRef<int> index_gl,
1415 const int *recv_i,
1416 rvec *cg_cm, rvec *recv_vr,
1417 gmx::ArrayRef<int> cgindex,
1418 cginfo_mb_t *cginfo_mb, int *cginfo)
1420 gmx_domdec_ind_t *ind, *ind_p;
1421 int p, cell, c, cg, cg0, cg1, cg_gl, nat;
1422 int shift, shift_at;
1424 ind = &cd->ind[pulse];
1426 /* First correct the already stored data */
1427 shift = ind->nrecv[ncell];
1428 for (cell = ncell-1; cell >= 0; cell--)
1430 shift -= ind->nrecv[cell];
1431 if (shift > 0)
1433 /* Move the cg's present from previous grid pulses */
1434 cg0 = ncg_cell[ncell+cell];
1435 cg1 = ncg_cell[ncell+cell+1];
1436 cgindex[cg1+shift] = cgindex[cg1];
1437 for (cg = cg1-1; cg >= cg0; cg--)
1439 index_gl[cg+shift] = index_gl[cg];
1440 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
1441 cgindex[cg+shift] = cgindex[cg];
1442 cginfo[cg+shift] = cginfo[cg];
1444 /* Correct the already stored send indices for the shift */
1445 for (p = 1; p <= pulse; p++)
1447 ind_p = &cd->ind[p];
1448 cg0 = 0;
1449 for (c = 0; c < cell; c++)
1451 cg0 += ind_p->nsend[c];
1453 cg1 = cg0 + ind_p->nsend[cell];
1454 for (cg = cg0; cg < cg1; cg++)
1456 ind_p->index[cg] += shift;
1462 /* Merge in the communicated buffers */
1463 shift = 0;
1464 shift_at = 0;
1465 cg0 = 0;
1466 for (cell = 0; cell < ncell; cell++)
1468 cg1 = ncg_cell[ncell+cell+1] + shift;
1469 if (shift_at > 0)
1471 /* Correct the old cg indices */
1472 for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
1474 cgindex[cg+1] += shift_at;
1477 for (cg = 0; cg < ind->nrecv[cell]; cg++)
1479 /* Copy this charge group from the buffer */
1480 index_gl[cg1] = recv_i[cg0];
1481 copy_rvec(recv_vr[cg0], cg_cm[cg1]);
1482 /* Add it to the cgindex */
1483 cg_gl = index_gl[cg1];
1484 cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
1485 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
1486 cgindex[cg1+1] = cgindex[cg1] + nat;
1487 cg0++;
1488 cg1++;
1489 shift_at += nat;
1491 shift += ind->nrecv[cell];
1492 ncg_cell[ncell+cell+1] = cg1;
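/* Illustration of the merge (hypothetical counts): with ncell = 2 and a pulse
 * receiving 2 groups for cell 0 and 1 group for cell 1, the groups already
 * stored for the second remote zone from earlier pulses are first shifted up
 * by 2 (the number received for the zone before it), then the received groups
 * are copied into the gaps and the zone boundaries in ncg_cell[] are updated.
 */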
1496 //! Makes a range partitioning for the atom groups within a cell
1497 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
1498 int nzone,
1499 int atomGroupStart,
1500 const gmx::RangePartitioning &atomGroups)
1502 /* Store the atom block boundaries for easy copying of communication buffers
1504 int g = atomGroupStart;
1505 for (int zone = 0; zone < nzone; zone++)
1507 for (gmx_domdec_ind_t &ind : cd->ind)
1509 const auto range = atomGroups.subRange(g, g + ind.nrecv[zone]);
1510 ind.cell2at0[zone] = range.begin();
1511 ind.cell2at1[zone] = range.end();
1512 g += ind.nrecv[zone];
1517 //! Returns whether a link is missing.
1518 static gmx_bool missing_link(t_blocka *link, int cg_gl, const char *bLocalCG)
1520 int i;
1521 gmx_bool bMiss;
1523 bMiss = FALSE;
1524 for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
1526 if (!bLocalCG[link->a[i]])
1528 bMiss = TRUE;
1532 return bMiss;
1535 //! Domain corners for communication; at most 4 i-zones see a j domain
1536 typedef struct {
1537 //! The corners for the non-bonded communication.
1538 real c[DIM][4];
1539 //! Corner for rounding.
1540 real cr0;
1541 //! Corners for rounding.
1542 real cr1[4];
1543 //! Corners for bounded communication.
1544 real bc[DIM];
1545 //! Corner for rounding for bonded communication.
1546 real bcr1;
1547 } dd_corners_t;
1549 //! Determine the corners of the domain(s) we are communicating with.
1550 static void
1551 set_dd_corners(const gmx_domdec_t *dd,
1552 int dim0, int dim1, int dim2,
1553 gmx_bool bDistMB,
1554 dd_corners_t *c)
1556 const gmx_domdec_comm_t *comm;
1557 const gmx_domdec_zones_t *zones;
1558 int i, j;
1560 comm = dd->comm;
1562 zones = &comm->zones;
1564 /* Keep the compiler happy */
1565 c->cr0 = 0;
1566 c->bcr1 = 0;
1568 /* The first dimension is equal for all cells */
1569 c->c[0][0] = comm->cell_x0[dim0];
1570 if (bDistMB)
1572 c->bc[0] = c->c[0][0];
1574 if (dd->ndim >= 2)
1576 dim1 = dd->dim[1];
1577 /* This cell row is only seen from the first row */
1578 c->c[1][0] = comm->cell_x0[dim1];
1579 /* All rows can see this row */
1580 c->c[1][1] = comm->cell_x0[dim1];
1581 if (isDlbOn(dd->comm))
1583 c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1584 if (bDistMB)
1586 /* For the multi-body distance we need the maximum */
1587 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1590 /* Set the upper-right corner for rounding */
1591 c->cr0 = comm->cell_x1[dim0];
1593 if (dd->ndim >= 3)
1595 dim2 = dd->dim[2];
1596 for (j = 0; j < 4; j++)
1598 c->c[2][j] = comm->cell_x0[dim2];
1600 if (isDlbOn(dd->comm))
1602 /* Use the maximum of the i-cells that see a j-cell */
1603 for (i = 0; i < zones->nizone; i++)
1605 for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
1607 if (j >= 4)
1609 c->c[2][j-4] =
1610 std::max(c->c[2][j-4],
1611 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
1615 if (bDistMB)
1617 /* For the multi-body distance we need the maximum */
1618 c->bc[2] = comm->cell_x0[dim2];
1619 for (i = 0; i < 2; i++)
1621 for (j = 0; j < 2; j++)
1623 c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1629 /* Set the upper-right corner for rounding */
1630 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1631 * Only cell (0,0,0) can see cell 7 (1,1,1)
1633 c->cr1[0] = comm->cell_x1[dim1];
1634 c->cr1[3] = comm->cell_x1[dim1];
1635 if (isDlbOn(dd->comm))
1637 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1638 if (bDistMB)
1640 /* For the multi-body distance we need the maximum */
1641 c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1648 /*! \brief Add the atom groups we need to send in this pulse from this
1649 * zone to \p localAtomGroups and \p work. */
1650 static void
1651 get_zone_pulse_cgs(gmx_domdec_t *dd,
1652 int zonei, int zone,
1653 int cg0, int cg1,
1654 gmx::ArrayRef<const int> globalAtomGroupIndices,
1655 const gmx::RangePartitioning &atomGroups,
1656 int dim, int dim_ind,
1657 int dim0, int dim1, int dim2,
1658 real r_comm2, real r_bcomm2,
1659 matrix box,
1660 bool distanceIsTriclinic,
1661 rvec *normal,
1662 real skew_fac2_d, real skew_fac_01,
1663 rvec *v_d, rvec *v_0, rvec *v_1,
1664 const dd_corners_t *c,
1665 const rvec sf2_round,
1666 gmx_bool bDistBonded,
1667 gmx_bool bBondComm,
1668 gmx_bool bDist2B,
1669 gmx_bool bDistMB,
1670 rvec *cg_cm,
1671 const int *cginfo,
1672 std::vector<int> *localAtomGroups,
1673 dd_comm_setup_work_t *work)
1675 gmx_domdec_comm_t *comm;
1676 gmx_bool bScrew;
1677 gmx_bool bDistMB_pulse;
1678 int cg, i;
1679 real r2, rb2, r, tric_sh;
1680 rvec rn, rb;
1681 int dimd;
1682 int nsend_z, nat;
1684 comm = dd->comm;
1686 bScrew = (dd->bScrewPBC && dim == XX);
1688 bDistMB_pulse = (bDistMB && bDistBonded);
1690 /* Unpack the work data */
1691 std::vector<int> &ibuf = work->atomGroupBuffer;
1692 std::vector<gmx::RVec> &vbuf = work->positionBuffer;
1693 nsend_z = 0;
1694 nat = work->nat;
1696 for (cg = cg0; cg < cg1; cg++)
1698 r2 = 0;
1699 rb2 = 0;
1700 if (!distanceIsTriclinic)
1702 /* Rectangular direction, easy */
1703 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
1704 if (r > 0)
1706 r2 += r*r;
1708 if (bDistMB_pulse)
1710 r = cg_cm[cg][dim] - c->bc[dim_ind];
1711 if (r > 0)
1713 rb2 += r*r;
1716 /* Rounding gives at most a 16% reduction
1717 * in communicated atoms
1719 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1721 r = cg_cm[cg][dim0] - c->cr0;
1722 /* This is the first dimension, so always r >= 0 */
1723 r2 += r*r;
1724 if (bDistMB_pulse)
1726 rb2 += r*r;
1729 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1731 r = cg_cm[cg][dim1] - c->cr1[zone];
1732 if (r > 0)
1734 r2 += r*r;
1736 if (bDistMB_pulse)
1738 r = cg_cm[cg][dim1] - c->bcr1;
1739 if (r > 0)
1741 rb2 += r*r;
1746 else
1748 /* Triclinic direction, more complicated */
1749 clear_rvec(rn);
1750 clear_rvec(rb);
1751 /* Rounding, conservative as the skew_fac multiplication
1752 * will slightly underestimate the distance.
1754 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1756 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
1757 for (i = dim0+1; i < DIM; i++)
1759 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
1761 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
1762 if (bDistMB_pulse)
1764 rb[dim0] = rn[dim0];
1765 rb2 = r2;
1767 /* Take care that the cell planes along dim0 might not
1768 * be orthogonal to those along dim1 and dim2.
1770 for (i = 1; i <= dim_ind; i++)
1772 dimd = dd->dim[i];
1773 if (normal[dim0][dimd] > 0)
1775 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
1776 if (bDistMB_pulse)
1778 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
1783 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1785 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
1786 tric_sh = 0;
1787 for (i = dim1+1; i < DIM; i++)
1789 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
1791 rn[dim1] += tric_sh;
1792 if (rn[dim1] > 0)
1794 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
1795 /* Take care of coupling of the distances
1796 * to the planes along dim0 and dim1 through dim2.
1798 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
1799 /* Take care that the cell planes along dim1
1800 * might not be orthogonal to that along dim2.
1802 if (normal[dim1][dim2] > 0)
1804 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
1807 if (bDistMB_pulse)
1809 rb[dim1] +=
1810 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
1811 if (rb[dim1] > 0)
1813 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
1814 /* Take care of coupling of the distances
1815 * to the planes along dim0 and dim1 through dim2.
1817 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
1818 /* Take care that the cell planes along dim1
1819 * might not be orthogonal to that along dim2.
1821 if (normal[dim1][dim2] > 0)
1823 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
1828 /* The distance along the communication direction */
1829 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
1830 tric_sh = 0;
1831 for (i = dim+1; i < DIM; i++)
1833 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
1835 rn[dim] += tric_sh;
1836 if (rn[dim] > 0)
1838 r2 += rn[dim]*rn[dim]*skew_fac2_d;
1839 /* Take care of coupling of the distances
1840 * to the planes along dim0 and dim1 through dim2.
1842 if (dim_ind == 1 && zonei == 1)
1844 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
1847 if (bDistMB_pulse)
1849 clear_rvec(rb);
1850 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
1851 if (rb[dim] > 0)
1853 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
1854 /* Take care of coupling of the distances
1855 * to the planes along dim0 and dim1 through dim2.
1857 if (dim_ind == 1 && zonei == 1)
1859 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
1865 if (r2 < r_comm2 ||
1866 (bDistBonded &&
1867 ((bDistMB && rb2 < r_bcomm2) ||
1868 (bDist2B && r2 < r_bcomm2)) &&
1869 (!bBondComm ||
1870 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
1871 missing_link(comm->cglink, globalAtomGroupIndices[cg],
1872 comm->bLocalCG)))))
1874 /* Store the local and global atom group indices and position */
1875 localAtomGroups->push_back(cg);
1876 ibuf.push_back(globalAtomGroupIndices[cg]);
1877 nsend_z++;
1879 rvec posPbc;
1880 if (dd->ci[dim] == 0)
1882 /* Correct cg_cm for pbc */
1883 rvec_add(cg_cm[cg], box[dim], posPbc);
1884 if (bScrew)
1886 posPbc[YY] = box[YY][YY] - posPbc[YY];
1887 posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1890 else
1892 copy_rvec(cg_cm[cg], posPbc);
1894 vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1896 nat += atomGroups.block(cg).size();
1900 work->nat = nat;
1901 work->nsend_zone = nsend_z;
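/* Restatement of the acceptance test above (not in the original comments):
 * a charge group is sent in this pulse when r2 < r_comm2 (within the
 * non-bonded communication distance), or, on the first pulse where
 * bDistBonded applies, when it is within the bonded communication distance
 * (rb2 < r_bcomm2 for the multi-body check, r2 < r_bcomm2 for the two-body
 * check) and, if bBondComm filtering is active, it has inter-group bonded
 * interactions with a linked group that is not yet locally present.
 */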
1904 //! Clear data.
1905 static void clearCommSetupData(dd_comm_setup_work_t *work)
1907 work->localAtomGroupBuffer.clear();
1908 work->atomGroupBuffer.clear();
1909 work->positionBuffer.clear();
1910 work->nat = 0;
1911 work->nsend_zone = 0;
1914 //! Prepare DD communication.
1915 static void setup_dd_communication(gmx_domdec_t *dd,
1916 matrix box, gmx_ddbox_t *ddbox,
1917 t_forcerec *fr,
1918 t_state *state,
1919 PaddedVector<gmx::RVec> *f)
1921 int dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1922 int nzone, nzone_send, zone, zonei, cg0, cg1;
1923 int c, i, cg, cg_gl, nrcg;
1924 int *zone_cg_range, pos_cg;
1925 gmx_domdec_comm_t *comm;
1926 gmx_domdec_zones_t *zones;
1927 gmx_domdec_comm_dim_t *cd;
1928 cginfo_mb_t *cginfo_mb;
1929 gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded;
1930 dd_corners_t corners;
1931 rvec *cg_cm, *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1932 real skew_fac2_d, skew_fac_01;
1933 rvec sf2_round;
1935 if (debug)
1937 fprintf(debug, "Setting up DD communication\n");
1940 comm = dd->comm;
1942 if (comm->dth.empty())
1944 /* Initialize the thread data.
1945 * This cannot be done in init_domain_decomposition,
1946 * as the number of threads is determined later.
1948 int numThreads = gmx_omp_nthreads_get(emntDomdec);
1949 comm->dth.resize(numThreads);
1952 switch (fr->cutoff_scheme)
1954 case ecutsGROUP:
1955 cg_cm = fr->cg_cm;
1956 break;
1957 case ecutsVERLET:
1958 cg_cm = state->x.rvec_array();
1959 break;
1960 default:
1961 gmx_incons("unimplemented");
1964 bBondComm = comm->bBondComm;
1966 /* Do we need to determine extra distances for multi-body bondeds? */
1967 bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
1969 /* Do we need to determine extra distances for only two-body bondeds? */
1970 bDist2B = (bBondComm && !bDistMB);
1972 const real r_comm2 = gmx::square(domainToDomainIntoAtomToDomainCutoff(*comm, comm->cutoff));
1973 const real r_bcomm2 = gmx::square(domainToDomainIntoAtomToDomainCutoff(*comm, comm->cutoff_mbody));
1975 if (debug)
1977 fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1980 zones = &comm->zones;
1982 dim0 = dd->dim[0];
1983 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1984 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1986 set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1988 /* Triclinic stuff */
1989 normal = ddbox->normal;
1990 skew_fac_01 = 0;
1991 if (dd->ndim >= 2)
1993 v_0 = ddbox->v[dim0];
1994 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1996 /* Determine the coupling coefficient for the distances
1997 * to the cell planes along dim0 and dim1 through dim2.
1998 * This is required for correct rounding.
2000 skew_fac_01 =
2001 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
2002 if (debug)
2004 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
2008 if (dd->ndim >= 3)
2010 v_1 = ddbox->v[dim1];
2013 zone_cg_range = zones->cg_range;
2014 cginfo_mb = fr->cginfo_mb;
2016 zone_cg_range[0] = 0;
2017 zone_cg_range[1] = dd->ncg_home;
2018 comm->zone_ncg1[0] = dd->ncg_home;
2019 pos_cg = dd->ncg_home;
2021 nat_tot = comm->atomRanges.numHomeAtoms();
2022 nzone = 1;
2023 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
2025 dim = dd->dim[dim_ind];
2026 cd = &comm->cd[dim_ind];
2028 /* Check if we need to compute triclinic distances along this dim */
2029 bool distanceIsTriclinic = false;
2030 for (i = 0; i <= dim_ind; i++)
2032 if (ddbox->tric_dir[dd->dim[i]])
2034 distanceIsTriclinic = true;
2038 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
2040 /* No pbc in this dimension, the first node should not comm. */
2041 nzone_send = 0;
2043 else
2045 nzone_send = nzone;
2048 v_d = ddbox->v[dim];
2049 skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
2051 cd->receiveInPlace = true;
2052 for (int p = 0; p < cd->numPulses(); p++)
2054 /* Only atoms communicated in the first pulse are used
2055 * for multi-body bonded interactions or for bBondComm.
2057 bDistBonded = ((bDistMB || bDist2B) && p == 0);
2059 gmx_domdec_ind_t *ind = &cd->ind[p];
2061 /* Thread 0 writes in the global index array */
2062 ind->index.clear();
2063 clearCommSetupData(&comm->dth[0]);
2065 for (zone = 0; zone < nzone_send; zone++)
2067 if (dim_ind > 0 && distanceIsTriclinic)
2069 /* Determine slightly more optimized skew_fac's
2070 * for rounding.
2071 * This reduces the number of communicated atoms
2072 * by about 10% for 3D DD of rhombic dodecahedra.
2074 for (dimd = 0; dimd < dim; dimd++)
2076 sf2_round[dimd] = 1;
2077 if (ddbox->tric_dir[dimd])
2079 for (i = dd->dim[dimd]+1; i < DIM; i++)
2081 /* If we are shifted in dimension i
2082 * and the cell plane is tilted forward
2083 * in dimension i, skip this coupling.
2085 if (!(zones->shift[nzone+zone][i] &&
2086 ddbox->v[dimd][i][dimd] >= 0))
2088 sf2_round[dimd] +=
2089 gmx::square(ddbox->v[dimd][i][dimd]);
2092 sf2_round[dimd] = 1/sf2_round[dimd];
2097 zonei = zone_perm[dim_ind][zone];
2098 if (p == 0)
2100 /* Here we permute the zones to obtain a convenient order
2101 * for neighbor searching
2103 cg0 = zone_cg_range[zonei];
2104 cg1 = zone_cg_range[zonei+1];
2106 else
2108 /* Look only at the cg's received in the previous grid pulse
2110 cg1 = zone_cg_range[nzone+zone+1];
2111 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
2114 const int numThreads = gmx::ssize(comm->dth);
2115 #pragma omp parallel for num_threads(numThreads) schedule(static)
2116 for (int th = 0; th < numThreads; th++)
2120 dd_comm_setup_work_t &work = comm->dth[th];
2122 /* Retain data accumulated into buffers of thread 0 */
2123 if (th > 0)
2125 clearCommSetupData(&work);
2128 int cg0_th = cg0 + ((cg1 - cg0)* th )/numThreads;
2129 int cg1_th = cg0 + ((cg1 - cg0)*(th+1))/numThreads;
2131 /* Get the cg's for this pulse in this zone */
2132 get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
2133 dd->globalAtomGroupIndices,
2134 dd->atomGrouping(),
2135 dim, dim_ind, dim0, dim1, dim2,
2136 r_comm2, r_bcomm2,
2137 box, distanceIsTriclinic,
2138 normal, skew_fac2_d, skew_fac_01,
2139 v_d, v_0, v_1, &corners, sf2_round,
2140 bDistBonded, bBondComm,
2141 bDist2B, bDistMB,
2142 cg_cm, fr->cginfo,
2143 th == 0 ? &ind->index : &work.localAtomGroupBuffer,
2144 &work);
2146 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2147 } // END
2149 std::vector<int> &atomGroups = comm->dth[0].atomGroupBuffer;
2150 std::vector<gmx::RVec> &positions = comm->dth[0].positionBuffer;
2151 ind->nsend[zone] = comm->dth[0].nsend_zone;
2152 /* Append data of threads>=1 to the communication buffers */
2153 for (int th = 1; th < numThreads; th++)
2155 const dd_comm_setup_work_t &dth = comm->dth[th];
2157 ind->index.insert(ind->index.end(), dth.localAtomGroupBuffer.begin(), dth.localAtomGroupBuffer.end());
2158 atomGroups.insert(atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
2159 positions.insert(positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
2160 comm->dth[0].nat += dth.nat;
2161 ind->nsend[zone] += dth.nsend_zone;
2164 /* Clear the counts in case we do not have pbc */
2165 for (zone = nzone_send; zone < nzone; zone++)
2167 ind->nsend[zone] = 0;
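/* The two extra entries hold the total number of atom groups and of atoms to send */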
2169 ind->nsend[nzone] = ind->index.size();
2170 ind->nsend[nzone + 1] = comm->dth[0].nat;
2171 /* Communicate the number of cg's and atoms to receive */
2172 ddSendrecv(dd, dim_ind, dddirBackward,
2173 ind->nsend, nzone+2,
2174 ind->nrecv, nzone+2);
2176 if (p > 0)
2178 /* We can receive in place if only the last zone is not empty */
2179 for (zone = 0; zone < nzone-1; zone++)
2181 if (ind->nrecv[zone] > 0)
2183 cd->receiveInPlace = false;
2188 int receiveBufferSize = 0;
2189 if (!cd->receiveInPlace)
2191 receiveBufferSize = ind->nrecv[nzone];
2193 /* These buffers are actually only needed when not receiving in place */
2194 DDBufferAccess<int> globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2195 DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2197 dd_comm_setup_work_t &work = comm->dth[0];
2199 /* Make space for the global cg indices */
2200 int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2201 dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2202 /* Communicate the global cg indices */
2203 gmx::ArrayRef<int> integerBufferRef;
2204 if (cd->receiveInPlace)
2206 integerBufferRef = gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2208 else
2210 integerBufferRef = globalAtomGroupBuffer.buffer;
2212 ddSendrecv<int>(dd, dim_ind, dddirBackward,
2213 work.atomGroupBuffer, integerBufferRef);
2215 /* Make space for cg_cm */
2216 dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
2217 if (fr->cutoff_scheme == ecutsGROUP)
2219 cg_cm = fr->cg_cm;
2221 else
2223 cg_cm = state->x.rvec_array();
2225 /* Communicate cg_cm */
2226 gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2227 if (cd->receiveInPlace)
2229 rvecBufferRef = gmx::arrayRefFromArray(reinterpret_cast<gmx::RVec *>(cg_cm + pos_cg), ind->nrecv[nzone]);
2231 else
2233 rvecBufferRef = rvecBuffer.buffer;
2235 ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward,
2236 work.positionBuffer, rvecBufferRef);
2238 /* Make the charge group index */
2239 if (cd->receiveInPlace)
2241 zone = (p == 0 ? 0 : nzone - 1);
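/* For pulses beyond the first, receiving in place requires that only the last zone is non-empty (checked above), so only that zone needs processing */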
2242 while (zone < nzone)
2244 for (cg = 0; cg < ind->nrecv[zone]; cg++)
2246 cg_gl = dd->globalAtomGroupIndices[pos_cg];
2247 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
2248 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
2249 dd->atomGrouping_.appendBlock(nrcg);
2250 if (bBondComm)
2252 /* Update the charge group presence,
2253 * so we can use it in the next pass of the loop.
2255 comm->bLocalCG[cg_gl] = TRUE;
2257 pos_cg++;
2259 if (p == 0)
2261 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
2263 zone++;
2264 zone_cg_range[nzone+zone] = pos_cg;
2267 else
2269 /* This part of the code is never executed with bBondComm. */
2270 std::vector<int> &atomGroupsIndex = dd->atomGrouping_.rawIndex();
2271 atomGroupsIndex.resize(numAtomGroupsNew + 1);
2273 merge_cg_buffers(nzone, cd, p, zone_cg_range,
2274 dd->globalAtomGroupIndices, integerBufferRef.data(),
2275 cg_cm, as_rvec_array(rvecBufferRef.data()),
2276 atomGroupsIndex,
2277 fr->cginfo_mb, fr->cginfo);
2278 pos_cg += ind->nrecv[nzone];
2280 nat_tot += ind->nrecv[nzone+1];
2282 if (!cd->receiveInPlace)
2284 /* Store the atom block for easy copying of communication buffers */
2285 make_cell2at_index(cd, nzone, zone_cg_range[nzone], dd->atomGrouping());
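/* Each decomposition dimension doubles the number of zones */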
2287 nzone += nzone;
2290 comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2292 if (!bBondComm)
2294 /* We don't need to update cginfo, since that was already done above.
2295 * So we pass nullptr for the forcerec.
2297 dd_set_cginfo(dd->globalAtomGroupIndices,
2298 dd->ncg_home, dd->globalAtomGroupIndices.size(),
2299 nullptr, comm->bLocalCG);
2302 if (debug)
2304 fprintf(debug, "Finished setting up DD communication, zones:");
2305 for (c = 0; c < zones->n; c++)
2307 fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
2309 fprintf(debug, "\n");
2313 //! Set boundaries for the charge group range.
2314 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
2316 int c;
2318 for (c = 0; c < zones->nizone; c++)
2320 zones->izone[c].cg1 = zones->cg_range[c+1];
2321 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
2322 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
2326 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2328 * Also sets the atom density for the home zone when \p zone_start=0.
2329 * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2330 * how many charge groups will move but are still part of the current range.
2331 * \todo When converting domdec to use proper classes, all these variables
2332 * should be private and a method should return the correct count
2333 * depending on an internal state.
2335 * \param[in,out] dd The domain decomposition struct
2336 * \param[in] box The box
2337 * \param[in] ddbox The domain decomposition box struct
2338 * \param[in] zone_start The start of the zone range to set sizes for
2339 * \param[in] zone_end The end of the zone range to set sizes for
2340 * \param[in] numMovedChargeGroupsInHomeZone The number of charge groups in the home zone that should be moved but are still present in dd->comm->zones.cg_range
2342 static void set_zones_size(gmx_domdec_t *dd,
2343 matrix box, const gmx_ddbox_t *ddbox,
2344 int zone_start, int zone_end,
2345 int numMovedChargeGroupsInHomeZone)
2347 gmx_domdec_comm_t *comm;
2348 gmx_domdec_zones_t *zones;
2349 gmx_bool bDistMB;
2350 int z, zi, d, dim;
2351 real rcs, rcmbs;
2352 int i, j;
2353 real vol;
2355 comm = dd->comm;
2357 zones = &comm->zones;
2359 /* Do we need to determine extra distances for multi-body bondeds? */
2360 bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
2362 for (z = zone_start; z < zone_end; z++)
2364 /* Copy cell limits to zone limits.
2365 * Valid for non-DD dims and non-shifted dims.
2367 copy_rvec(comm->cell_x0, zones->size[z].x0);
2368 copy_rvec(comm->cell_x1, zones->size[z].x1);
2371 for (d = 0; d < dd->ndim; d++)
2373 dim = dd->dim[d];
2375 for (z = 0; z < zones->n; z++)
2377 /* With a staggered grid we have different sizes
2378 * for non-shifted dimensions.
2380 if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2382 if (d == 1)
2384 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
2385 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
2387 else if (d == 2)
2389 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
2390 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
2395 rcs = comm->cutoff;
2396 rcmbs = comm->cutoff_mbody;
2397 if (ddbox->tric_dir[dim])
2399 rcs /= ddbox->skew_fac[dim];
2400 rcmbs /= ddbox->skew_fac[dim];
2403 /* Set the lower limit for the shifted zone dimensions */
2404 for (z = zone_start; z < zone_end; z++)
2406 if (zones->shift[z][dim] > 0)
2408 dim = dd->dim[d];
2409 if (!isDlbOn(dd->comm) || d == 0)
2411 zones->size[z].x0[dim] = comm->cell_x1[dim];
2412 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2414 else
2416 /* Here we take the lower limit of the zone from
2417 * the lowest domain of the zone below.
2419 if (z < 4)
2421 zones->size[z].x0[dim] =
2422 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
2424 else
2426 if (d == 1)
2428 zones->size[z].x0[dim] =
2429 zones->size[zone_perm[2][z-4]].x0[dim];
2431 else
2433 zones->size[z].x0[dim] =
2434 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
2438 /* A temporary limit; it is updated below */
2438 zones->size[z].x1[dim] = zones->size[z].x0[dim];
2440 if (bDistMB)
2442 for (zi = 0; zi < zones->nizone; zi++)
2444 if (zones->shift[zi][dim] == 0)
2446 /* This takes the whole zone into account.
2447 * With multiple pulses this will lead
2448 * to a larger zone than strictly necessary.
2450 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2451 zones->size[zi].x1[dim]+rcmbs);
2459 /* Loop over the i-zones to set the upper limit of each
2460 * j-zone they see.
2462 for (zi = 0; zi < zones->nizone; zi++)
2464 if (zones->shift[zi][dim] == 0)
2466 /* We should only use zones up to zone_end */
2467 int jZoneEnd = std::min(zones->izone[zi].j1, zone_end);
2468 for (z = zones->izone[zi].j0; z < jZoneEnd; z++)
2470 if (zones->shift[z][dim] > 0)
2472 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2473 zones->size[zi].x1[dim]+rcs);
2480 for (z = zone_start; z < zone_end; z++)
2482 /* Initialization only required to keep the compiler happy */
2483 rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
2484 int nc, c;
2486 /* To determine the bounding box for a zone we need to find
2487 * the extremes over 4, 2 or 1 corners.
2489 nc = 1 << (ddbox->nboundeddim - 1);
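/* nc = 4, 2 or 1 corners for 3, 2 or 1 bounded dimensions, respectively */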
2491 for (c = 0; c < nc; c++)
2493 /* Set up a zone corner at x=0, ignoring triclinic couplings */
2494 corner[XX] = 0;
2495 if ((c & 1) == 0)
2497 corner[YY] = zones->size[z].x0[YY];
2499 else
2501 corner[YY] = zones->size[z].x1[YY];
2503 if ((c & 2) == 0)
2505 corner[ZZ] = zones->size[z].x0[ZZ];
2507 else
2509 corner[ZZ] = zones->size[z].x1[ZZ];
2511 if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
2512 box[ZZ][1 - dd->dim[0]] != 0)
2514 /* With 1D domain decomposition the cg's are not in
2515 * the triclinic box, but triclinic x-y and rectangular y/x-z.
2516 * Shift the corner of the z-vector back to along the box
2517 * vector of dimension d, so it will later end up at 0 along d.
2518 * This can affect the location of this corner along dd->dim[0]
2519 * through the matrix operation below if box[d][dd->dim[0]]!=0.
2521 int d = 1 - dd->dim[0];
2523 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
2525 /* Apply the triclinic couplings */
2526 assert(ddbox->npbcdim <= DIM);
2527 for (i = YY; i < ddbox->npbcdim; i++)
2529 for (j = XX; j < i; j++)
2531 corner[j] += corner[i]*box[i][j]/box[i][i];
2534 if (c == 0)
2536 copy_rvec(corner, corner_min);
2537 copy_rvec(corner, corner_max);
2539 else
2541 for (i = 0; i < DIM; i++)
2543 corner_min[i] = std::min(corner_min[i], corner[i]);
2544 corner_max[i] = std::max(corner_max[i], corner[i]);
2548 /* Copy the extreme corners without offset along x */
2549 for (i = 0; i < DIM; i++)
2551 zones->size[z].bb_x0[i] = corner_min[i];
2552 zones->size[z].bb_x1[i] = corner_max[i];
2554 /* Add the offset along x */
2555 zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2556 zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2559 if (zone_start == 0)
2561 vol = 1;
2562 for (dim = 0; dim < DIM; dim++)
2564 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
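/* Group density of the home zone, excluding groups that are flagged to move away */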
2566 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone)/vol;
2569 if (debug)
2571 for (z = zone_start; z < zone_end; z++)
2573 fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
2575 zones->size[z].x0[XX], zones->size[z].x1[XX],
2576 zones->size[z].x0[YY], zones->size[z].x1[YY],
2577 zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
2578 fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
2580 zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
2581 zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
2582 zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
2587 /*! \brief Order data in \p dataToSort according to \p sort
2589 * Note: both buffers should have at least \p sort.size() elements.
2591 template <typename T>
2592 static void
2593 orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2594 gmx::ArrayRef<T> dataToSort,
2595 gmx::ArrayRef<T> sortBuffer)
2597 GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2598 GMX_ASSERT(sortBuffer.size() >= sort.size(), "The sorting buffer needs to be sufficiently large");
2600 /* Order the data into the temporary buffer */
2601 size_t i = 0;
2602 for (const gmx_cgsort_t &entry : sort)
2604 sortBuffer[i++] = dataToSort[entry.ind];
2607 /* Copy back to the original array */
2608 std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(),
2609 dataToSort.begin());
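/* Example of the ordering above: with sort entries whose ind values are {2, 0, 1} and dataToSort = {a, b, c}, the result is dataToSort = {c, a, b} */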
2612 /*! \brief Order data in \p dataToSort according to \p sort
2614 * Note: \p vectorToSort should have at least \p sort.size() elements,
2615 * \p workVector is resized when it is too small.
2617 template <typename T>
2618 static void
2619 orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2620 gmx::ArrayRef<T> vectorToSort,
2621 std::vector<T> *workVector)
2623 if (gmx::index(workVector->size()) < sort.ssize())
2625 workVector->resize(sort.size());
2627 orderVector<T>(sort, vectorToSort, *workVector);
2630 //! Order vectors of atoms.
2631 static void order_vec_atom(const gmx::RangePartitioning *atomGroups,
2632 gmx::ArrayRef<const gmx_cgsort_t> sort,
2633 gmx::ArrayRef<gmx::RVec> v,
2634 gmx::ArrayRef<gmx::RVec> buf)
2636 if (atomGroups == nullptr)
2638 /* Avoid the useless loop of the atoms within a cg */
2639 orderVector(sort, v, buf);
2641 return;
2644 /* Order the data */
2645 int a = 0;
2646 for (const gmx_cgsort_t &entry : sort)
2648 for (int i : atomGroups->block(entry.ind))
2650 copy_rvec(v[i], buf[a]);
2651 a++;
2654 int atot = a;
2656 /* Copy back to the original array */
2657 for (int a = 0; a < atot; a++)
2659 copy_rvec(buf[a], v[a]);
2663 //! Fills \p sort with the sorting order for atoms based on the nbnxn grid order
2664 static void dd_sort_order_nbnxn(const t_forcerec *fr,
2665 std::vector<gmx_cgsort_t> *sort)
2667 gmx::ArrayRef<const int> atomOrder = fr->nbv->getLocalAtomOrder();
2669 /* Using push_back() instead of this resize results in much slower code */
2670 sort->resize(atomOrder.size());
2671 gmx::ArrayRef<gmx_cgsort_t> buffer = *sort;
2672 size_t numSorted = 0;
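/* Only entries with a valid (non-negative) atom index contribute to the sort order */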
2673 for (int i : atomOrder)
2675 if (i >= 0)
2677 /* The values of nsc and ind_gl are not used in this case */
2678 buffer[numSorted++].ind = i;
2681 sort->resize(numSorted);
2684 //! Sorts the DD state according to the nbnxn grid atom order.
2685 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state)
2687 gmx_domdec_sort_t *sort = dd->comm->sort.get();
2689 dd_sort_order_nbnxn(fr, &sort->sorted);
2691 const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
2693 /* We alloc with the old size, since cgindex is still old */
2694 GMX_ASSERT(atomGrouping.numBlocks() == dd->ncg_home, "atomGroups and dd should be consistent");
2695 DDBufferAccess<gmx::RVec> rvecBuffer(dd->comm->rvecBuffer, atomGrouping.fullRange().end());
2697 const gmx::RangePartitioning *atomGroupsPtr = (dd->comm->bCGs ? &atomGrouping : nullptr);
2699 /* Set the new home atom/charge group count */
2700 dd->ncg_home = sort->sorted.size();
2701 if (debug)
2703 fprintf(debug, "Set the new home %s count to %d\n",
2704 dd->comm->bCGs ? "charge group" : "atom",
2705 dd->ncg_home);
2708 /* Reorder the state */
2709 gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2710 GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
2712 if (state->flags & (1 << estX))
2714 order_vec_atom(atomGroupsPtr, cgsort, state->x, rvecBuffer.buffer);
2716 if (state->flags & (1 << estV))
2718 order_vec_atom(atomGroupsPtr, cgsort, state->v, rvecBuffer.buffer);
2720 if (state->flags & (1 << estCGP))
2722 order_vec_atom(atomGroupsPtr, cgsort, state->cg_p, rvecBuffer.buffer);
2725 if (fr->cutoff_scheme == ecutsGROUP)
2727 /* Reorder cgcm */
2728 gmx::ArrayRef<gmx::RVec> cgcmRef = gmx::arrayRefFromArray(reinterpret_cast<gmx::RVec *>(cgcm[0]), cgsort.size());
2729 orderVector(cgsort, cgcmRef, rvecBuffer.buffer);
2732 /* Reorder the global cg index */
2733 orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2734 /* Reorder the cginfo */
2735 orderVector<int>(cgsort, gmx::arrayRefFromArray(fr->cginfo, cgsort.size()), &sort->intBuffer);
2736 /* Rebuild the local cg index */
2737 if (dd->comm->bCGs)
2739 /* We make a new, ordered atomGroups object and assign it to
2740 * the old one. This causes some allocation overhead, but saves
2741 * a copy back of the whole index.
2743 gmx::RangePartitioning ordered;
2744 for (const gmx_cgsort_t &entry : cgsort)
2746 ordered.appendBlock(atomGrouping.block(entry.ind).size());
2748 dd->atomGrouping_ = ordered;
2750 else
2752 dd->atomGrouping_.setAllBlocksSizeOne(dd->ncg_home);
2754 /* Set the home atom number */
2755 dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->atomGrouping().fullRange().end());
2757 /* The atoms are now exactly in grid order, update the grid order */
2758 fr->nbv->setLocalAtomOrder();
2761 //! Accumulates load statistics.
2762 static void add_dd_statistics(gmx_domdec_t *dd)
2764 gmx_domdec_comm_t *comm = dd->comm;
2766 for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2768 auto range = static_cast<DDAtomRanges::Type>(i);
2769 comm->sum_nat[i] +=
2770 comm->atomRanges.end(range) - comm->atomRanges.start(range);
2772 comm->ndecomp++;
2775 void reset_dd_statistics_counters(gmx_domdec_t *dd)
2777 gmx_domdec_comm_t *comm = dd->comm;
2779 /* Reset all the statistics and counters for total run counting */
2780 for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2782 comm->sum_nat[i] = 0;
2784 comm->ndecomp = 0;
2785 comm->nload = 0;
2786 comm->load_step = 0;
2787 comm->load_sum = 0;
2788 comm->load_max = 0;
2789 clear_ivec(comm->load_lim);
2790 comm->load_mdf = 0;
2791 comm->load_pme = 0;
2794 void print_dd_statistics(const t_commrec *cr, const t_inputrec *ir, FILE *fplog)
2796 gmx_domdec_comm_t *comm = cr->dd->comm;
2798 const int numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2799 gmx_sumd(numRanges, comm->sum_nat, cr);
2801 if (fplog == nullptr)
2803 return;
2806 fprintf(fplog, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
2808 for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2810 auto range = static_cast<DDAtomRanges::Type>(i);
2811 double av = comm->sum_nat[i]/comm->ndecomp;
2812 switch (range)
2814 case DDAtomRanges::Type::Zones:
2815 fprintf(fplog,
2816 " av. #atoms communicated per step for force: %d x %.1f\n",
2817 2, av);
2818 break;
2819 case DDAtomRanges::Type::Vsites:
2820 if (cr->dd->vsite_comm)
2822 fprintf(fplog,
2823 " av. #atoms communicated per step for vsites: %d x %.1f\n",
2824 (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
2825 av);
2827 break;
2828 case DDAtomRanges::Type::Constraints:
2829 if (cr->dd->constraint_comm)
2831 fprintf(fplog,
2832 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
2833 1 + ir->nLincsIter, av);
2835 break;
2836 default:
2837 gmx_incons(" Unknown type for DD statistics");
2840 fprintf(fplog, "\n");
2842 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
2844 print_dd_load_av(fplog, cr->dd);
2848 // TODO Remove fplog when group scheme and charge groups are gone
2849 void dd_partition_system(FILE *fplog,
2850 const gmx::MDLogger &mdlog,
2851 int64_t step,
2852 const t_commrec *cr,
2853 gmx_bool bMasterState,
2854 int nstglobalcomm,
2855 t_state *state_global,
2856 const gmx_mtop_t &top_global,
2857 const t_inputrec *ir,
2858 gmx::ImdSession *imdSession,
2859 t_state *state_local,
2860 PaddedVector<gmx::RVec> *f,
2861 gmx::MDAtoms *mdAtoms,
2862 gmx_localtop_t *top_local,
2863 t_forcerec *fr,
2864 gmx_vsite_t *vsite,
2865 gmx::Constraints *constr,
2866 t_nrnb *nrnb,
2867 gmx_wallcycle *wcycle,
2868 gmx_bool bVerbose)
2870 gmx_domdec_t *dd;
2871 gmx_domdec_comm_t *comm;
2872 gmx_ddbox_t ddbox = {0};
2873 t_block *cgs_gl;
2874 int64_t step_pcoupl;
2875 rvec cell_ns_x0, cell_ns_x1;
2876 int ncgindex_set, ncg_moved, nat_f_novirsum;
2877 gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
2878 gmx_bool bRedist;
2879 ivec np;
2880 real grid_density;
2881 char sbuf[22];
2883 wallcycle_start(wcycle, ewcDOMDEC);
2885 dd = cr->dd;
2886 comm = dd->comm;
2888 // TODO if the update code becomes accessible here, use
2889 // upd->deform for this logic.
2890 bBoxChanged = (bMasterState || inputrecDeform(ir));
2891 if (ir->epc != epcNO)
2893 /* With nstpcouple > 1, pressure coupling happens
2894 * one step after calculating the pressure.
2895 * Box scaling happens at the end of the MD step,
2896 * after the DD partitioning.
2897 * We therefore have to do DLB in the first partitioning
2898 * after an MD step where P-coupling occurred.
2899 * We need to determine the last step in which p-coupling occurred.
2900 * MRS -- need to validate this for vv?
2902 int n = ir->nstpcouple;
2903 if (n == 1)
2905 step_pcoupl = step - 1;
2907 else
2909 step_pcoupl = ((step - 1)/n)*n + 1;
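/* This is the latest step of the form k*n + 1 that does not exceed the current step */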
2911 if (step_pcoupl >= comm->partition_step)
2913 bBoxChanged = TRUE;
2917 bNStGlobalComm = (step % nstglobalcomm == 0);
2919 if (!isDlbOn(comm))
2921 bDoDLB = FALSE;
2923 else
2925 /* Should we do dynamic load balancing this step?
2926 * Since it requires (possibly expensive) global communication,
2927 * we might want to do DLB less frequently.
2929 if (bBoxChanged || ir->epc != epcNO)
2931 bDoDLB = bBoxChanged;
2933 else
2935 bDoDLB = bNStGlobalComm;
2939 /* Check if we have recorded loads on the nodes */
2940 if (comm->bRecordLoad && dd_load_count(comm) > 0)
2942 bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
2944 /* Print load every nstlog, first and last step to the log file */
2945 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
2946 comm->n_load_collect == 0 ||
2947 (ir->nsteps >= 0 &&
2948 (step + ir->nstlist > ir->init_step + ir->nsteps)));
2950 /* Avoid extra communication due to verbose screen output
2951 * when nstglobalcomm is set.
2953 if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
2954 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
2956 get_load_distribution(dd, wcycle);
2957 if (DDMASTER(dd))
2959 if (bLogLoad)
2961 GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step-1));
2963 if (bVerbose)
2965 dd_print_load_verbose(dd);
2968 comm->n_load_collect++;
2970 if (isDlbOn(comm))
2972 if (DDMASTER(dd))
2974 /* Add the measured cycles to the running average */
2975 const float averageFactor = 0.1f;
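/* Exponential moving average: with averageFactor = 0.1 the history spans roughly 10 measurements */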
2976 comm->cyclesPerStepDlbExpAverage =
2977 (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
2978 averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
2980 if (comm->dlbState == DlbState::onCanTurnOff &&
2981 dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
2983 gmx_bool turnOffDlb;
2984 if (DDMASTER(dd))
2986 /* If the running average of the cycles with DLB is higher
2987 * than before we turned on DLB, turn off DLB.
2988 * We will again run and check the cycles without DLB
2989 * and can then decide whether to turn off DLB forever.
2991 turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
2992 comm->cyclesPerStepBeforeDLB);
2994 dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
2995 if (turnOffDlb)
2997 /* To turn off DLB, we need to redistribute the atoms */
2998 dd_collect_state(dd, state_local, state_global);
2999 bMasterState = TRUE;
3000 turn_off_dlb(mdlog, dd, step);
3004 else if (bCheckWhetherToTurnDlbOn)
3006 gmx_bool turnOffDlbForever = FALSE;
3007 gmx_bool turnOnDlb = FALSE;
3009 /* Since the timings are node dependent, the master decides */
3010 if (DDMASTER(dd))
3012 /* If we recently turned off DLB, we want to check if
3013 * performance is better without DLB. We want to do this
3014 * ASAP, to minimize the chance that external factors that
3015 * slowed down the DLB step have disappeared by now and we
3016 * incorrectly conclude that DLB was causing the slowdown.
3017 * So we measure one nstlist block, no running average.
3019 if (comm->haveTurnedOffDlb &&
3020 comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
3021 comm->cyclesPerStepDlbExpAverage)
3023 /* After turning off DLB we ran nstlist steps in fewer
3024 * cycles than with DLB. This likely means that DLB
3025 * is not beneficial, but this could be due to a one-
3026 * time unlucky fluctuation, so we require two such
3027 * observations in close succession to turn off DLB
3028 * forever.
3030 if (comm->dlbSlowerPartitioningCount > 0 &&
3031 dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
3033 turnOffDlbForever = TRUE;
3035 comm->haveTurnedOffDlb = false;
3036 /* Register when we last measured DLB slowdown */
3037 comm->dlbSlowerPartitioningCount = dd->ddp_count;
3039 else
3041 /* Here we check if the max PME rank load is more than 0.98
3042 * the max PP force load. If so, PP DLB will not help,
3043 * since we are (almost) limited by PME. Furthermore,
3044 * DLB will cause a significant extra x/f redistribution
3045 * cost on the PME ranks, which will then surely result
3046 * in lower total performance.
3048 if (cr->npmenodes > 0 &&
3049 dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
3051 turnOnDlb = FALSE;
3053 else
3055 turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
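/* Pack both decisions into one struct so the master can broadcast them in a single call */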
3059 struct
3061 gmx_bool turnOffDlbForever;
3062 gmx_bool turnOnDlb;
3064 bools {
3065 turnOffDlbForever, turnOnDlb
3067 dd_bcast(dd, sizeof(bools), &bools);
3068 if (bools.turnOffDlbForever)
3070 turn_off_dlb_forever(mdlog, dd, step);
3072 else if (bools.turnOnDlb)
3074 turn_on_dlb(mdlog, dd, step);
3075 bDoDLB = TRUE;
3079 comm->n_load_have++;
3082 cgs_gl = &comm->cgs_gl;
3084 bRedist = FALSE;
3085 if (bMasterState)
3087 /* Clear the old state */
3088 clearDDStateIndices(dd, 0, 0);
3089 ncgindex_set = 0;
3091 auto xGlobal = positionsFromStatePointer(state_global);
3093 set_ddbox(*dd, true,
3094 DDMASTER(dd) ? state_global->box : nullptr,
3095 true, xGlobal,
3096 &ddbox);
3098 distributeState(mdlog, dd, top_global, state_global, ddbox, state_local, f);
3100 dd_make_local_cgs(dd, &top_local->cgs);
3102 /* Ensure that we have space for the new distribution */
3103 dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
3105 if (fr->cutoff_scheme == ecutsGROUP)
3107 calc_cgcm(fplog, 0, dd->ncg_home,
3108 &top_local->cgs, state_local->x.rvec_array(), fr->cg_cm);
3111 inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3113 dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr, comm->bLocalCG);
3115 else if (state_local->ddp_count != dd->ddp_count)
3117 if (state_local->ddp_count > dd->ddp_count)
3119 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64 ")", state_local->ddp_count, dd->ddp_count);
3122 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
3124 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
3127 /* Clear the old state */
3128 clearDDStateIndices(dd, 0, 0);
3130 /* Restore the atom group indices from state_local */
3131 restoreAtomGroups(dd, cgs_gl->index, state_local);
3132 make_dd_indices(dd, cgs_gl->index, 0);
3133 ncgindex_set = dd->ncg_home;
3135 if (fr->cutoff_scheme == ecutsGROUP)
3137 /* Redetermine the cg COMs */
3138 calc_cgcm(fplog, 0, dd->ncg_home,
3139 &top_local->cgs, state_local->x.rvec_array(), fr->cg_cm);
3142 inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3144 dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr, comm->bLocalCG);
3146 set_ddbox(*dd, bMasterState, state_local->box,
3147 true, state_local->x, &ddbox);
3149 bRedist = isDlbOn(comm);
3151 else
3153 /* We have the full state, only redistribute the cgs */
3155 /* Clear the non-home indices */
3156 clearDDStateIndices(dd, dd->ncg_home, comm->atomRanges.numHomeAtoms());
3157 ncgindex_set = 0;
3159 /* To avoid global communication, we do not recompute the extent
3160 * of the system for dims without pbc. Therefore we need to copy
3161 * the previously computed values when we do not communicate.
3163 if (!bNStGlobalComm)
3165 copy_rvec(comm->box0, ddbox.box0 );
3166 copy_rvec(comm->box_size, ddbox.box_size);
3168 set_ddbox(*dd, bMasterState, state_local->box,
3169 bNStGlobalComm, state_local->x, &ddbox);
3171 bBoxChanged = TRUE;
3172 bRedist = TRUE;
3174 /* Copy needed for dim's without pbc when avoiding communication */
3175 copy_rvec(ddbox.box0, comm->box0 );
3176 copy_rvec(ddbox.box_size, comm->box_size);
3178 set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(*dd), bMasterState, bDoDLB,
3179 step, wcycle);
3181 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
3183 write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
3186 if (comm->useUpdateGroups)
3188 comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3189 state_local->x);
3192 /* Check if we should sort the charge groups */
3193 const bool bSortCG = (bMasterState || bRedist);
3195 /* When repartitioning we mark atom groups that will move to neighboring
3196 * DD cells, but we do not move them right away for performance reasons.
3197 * Thus we need to keep track of how many charge groups will move for
3198 * obtaining correct local charge group / atom counts.
3200 ncg_moved = 0;
3201 if (bRedist)
3203 wallcycle_sub_start(wcycle, ewcsDD_REDIST);
3205 ncgindex_set = dd->ncg_home;
3206 dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
3207 state_local, f, fr,
3208 nrnb, &ncg_moved);
3210 GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
3212 if (comm->useUpdateGroups)
3214 comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3215 state_local->x);
3218 wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
3221 get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
3222 dd, &ddbox,
3223 &comm->cell_x0, &comm->cell_x1,
3224 dd->ncg_home, fr->cg_cm,
3225 cell_ns_x0, cell_ns_x1, &grid_density);
3227 if (bBoxChanged)
3229 comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
3232 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
3233 copy_ivec(ddbox.tric_dir, comm->tric_dir);
3235 if (bSortCG)
3237 wallcycle_sub_start(wcycle, ewcsDD_GRID);
3239 /* Sort the state on charge group position.
3240 * This enables exact restarts from this step.
3241 * It also improves performance by about 15% with larger numbers
3242 * of atoms per node.
3245 /* Fill the ns grid with the home cell,
3246 * so we can sort with the indices.
3248 set_zones_ncg_home(dd);
3250 set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3252 nbnxn_put_on_grid(fr->nbv.get(), state_local->box,
3254 comm->zones.size[0].bb_x0,
3255 comm->zones.size[0].bb_x1,
3256 comm->updateGroupsCog.get(),
3257 0, dd->ncg_home,
3258 comm->zones.dens_zone0,
3259 fr->cginfo,
3260 state_local->x,
3261 ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr);
3263 if (debug)
3265 fprintf(debug, "Step %s, sorting the %d home charge groups\n",
3266 gmx_step_str(step, sbuf), dd->ncg_home);
3268 dd_sort_state(dd, fr->cg_cm, fr, state_local);
3270 /* After sorting and compacting we set the correct size */
3271 dd_resize_state(state_local, f, comm->atomRanges.numHomeAtoms());
3273 /* Rebuild all the indices */
3274 dd->ga2la->clear();
3275 ncgindex_set = 0;
3277 wallcycle_sub_stop(wcycle, ewcsDD_GRID);
3279 else
3281 /* With the group scheme the sorting array is part of the DD state,
3282 * but it just got out of sync, so mark as invalid by emptying it.
3284 if (ir->cutoff_scheme == ecutsGROUP)
3286 comm->sort->sorted.clear();
3290 if (comm->useUpdateGroups)
3292 /* The update groups cog's are invalid after sorting
3293 * and need to be cleared before the next partitioning anyhow.
3295 comm->updateGroupsCog->clear();
3298 wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
3300 /* Set up the communication and communicate the coordinates */
3301 setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
3303 /* Set the indices */
3304 make_dd_indices(dd, cgs_gl->index, ncgindex_set);
3306 /* Set the charge group boundaries for neighbor searching */
3307 set_cg_boundaries(&comm->zones);
3309 if (fr->cutoff_scheme == ecutsVERLET)
3311 /* When bSortCG=true, we have already set the size for zone 0 */
3312 set_zones_size(dd, state_local->box, &ddbox,
3313 bSortCG ? 1 : 0, comm->zones.n,
3317 wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
3320 /* write_dd_pdb("dd_home", step, "dump", top_global, cr,
3321    -1, state_local->x.rvec_array(), state_local->box); */
3324 wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
3326 /* Extract a local topology from the global topology */
3327 for (int i = 0; i < dd->ndim; i++)
3329 np[dd->dim[i]] = comm->cd[i].numPulses();
3331 dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
3332 comm->cellsize_min, np,
3334 fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x.rvec_array(),
3335 top_global, top_local);
3337 wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
3339 wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
3341 /* Set up the special atom communication */
3342 int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3343 for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
3345 auto range = static_cast<DDAtomRanges::Type>(i);
3346 switch (range)
3348 case DDAtomRanges::Type::Vsites:
3349 if (vsite && vsite->numInterUpdategroupVsites)
3351 n = dd_make_local_vsites(dd, n, top_local->idef.il);
3353 break;
3354 case DDAtomRanges::Type::Constraints:
3355 if (dd->splitConstraints || dd->splitSettles)
3357 /* Only for inter-cg constraints do we need special code */
3358 n = dd_make_local_constraints(dd, n, &top_global, fr->cginfo,
3359 constr, ir->nProjOrder,
3360 top_local->idef.il);
3362 break;
3363 default:
3364 gmx_incons("Unknown special atom type setup");
3366 comm->atomRanges.setEnd(range, n);
3369 wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
3371 wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
3373 /* Make space for the extra coordinates for virtual site
3374 * or constraint communication.
3376 state_local->natoms = comm->atomRanges.numAtomsTotal();
3378 dd_resize_state(state_local, f, state_local->natoms);
3380 if (fr->haveDirectVirialContributions)
3382 if (vsite && vsite->numInterUpdategroupVsites)
3384 nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3386 else
3388 if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
3390 nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3392 else
3394 nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3398 else
3400 nat_f_novirsum = 0;
3403 /* Set the number of atoms required for the force calculation.
3404 * Forces need to be constrained when doing energy
3405 * minimization. For simple simulations we could avoid some
3406 * allocation, zeroing and copying, but this is probably not worth
3407 * the complications and checking.
3409 forcerec_set_ranges(fr, dd->ncg_home, dd->globalAtomGroupIndices.size(),
3410 comm->atomRanges.end(DDAtomRanges::Type::Zones),
3411 comm->atomRanges.end(DDAtomRanges::Type::Constraints),
3412 nat_f_novirsum);
3414 /* Update atom data for mdatoms and several algorithms */
3415 mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
3416 nullptr, mdAtoms, constr, vsite, nullptr);
3418 auto mdatoms = mdAtoms->mdatoms();
3419 if (!thisRankHasDuty(cr, DUTY_PME))
3421 /* Send the charges and/or c6/sigmas to our PME-only node */
3422 gmx_pme_send_parameters(cr,
3423 fr->ic,
3424 mdatoms->nChargePerturbed != 0, mdatoms->nTypePerturbed != 0,
3425 mdatoms->chargeA, mdatoms->chargeB,
3426 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
3427 mdatoms->sigmaA, mdatoms->sigmaB,
3428 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
3431 if (ir->bPull)
3433 /* Update the local pull groups */
3434 dd_make_local_pull_groups(cr, ir->pull_work);
3437 if (dd->atomSets != nullptr)
3439 /* Update the local atom sets */
3440 dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3443 /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
3444 imdSession->dd_make_local_IMD_atoms(dd);
3446 add_dd_statistics(dd);
3448 /* Make sure we only count the cycles for this DD partitioning */
3449 clear_dd_cycle_counts(dd);
3451 /* Because the order of the atoms might have changed since
3452 * the last vsite construction, we need to communicate the constructing
3453 * atom coordinates again (for spreading the forces this MD step).
3455 dd_move_x_vsites(dd, state_local->box, state_local->x.rvec_array());
3457 wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
3459 if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
3461 dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
3462 write_dd_pdb("dd_dump", step, "dump", &top_global, cr,
3463 -1, state_local->x.rvec_array(), state_local->box);
3466 /* Store the partitioning step */
3467 comm->partition_step = step;
3469 /* Increase the DD partitioning counter */
3470 dd->ddp_count++;
3471 /* The state currently matches this DD partitioning count, store it */
3472 state_local->ddp_count = dd->ddp_count;
3473 if (bMasterState)
3475 /* The DD master node knows the complete cg distribution,
3476 * store the count so we can possibly skip the cg info communication.
3478 comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3481 if (comm->DD_debug > 0)
3483 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3484 check_index_consistency(dd, top_global.natoms, ncg_mtop(&top_global),
3485 "after partitioning");
3488 wallcycle_stop(wcycle, ewcDOMDEC);
3491 /*! \brief Check whether bonded interactions are missing, if appropriate */
3492 void checkNumberOfBondedInteractions(const gmx::MDLogger &mdlog,
3493 t_commrec *cr,
3494 int totalNumberOfBondedInteractions,
3495 const gmx_mtop_t *top_global,
3496 const gmx_localtop_t *top_local,
3497 const t_state *state,
3498 bool *shouldCheckNumberOfBondedInteractions)
3500 if (*shouldCheckNumberOfBondedInteractions)
3502 if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
3504 dd_print_missing_interactions(mdlog, cr, totalNumberOfBondedInteractions, top_global, top_local, state); // Does not return
3506 *shouldCheckNumberOfBondedInteractions = false;