/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2008,2009,2010,2011,2012,2013,2014,2015,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file defines functions used by the domdec module
 * in its initial setup phase.
 *
 * \author Berk Hess <hess@kth.se>
 * \ingroup module_domdec
 */
#include "gmxpre.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>

#include <string>
#include <vector>

#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/perf_est.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
/*! \brief Margin for setting up the DD grid */
#define DD_GRID_MARGIN_PRES_SCALE 1.05
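/* With pressure coupling the box can shrink during the run; optimize_ncells()
 * below multiplies the minimum cell size by this 5% margin so that the grid
 * chosen at step 0 still satisfies the cell-size limit under moderate box
 * scaling. */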
/*! \brief Factorize \p n.
 *
 * \param[in]    n    Value to factorize
 * \param[out]   fac  Vector of factors (to be allocated in this function)
 * \param[out]   mfac Vector with the number of times each factor repeats in the factorization (to be allocated in this function)
 */
static void factorize(int               n,
                      std::vector<int> *fac,
                      std::vector<int> *mfac)
{
    if (n <= 0)
    {
        gmx_fatal(FARGS, "Can only factorize positive integers.");
    }

    /* Decompose n in factors */
    fac->clear();
    mfac->clear();
    int d = 2;
    while (n > 1)
    {
        while (n % d == 0)
        {
            if (fac->empty() || fac->back() != d)
            {
                fac->push_back(d);
                mfac->push_back(1);
            }
            else
            {
                mfac->back()++;
            }
            n /= d;
        }
        d++;
    }
}
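/* Example: factorize(12, &fac, &mfac) gives fac = {2, 3} and mfac = {2, 1},
 * since 12 = 2^2 * 3. Factors are returned in increasing order, so fac.back()
 * is the largest prime factor. */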
/*! \brief Return the largest prime factor of \p n */
static int largest_divisor(int n)
{
    std::vector<int> div;
    std::vector<int> mdiv;
    factorize(n, &div, &mdiv);

    return div.back();
}
/*! \brief Compute largest common divisor of \p n1 and \p n2 */
static int lcd(int n1, int n2)
{
    int d, i;

    d = 1;
    for (i = 2; (i <= n1 && i <= n2); i++)
    {
        if (n1 % i == 0 && n2 % i == 0)
        {
            d = i;
        }
    }

    return d;
}
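/* Example: lcd(12, 18) returns 6, the greatest common divisor; for coprime
 * arguments the loop never updates d and 1 is returned. */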
/*! \brief Returns TRUE when there are enough PME ranks for the ratio */
static gmx_bool fits_pme_ratio(int nrank_tot, int nrank_pme, float ratio)
{
    return (static_cast<double>(nrank_pme)/static_cast<double>(nrank_tot) > 0.95*ratio);
}
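/* The 0.95 factor gives a 5% margin: a candidate PME rank count is accepted
 * when its fraction of all ranks exceeds 95% of the estimated relative PME
 * load, so slightly "underpowered" PME rank counts are still allowed. */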
/*! \brief Returns TRUE when npme out of ntot ranks doing PME is expected to give reasonable performance */
static gmx_bool fits_pp_pme_perf(int ntot, int npme, float ratio)
{
    std::vector<int> div;
    std::vector<int> mdiv;
    factorize(ntot - npme, &div, &mdiv);

    int npp_root3  = gmx::roundToInt(std::cbrt(ntot - npme));
    int npme_root2 = gmx::roundToInt(std::sqrt(static_cast<double>(npme)));

    /* The check below gives a reasonable division:
     * factor 5 allowed at 5 or more PP ranks,
     * factor 7 allowed at 49 or more PP ranks.
     */
    if (div.back() > 3 + npp_root3)
    {
        return FALSE;
    }

    /* Check if the number of PP and PME ranks have a reasonably sized
     * divisor in common, such that we can use 2D PME decomposition
     * when required (which requires nx_pp == nx_pme).
     * The factor of 2 allows for a maximum ratio of 2^2=4
     * between nx_pme and ny_pme.
     */
    if (lcd(ntot - npme, npme)*2 < npme_root2)
    {
        return FALSE;
    }

    /* Does this division give a reasonable PME load? */
    return fits_pme_ratio(ntot, npme, ratio);
}
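/* Worked example: ntot = 16, npme = 4 gives 12 PP ranks with prime factors
 * {2, 3}; the largest factor 3 does not exceed 3 + round(cbrt(12)) = 5, and
 * lcd(12, 4)*2 = 8 is not below round(sqrt(4)) = 2, so the split is accepted
 * whenever the estimated relative PME load is below 0.25/0.95, about 0.26. */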
/*! \brief Make a guess for the number of PME ranks to use. */
static int guess_npme(const gmx::MDLogger &mdlog,
                      const gmx_mtop_t *mtop, const t_inputrec *ir,
                      const matrix box,
                      int nrank_tot)
{
    float ratio;
    int   npme;

    ratio = pme_load_estimate(mtop, ir, box);

    GMX_LOG(mdlog.info).appendTextFormatted(
            "Guess for relative PME load: %.2f", ratio);

    /* We assume the optimal rank ratio is close to the load ratio.
     * The communication load is neglected,
     * but (hopefully) this will balance out between PP and PME.
     */
    if (!fits_pme_ratio(nrank_tot, nrank_tot/2, ratio))
    {
        /* We would need more than nrank_tot/2 PME only nodes,
         * which is not possible. Since the PME load is very high,
         * we will not lose much performance when all ranks do PME.
         */
        return 0;
    }

    /* First try to find npme as a factor of nrank_tot up to nrank_tot/3.
     * We start with a minimum PME node fraction of 1/16
     * and avoid ratios which lead to large prime factors in nnodes-npme.
     */
    npme = (nrank_tot + 15)/16;
    while (npme <= nrank_tot/3)
    {
        if (nrank_tot % npme == 0)
        {
            /* Note that fits_perf might change the PME grid,
             * in the current implementation it does not.
             */
            if (fits_pp_pme_perf(nrank_tot, npme, ratio))
            {
                break;
            }
        }
        npme++;
    }
    if (npme > nrank_tot/3)
    {
        /* Try any possible number for npme */
        npme = 1;
        while (npme <= nrank_tot/2)
        {
            /* Note that fits_perf may change the PME grid */
            if (fits_pp_pme_perf(nrank_tot, npme, ratio))
            {
                break;
            }
            npme++;
        }
    }
    if (npme > nrank_tot/2)
    {
        gmx_fatal(FARGS, "Could not find an appropriate number of separate PME ranks, i.e. >= %5f*#ranks (%d) and <= #ranks/2 (%d), that also gives reasonable performance (grid_x=%d, grid_y=%d).\n"
                  "Use the -npme option of mdrun or change the number of ranks or the PME grid dimensions, see the manual for details.",
                  ratio, gmx::roundToInt(0.95*ratio*nrank_tot), nrank_tot/2, ir->nkx, ir->nky);
    }
    else
    {
        GMX_LOG(mdlog.info).appendTextFormatted(
                "Will use %d particle-particle and %d PME only ranks\n"
                "This is a guess, check the performance at the end of the log file",
                nrank_tot - npme, npme);
    }

    return npme;
}
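/* Search order illustration: with 64 total ranks the scan starts at
 * npme = (64 + 15)/16 = 4 and first considers only divisors of 64 up to
 * 64/3 = 21; only if none of those passes fits_pp_pme_perf() is every value
 * up to 64/2 = 32 tried. */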
/*! \brief Return \p n divided by \p f rounded up to the next integer. */
static int div_up(int n, int f)
{
    return (n + f - 1)/f;
}
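/* Example: div_up(10, 4) returns 3, whereas plain integer division 10/4
 * would give 2. */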
real comm_box_frac(const ivec dd_nc, real cutoff, const gmx_ddbox_t *ddbox)
{
    int  i, j, k;
    rvec nw;
    real comm_vol;

    for (i = 0; i < DIM; i++)
    {
        real bt = ddbox->box_size[i]*ddbox->skew_fac[i];
        nw[i] = dd_nc[i]*cutoff/bt;
    }

    comm_vol = 0;
    for (i = 0; i < DIM; i++)
    {
        if (dd_nc[i] > 1)
        {
            comm_vol += nw[i];
            for (j = i+1; j < DIM; j++)
            {
                if (dd_nc[j] > 1)
                {
                    comm_vol += nw[i]*nw[j]*M_PI/4;
                    for (k = j+1; k < DIM; k++)
                    {
                        if (dd_nc[k] > 1)
                        {
                            comm_vol += nw[i]*nw[j]*nw[k]*M_PI/6;
                        }
                    }
                }
            }
        }
    }

    return comm_vol;
}
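/* Interpretation: nw[i] is the cut-off distance measured in cell widths along
 * dimension i. The sum estimates the fraction of a home cell's volume that has
 * to be communicated: planar slabs (nw[i]) for each decomposed dimension,
 * quarter-cylinder edge regions (pi/4 * nw[i]*nw[j]) and octant-sphere corner
 * regions (pi/6 * nw[i]*nw[j]*nw[k]), matching the communication volume of the
 * eighth-shell domain decomposition. */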
/*! \brief Return whether the DD is inhomogeneous in the z direction */
static gmx_bool inhomogeneous_z(const t_inputrec *ir)
{
    return ((EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) &&
            ir->ePBC == epbcXYZ && ir->ewald_geometry == eewg3DC);
}
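/* ewald-geometry=3dc (eewg3DC) is typically used for slab systems with a
 * vacuum gap along z, so the particle density along z is far from uniform;
 * decomposing such a system in z would give strongly imbalanced (or empty)
 * domains, which is why the code below refuses nc[ZZ] > 1 in this case. */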
/*! \brief Estimate cost of PME FFT communication
 *
 * This only takes the communication into account and not imbalance
 * in the calculation. But the imbalance in communication and calculation
 * are similar and therefore these formulas also prefer load balance
 * in the FFT and pme_solve calculation.
 */
static float comm_pme_cost_vol(int npme, int a, int b, int c)
{
    /* We use a float here, since an integer might overflow */
    float comm_vol;

    comm_vol  = npme - 1;
    comm_vol *= npme;
    comm_vol *= div_up(a, npme);
    comm_vol *= div_up(b, npme);
    comm_vol *= c;

    return comm_vol;
}
/*! \brief Estimate cost of communication for a possible domain decomposition. */
static float comm_cost_est(real limit, real cutoff,
                           const matrix box, const gmx_ddbox_t *ddbox,
                           int natoms, const t_inputrec *ir,
                           float pbcdxr,
                           int npme_tot, ivec nc)
{
    ivec  npme = {1, 1, 1};
    int   i, j, nk, overlap;
    rvec  bt;
    float comm_vol, comm_vol_xf, comm_pme, cost_pbcdx;
    /* This is the cost of a pbc_dx call relative to the cost
     * of communicating the coordinate and force of an atom.
     * This will be machine dependent.
     * These factors are for x86 with SMP or Infiniband.
     */
    float pbcdx_rect_fac = 0.1;
    float pbcdx_tric_fac = 0.2;
    float temp;

    /* Check the DD algorithm restrictions */
    if ((ir->ePBC == epbcXY && ir->nwall < 2 && nc[ZZ] > 1) ||
        (ir->ePBC == epbcSCREW && (nc[XX] == 1 || nc[YY] > 1 || nc[ZZ] > 1)))
    {
        return -1;
    }

    if (inhomogeneous_z(ir) && nc[ZZ] > 1)
    {
        return -1;
    }

    assert(ddbox->npbcdim <= DIM);

    /* Check if the triclinic requirements are met */
    for (i = 0; i < DIM; i++)
    {
        for (j = i+1; j < ddbox->npbcdim; j++)
        {
            if (box[j][i] != 0 || ir->deform[j][i] != 0 ||
                (ir->epc != epcNO && ir->compress[j][i] != 0))
            {
                if (nc[j] > 1 && nc[i] == 1)
                {
                    return -1;
                }
            }
        }
    }

    for (i = 0; i < DIM; i++)
    {
        bt[i] = ddbox->box_size[i]*ddbox->skew_fac[i];

        /* Without PBC and with 2 cells, there are no lower limits on the cell size */
        if (!(i >= ddbox->npbcdim && nc[i] <= 2) && bt[i] < nc[i]*limit)
        {
            return -1;
        }

        /* With PBC, check if the cut-off fits in nc[i]-1 cells */
        if (i < ddbox->npbcdim && nc[i] > 1 && (nc[i] - 1)*bt[i] < nc[i]*cutoff)
        {
            return -1;
        }
    }

    if (npme_tot > 1)
    {
        /* The following choices should match those
         * in init_domain_decomposition in domdec.c.
         */
        if (nc[XX] == 1 && nc[YY] > 1)
        {
            npme[XX] = 1;
            npme[YY] = npme_tot;
        }
        else if (nc[YY] == 1)
        {
            npme[XX] = npme_tot;
            npme[YY] = 1;
        }
        else
        {
            /* Will we use 1D or 2D PME decomposition? */
            npme[XX] = (npme_tot % nc[XX] == 0) ? nc[XX] : npme_tot;
            npme[YY] = npme_tot/npme[XX];
        }
    }

    if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
    {
        /* Check the PME grid restrictions.
         * Currently these can only be invalid here with too few grid lines
         * along the x dimension per rank doing PME.
         */
        int npme_x = (npme_tot > 1 ? npme[XX] : nc[XX]);

        /* Currently we don't have the OpenMP thread count available here.
         * But with threads we have only tighter restrictions and it's
         * probably better anyhow to avoid settings where we need to reduce
         * grid lines over multiple ranks, as the thread check will do.
         */
        bool useThreads     = true;
        bool errorsAreFatal = false;
        if (!gmx_pme_check_restrictions(ir->pme_order, ir->nkx, ir->nky, ir->nkz,
                                        npme_x, useThreads, errorsAreFatal))
        {
            return -1;
        }
    }

    /* When two dimensions are (nearly) equal, use more cells
     * for the smallest index, so the decomposition does not
     * depend sensitively on the rounding of the box elements.
     */
    for (i = 0; i < DIM; i++)
    {
        for (j = i+1; j < DIM; j++)
        {
            /* Check if the box size is nearly identical,
             * in that case we prefer nx > ny and ny > nz.
             */
            if (std::fabs(bt[j] - bt[i]) < 0.01*bt[i] && nc[j] > nc[i])
            {
                /* The XX/YY check is a bit compact. If nc[YY]==npme[YY]
                 * this means the swapped nc has nc[XX]==npme[XX],
                 * and we can also swap X and Y for PME.
                 */
                /* Check if dimension i and j are equivalent for PME.
                 * For x/y: if nc[YY]!=npme[YY], we can not swap x/y
                 * For y/z: we can not have PME decomposition in z
                 */
                if (npme_tot <= 1 ||
                    !((i == XX && j == YY && nc[YY] != npme[YY]) ||
                      (i == YY && j == ZZ && npme[YY] > 1)))
                {
                    return -1;
                }
            }
        }
    }

    /* This function determines only half of the communication cost.
     * All PP, PME and PP-PME communication is symmetric
     * and the "back"-communication cost is identical to the forward cost.
     */

    comm_vol = comm_box_frac(nc, cutoff, ddbox);

    comm_pme = 0;
    for (i = 0; i < 2; i++)
    {
        /* Determine the largest volume for PME x/f redistribution */
        if (nc[i] % npme[i] != 0)
        {
            if (nc[i] > npme[i])
            {
                comm_vol_xf = (npme[i] == 2 ? 1.0/3.0 : 0.5);
            }
            else
            {
                comm_vol_xf = 1.0 - lcd(nc[i], npme[i])/static_cast<double>(npme[i]);
            }
            comm_pme += 3*natoms*comm_vol_xf;
        }

        /* Grid overlap communication */
        if (npme[i] > 1)
        {
            nk        = (i == 0 ? ir->nkx : ir->nky);
            overlap   = (nk % npme[i] == 0 ? ir->pme_order-1 : ir->pme_order);
            temp      = npme[i];
            temp     *= overlap;
            temp     *= ir->nkx;
            temp     *= ir->nky;
            temp     *= ir->nkz;
            temp     /= nk;
            comm_pme += temp;
/* Old line comm_pme += npme[i]*overlap*ir->nkx*ir->nky*ir->nkz/nk; */
        }
    }

    comm_pme += comm_pme_cost_vol(npme[YY], ir->nky, ir->nkz, ir->nkx);
    comm_pme += comm_pme_cost_vol(npme[XX], ir->nkx, ir->nky, ir->nkz);

    /* Add cost of pbc_dx for bondeds */
    cost_pbcdx = 0;
    if ((nc[XX] == 1 || nc[YY] == 1) || (nc[ZZ] == 1 && ir->ePBC != epbcXY))
    {
        if ((ddbox->tric_dir[XX] && nc[XX] == 1) ||
            (ddbox->tric_dir[YY] && nc[YY] == 1))
        {
            cost_pbcdx = pbcdxr*pbcdx_tric_fac;
        }
        else
        {
            cost_pbcdx = pbcdxr*pbcdx_rect_fac;
        }
    }

    if (debug)
    {
        fprintf(debug,
                "nc %2d %2d %2d %2d %2d vol pp %6.4f pbcdx %6.4f pme %9.3e tot %9.3e\n",
                nc[XX], nc[YY], nc[ZZ], npme[XX], npme[YY],
                comm_vol, cost_pbcdx, comm_pme/(3*natoms),
                comm_vol + cost_pbcdx + comm_pme/(3*natoms));
    }

    return 3*natoms*(comm_vol + cost_pbcdx) + comm_pme;
}
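/* The returned estimate counts communicated real values: the PP halo fraction
 * and the pbc_dx penalty are scaled by 3*natoms (x, y and z per atom), and the
 * PME terms count redistributed coordinates/forces and transferred FFT-grid
 * points. A negative return value marks a decomposition that violates one of
 * the restrictions above; assign_factors() below only keeps non-negative
 * costs. */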
/*! \brief Assign penalty factors to possible domain decompositions, based on the estimated communication costs. */
static void assign_factors(const gmx_domdec_t *dd,
                           real limit, real cutoff,
                           const matrix box, const gmx_ddbox_t *ddbox,
                           int natoms, const t_inputrec *ir,
                           float pbcdxr, int npme,
                           int ndiv, const int *div, const int *mdiv,
                           ivec ir_try, ivec opt)
{
    int   x, y, i;
    float ce;

    if (ndiv == 0)
    {
        ce = comm_cost_est(limit, cutoff, box, ddbox,
                           natoms, ir, pbcdxr, npme, ir_try);
        if (ce >= 0 && (opt[XX] == 0 ||
                        ce < comm_cost_est(limit, cutoff, box, ddbox,
                                           natoms, ir, pbcdxr, npme, opt)))
        {
            copy_ivec(ir_try, opt);
        }

        return;
    }

    for (x = mdiv[0]; x >= 0; x--)
    {
        for (i = 0; i < x; i++)
        {
            ir_try[XX] *= div[0];
        }
        for (y = mdiv[0]-x; y >= 0; y--)
        {
            for (i = 0; i < y; i++)
            {
                ir_try[YY] *= div[0];
            }
            for (i = 0; i < mdiv[0]-x-y; i++)
            {
                ir_try[ZZ] *= div[0];
            }

            /* Recurse with the remaining prime factors */
            assign_factors(dd, limit, cutoff, box, ddbox, natoms, ir, pbcdxr, npme,
                           ndiv-1, div+1, mdiv+1, ir_try, opt);

            for (i = 0; i < mdiv[0]-x-y; i++)
            {
                ir_try[ZZ] /= div[0];
            }
            for (i = 0; i < y; i++)
            {
                ir_try[YY] /= div[0];
            }
        }
        for (i = 0; i < x; i++)
        {
            ir_try[XX] /= div[0];
        }
    }
}
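/* The recursion enumerates every way of distributing the multiplicity of each
 * prime factor over x, y and z. For npp = 12 = 2^2 * 3 it therefore evaluates
 * all 18 ordered grids (nx, ny, nz) with nx*ny*nz = 12, keeping in opt the one
 * with the lowest comm_cost_est() that satisfies all restrictions. */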
/*! \brief Determine the optimal distribution of DD cells for the simulation system and number of MPI ranks */
static real optimize_ncells(const gmx::MDLogger &mdlog,
                            int nnodes_tot, int npme_only,
                            gmx_bool bDynLoadBal, real dlb_scale,
                            const gmx_mtop_t *mtop,
                            const matrix box, const gmx_ddbox_t *ddbox,
                            const t_inputrec *ir,
                            gmx_domdec_t *dd,
                            real cellsize_limit, real cutoff,
                            const bool haveInterDomainBondeds,
                            ivec nc)
{
    int    npp, npme, d, nmax;
    double pbcdxr;
    real   limit;
    ivec   itry;

    limit = cellsize_limit;

    npp = nnodes_tot - npme_only;
    if (EEL_PME(ir->coulombtype))
    {
        npme = (npme_only > 0 ? npme_only : npp);
    }
    else
    {
        npme = 0;
    }

    if (haveInterDomainBondeds)
    {
        /* If we can skip PBC for distance calculations in plain-C bondeds,
         * we can save some time (e.g. 3D DD with pbc=xyz).
         * Here we ignore SIMD bondeds as they always do (fast) PBC.
         */
        count_bonded_distances(mtop, ir, &pbcdxr, nullptr);
        pbcdxr /= static_cast<double>(mtop->natoms);
    }
    else
    {
        /* Every molecule is a single charge group: no pbc required */
        pbcdxr = 0;
    }

    /* Add a margin for DLB and/or pressure scaling */
    if (bDynLoadBal)
    {
        if (dlb_scale >= 1.0)
        {
            gmx_fatal(FARGS, "The value for option -dds should be smaller than 1");
        }
        GMX_LOG(mdlog.info).appendTextFormatted(
                "Scaling the initial minimum size with 1/%g (option -dds) = %g",
                dlb_scale, 1/dlb_scale);
        limit /= dlb_scale;
    }
    else if (ir->epc != epcNO)
    {
        GMX_LOG(mdlog.info).appendTextFormatted(
                "To account for pressure scaling, scaling the initial minimum size with %g",
                DD_GRID_MARGIN_PRES_SCALE);
        limit *= DD_GRID_MARGIN_PRES_SCALE;
    }

    GMX_LOG(mdlog.info).appendTextFormatted(
            "Optimizing the DD grid for %d cells with a minimum initial size of %.3f nm",
            npp, limit);

    if (inhomogeneous_z(ir))
    {
        GMX_LOG(mdlog.info).appendTextFormatted(
                "Ewald_geometry=%s: assuming inhomogeneous particle distribution in z, will not decompose in z.",
                eewg_names[ir->ewald_geometry]);
    }

    if (limit > 0)
    {
        std::string maximumCells = "The maximum allowed number of cells is:";
        for (d = 0; d < DIM; d++)
        {
            nmax = static_cast<int>(ddbox->box_size[d]*ddbox->skew_fac[d]/limit);
            if (d >= ddbox->npbcdim && nmax < 2)
            {
                nmax = 2;
            }
            if (d == ZZ && inhomogeneous_z(ir))
            {
                nmax = 1;
            }
            maximumCells += gmx::formatString(" %c %d", 'X' + d, nmax);
        }
        GMX_LOG(mdlog.info).appendText(maximumCells);
    }

    if (debug)
    {
        fprintf(debug, "Average nr of pbc_dx calls per atom %.2f\n", pbcdxr);
    }

    /* Decompose npp in factors */
    std::vector<int> div;
    std::vector<int> mdiv;
    factorize(npp, &div, &mdiv);

    itry[XX] = 1;
    itry[YY] = 1;
    itry[ZZ] = 1;
    clear_ivec(nc);
    assign_factors(dd, limit, cutoff, box, ddbox, mtop->natoms, ir, pbcdxr,
                   npme, div.size(), div.data(), mdiv.data(), itry, nc);

    return limit;
}
real dd_choose_grid(const gmx::MDLogger &mdlog,
                    t_commrec *cr, gmx_domdec_t *dd,
                    const t_inputrec *ir,
                    const gmx_mtop_t *mtop,
                    const matrix box, const gmx_ddbox_t *ddbox,
                    int nPmeRanks,
                    gmx_bool bDynLoadBal, real dlb_scale,
                    real cellsize_limit, real cutoff_dd,
                    const bool haveInterDomainBondeds)
{
    int64_t nnodes_div, ldiv;
    real    limit;

    if (MASTER(cr))
    {
        nnodes_div = cr->nnodes;
        if (EEL_PME(ir->coulombtype))
        {
            if (nPmeRanks > 0)
            {
                if (nPmeRanks >= cr->nnodes)
                {
                    gmx_fatal(FARGS,
                              "Cannot have %d separate PME ranks with just %d total ranks",
                              nPmeRanks, cr->nnodes);
                }

                /* If the user purposely selected the number of PME nodes,
                 * only check for large primes in the PP node count.
                 */
                nnodes_div -= nPmeRanks;
            }
        }
        else
        {
            cr->npmenodes = 0;
        }

        if (nnodes_div > 12)
        {
            ldiv = largest_divisor(nnodes_div);
            /* Check if the largest divisor is more than nnodes^2/3 */
            if (ldiv*ldiv*ldiv > nnodes_div*nnodes_div)
            {
                gmx_fatal(FARGS, "The number of ranks you selected (%" PRId64 ") contains a large prime factor %" PRId64 ". In most cases this will lead to bad performance. Choose a number with smaller prime factors or set the decomposition (option -dd) manually.",
                          nnodes_div, ldiv);
            }
        }

        if (EEL_PME(ir->coulombtype))
        {
            if (nPmeRanks < 0)
            {
                /* Use PME nodes when the number of nodes is more than 16 */
                if (cr->nnodes <= 18)
                {
                    cr->npmenodes = 0;
                    GMX_LOG(mdlog.info).appendTextFormatted(
                            "Using %d separate PME ranks, as there are too few total\n"
                            " ranks for efficient splitting",
                            cr->npmenodes);
                }
                else
                {
                    cr->npmenodes = guess_npme(mdlog, mtop, ir, box, cr->nnodes);
                    GMX_LOG(mdlog.info).appendTextFormatted(
                            "Using %d separate PME ranks, as guessed by mdrun", cr->npmenodes);
                }
            }
            else
            {
                /* We checked above that nPmeRanks is a valid number */
                cr->npmenodes = nPmeRanks;
                GMX_LOG(mdlog.info).appendTextFormatted(
                        "Using %d separate PME ranks", cr->npmenodes);
                // TODO: there was a ", per user request" note here, but it's not correct anymore,
                // as with GPUs decision about nPmeRanks can be made in runner() as well.
                // Consider a single spot for setting nPmeRanks.
            }
        }

        limit = optimize_ncells(mdlog, cr->nnodes, cr->npmenodes,
                                bDynLoadBal, dlb_scale,
                                mtop, box, ddbox, ir, dd,
                                cellsize_limit, cutoff_dd,
                                haveInterDomainBondeds,
                                dd->nc);
    }
    else
    {
        limit = 0;
    }

    /* Communicate the information set by the master to all nodes */
    gmx_bcast(sizeof(dd->nc), dd->nc, cr);
    if (EEL_PME(ir->coulombtype))
    {
        gmx_bcast(sizeof(cr->npmenodes), &cr->npmenodes, cr);
    }
    else
    {
        cr->npmenodes = 0;
    }

    return limit;
}