/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "config.h"

#include <cstdarg>
#include <cstdlib>

#include "gromacs/commandline/filenm.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/utility/basenetwork.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/futil.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/mpiinplacebuffers.h"
#include "gromacs/utility/real.h"
#include "gromacs/utility/smalloc.h"
/* The source code in this file should be thread-safe.
   Please keep it that way. */
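
/* Fill the rank and communicator fields of an existing t_commrec from the
 * already-initialized MPI environment (rank and size of MPI_COMM_WORLD). */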
void gmx_fill_commrec_from_mpi(t_commrec *cr)
{
#if !GMX_MPI
    gmx_call("gmx_fill_commrec_from_mpi");
#else
    if (!gmx_mpi_initialized())
    {
        gmx_comm("MPI has not been initialized properly");
    }

    cr->nnodes           = gmx_node_num();
    cr->nodeid           = gmx_node_rank();
    cr->sim_nodeid       = cr->nodeid;
    cr->mpi_comm_mysim   = MPI_COMM_WORLD;
    cr->mpi_comm_mygroup = MPI_COMM_WORLD;
#endif
}
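
/* Allocate and set up a new t_commrec for this process. With a library MPI
 * build the rank information comes from MPI_COMM_WORLD; the mpb buffers are
 * prepared as a fallback for MPI implementations without MPI_IN_PLACE. */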
t_commrec *init_commrec()
{
    t_commrec *cr;

    snew(cr, 1);

#if GMX_LIB_MPI
    gmx_fill_commrec_from_mpi(cr);
#else
    cr->mpi_comm_mysim   = MPI_COMM_NULL;
    cr->mpi_comm_mygroup = MPI_COMM_NULL;
    cr->nnodes           = 1;
    cr->sim_nodeid       = 0;
    cr->nodeid           = cr->sim_nodeid;
#endif

    // TODO cr->duty should not be initialized here
    cr->duty = (DUTY_PP | DUTY_PME);

#if GMX_MPI && !MPI_IN_PLACE_EXISTS
    /* initialize the MPI_IN_PLACE replacement buffers */
    snew(cr->mpb, 1);
    cr->mpb->ibuf        = nullptr;
    cr->mpb->libuf       = nullptr;
    cr->mpb->fbuf        = nullptr;
    cr->mpb->dbuf        = nullptr;
    cr->mpb->ibuf_alloc  = 0;
    cr->mpb->libuf_alloc = 0;
    cr->mpb->fbuf_alloc  = 0;
    cr->mpb->dbuf_alloc  = 0;
#endif

    return cr;
}
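
/* Release the resources owned by a t_commrec. Note that the communicators
 * themselves are intentionally not freed yet; see the TODO inside. */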
void done_commrec(t_commrec *cr)
{
    if (MASTER(cr))
    {
        if (nullptr != cr->dd)
        {
            // TODO: implement
            // done_domdec(cr->dd);
        }
        done_mpi_in_place_buf(cr->mpb);
    }
#if GMX_MPI
    // TODO We need to be able to free communicators, but the
    // structure of the commrec and domdec initialization code makes
    // it hard to avoid both leaks and double frees.
    bool mySimIsMyGroup = (cr->mpi_comm_mysim == cr->mpi_comm_mygroup);
    if (cr->mpi_comm_mysim != MPI_COMM_NULL &&
        cr->mpi_comm_mysim != MPI_COMM_WORLD)
    {
        // TODO see above
        // MPI_Comm_free(&cr->mpi_comm_mysim);
    }
    if (!mySimIsMyGroup &&
        cr->mpi_comm_mygroup != MPI_COMM_NULL &&
        cr->mpi_comm_mygroup != MPI_COMM_WORLD)
    {
        // TODO see above
        // MPI_Comm_free(&cr->mpi_comm_mygroup);
    }
#endif
    sfree(cr);
}
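
/* With thread-MPI, each simulation thread gets its own copy of the commrec,
 * re-filled with that thread's rank and communicator information. */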
t_commrec *reinitialize_commrec_for_this_thread(const t_commrec *cro)
{
#if GMX_THREAD_MPI
    t_commrec *cr;

    /* make a thread-specific commrec */
    snew(cr, 1);
    /* now copy the whole thing, so settings like the number of PME nodes
       get propagated */
    *cr = *cro;

    /* and we start setting our own thread-specific values for things */
    gmx_fill_commrec_from_mpi(cr);

    // TODO cr->duty should not be initialized here
    cr->duty = (DUTY_PP | DUTY_PME);

    return cr;
#else
    GMX_UNUSED_VALUE(cro);
    return nullptr;
#endif
}
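
/* Split the group communicator into an intra-node and an inter-node
 * communicator, so that global sums can be done in two steps: first within
 * each physical node, then between the node masters. */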
void gmx_setup_nodecomm(FILE gmx_unused *fplog, t_commrec *cr)
{
    gmx_nodecomm_t *nc;

    /* Many MPI implementations do not optimize MPI_Allreduce
     * (and probably also other global communication calls)
     * for multi-core nodes connected by a network.
     * We can optimize such communication by using one MPI call
     * within each node and one between the nodes.
     * For MVAPICH2 and Intel MPI this reduces the time for
     * the global_stat communication by 25%
     * for 2x2-core 3 GHz Woodcrest connected by mixed DDR/SDR Infiniband.
     * B. Hess, November 2007
     */

    nc = &cr->nc;

    nc->bUse = FALSE;
#if !GMX_THREAD_MPI
#if GMX_MPI
    int n, rank;

    // TODO PhysicalNodeCommunicator could be extended/used to handle
    // the need for per-node per-group communicators.
    MPI_Comm_size(cr->mpi_comm_mygroup, &n);
    MPI_Comm_rank(cr->mpi_comm_mygroup, &rank);

    int nodehash = gmx_physicalnode_id_hash();

    if (debug)
    {
        fprintf(debug, "In gmx_setup_nodecomm: splitting communicator of size %d\n", n);
    }

    /* The intra-node communicator, split on node number */
    MPI_Comm_split(cr->mpi_comm_mygroup, nodehash, rank, &nc->comm_intra);
    MPI_Comm_rank(nc->comm_intra, &nc->rank_intra);
    if (debug)
    {
        fprintf(debug, "In gmx_setup_nodecomm: node ID %d rank within node %d\n",
                rank, nc->rank_intra);
    }
    /* The inter-node communicator, split on rank_intra.
     * We actually only need the one for rank=0,
     * but it is easier to create them all.
     */
    MPI_Comm_split(cr->mpi_comm_mygroup, nc->rank_intra, rank, &nc->comm_inter);
    /* Check if this really created two step communication */
    int ng, ni;

    MPI_Comm_size(nc->comm_inter, &ng);
    MPI_Comm_size(nc->comm_intra, &ni);
    if (debug)
    {
        fprintf(debug, "In gmx_setup_nodecomm: groups %d, my group size %d\n",
                ng, ni);
    }

    if (getenv("GMX_NO_NODECOMM") == nullptr &&
        ((ng > 1 && ng < n) || (ni > 1 && ni < n)))
    {
        nc->bUse = TRUE;
        if (fplog)
        {
            fprintf(fplog, "Using two step summing over %d groups of on average %.1f ranks\n\n",
                    ng, (real)n/(real)ng);
        }
        if (nc->rank_intra > 0)
        {
            MPI_Comm_free(&nc->comm_inter);
        }
    }
    else
    {
        /* One group or all processes in a separate group, use normal summing */
        MPI_Comm_free(&nc->comm_inter);
        MPI_Comm_free(&nc->comm_intra);
        if (debug)
        {
            fprintf(debug, "In gmx_setup_nodecomm: not using separate inter- and intra-node communicators.\n");
        }
    }
#endif
#else
    /* tMPI runs only on a single node so just use the nodeid */
    nc->rank_intra = cr->nodeid;
#endif
}
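
/* Illustrative call sequence (a sketch, not part of the build; the real call
 * sites live elsewhere in mdrun):
 *
 *     gmx_setup_nodecomm(fplog, cr); // fills cr->nc.comm_intra / comm_inter
 *     gmx_sumd(nr, r, cr);           // reduce within each node, then across
 *                                    // node masters, then broadcast in-node
 *
 * This replaces one flat MPI_Allreduce over cr->mpi_comm_mygroup with the
 * two-step scheme described in the comment above.
 */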
void gmx_barrier(const t_commrec gmx_unused *cr)
{
#if !GMX_MPI
    gmx_call("gmx_barrier");
#else
    MPI_Barrier(cr->mpi_comm_mygroup);
#endif
}
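
/* gmx_bcast() and gmx_bcast_sim() broadcast nbytes of raw data from the
 * master rank over the group and the whole-simulation communicator,
 * respectively. */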
void gmx_bcast(int gmx_unused nbytes, void gmx_unused *b, const t_commrec gmx_unused *cr)
{
#if !GMX_MPI
    gmx_call("gmx_bcast");
#else
    MPI_Bcast(b, nbytes, MPI_BYTE, MASTERRANK(cr), cr->mpi_comm_mygroup);
#endif
}
void gmx_bcast_sim(int gmx_unused nbytes, void gmx_unused *b, const t_commrec gmx_unused *cr)
{
#if !GMX_MPI
    gmx_call("gmx_bcast_sim");
#else
    MPI_Bcast(b, nbytes, MPI_BYTE, MASTERRANK(cr), cr->mpi_comm_mysim);
#endif
}
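
/* The gmx_sum*() routines below sum an array in place over
 * cr->mpi_comm_mygroup. With MPI_IN_PLACE available the reduction acts
 * directly on r[]; otherwise the result is staged through the replacement
 * buffers allocated in init_commrec(). In both cases, when
 * gmx_setup_nodecomm() enabled two-step summing, the sum runs first within
 * each node (comm_intra) and then across the node masters (comm_inter),
 * followed by an intra-node broadcast of the result. */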
void gmx_sumd(int gmx_unused nr, double gmx_unused r[], const t_commrec gmx_unused *cr)
{
#if !GMX_MPI
    gmx_call("gmx_sumd");
#else
#if MPI_IN_PLACE_EXISTS
    if (cr->nc.bUse)
    {
        if (cr->nc.rank_intra == 0)
        {
            /* Use two step summing. */
            MPI_Reduce(MPI_IN_PLACE, r, nr, MPI_DOUBLE, MPI_SUM, 0,
                       cr->nc.comm_intra);
            /* Sum the roots of the internal (intra) buffers. */
            MPI_Allreduce(MPI_IN_PLACE, r, nr, MPI_DOUBLE, MPI_SUM,
                          cr->nc.comm_inter);
        }
        else
        {
            /* This is here because of the silly MPI specification
               that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
            MPI_Reduce(r, nullptr, nr, MPI_DOUBLE, MPI_SUM, 0, cr->nc.comm_intra);
        }
        MPI_Bcast(r, nr, MPI_DOUBLE, 0, cr->nc.comm_intra);
    }
    else
    {
        MPI_Allreduce(MPI_IN_PLACE, r, nr, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mygroup);
    }
#else
    int i;

    if (nr > cr->mpb->dbuf_alloc)
    {
        cr->mpb->dbuf_alloc = nr;
        srenew(cr->mpb->dbuf, cr->mpb->dbuf_alloc);
    }
    if (cr->nc.bUse)
    {
        /* Use two step summing */
        MPI_Allreduce(r, cr->mpb->dbuf, nr, MPI_DOUBLE, MPI_SUM, cr->nc.comm_intra);
        if (cr->nc.rank_intra == 0)
        {
            /* Sum with the buffers reversed */
            MPI_Allreduce(cr->mpb->dbuf, r, nr, MPI_DOUBLE, MPI_SUM,
                          cr->nc.comm_inter);
        }
        MPI_Bcast(r, nr, MPI_DOUBLE, 0, cr->nc.comm_intra);
    }
    else
    {
        MPI_Allreduce(r, cr->mpb->dbuf, nr, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mygroup);
        for (i = 0; i < nr; i++)
        {
            r[i] = cr->mpb->dbuf[i];
        }
    }
#endif
#endif
}
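
/* gmx_sumf(), gmx_sumi() and gmx_sumli() follow the same two-path, two-step
 * scheme as gmx_sumd() above, using MPI_FLOAT, MPI_INT and MPI_INT64_T with
 * the fbuf, ibuf and libuf staging buffers, respectively. */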
void gmx_sumf(int gmx_unused nr, float gmx_unused r[], const t_commrec gmx_unused *cr)
{
#if !GMX_MPI
    gmx_call("gmx_sumf");
#else
#if MPI_IN_PLACE_EXISTS
    if (cr->nc.bUse)
    {
        /* Use two step summing. */
        if (cr->nc.rank_intra == 0)
        {
            MPI_Reduce(MPI_IN_PLACE, r, nr, MPI_FLOAT, MPI_SUM, 0,
                       cr->nc.comm_intra);
            /* Sum the roots of the internal (intra) buffers */
            MPI_Allreduce(MPI_IN_PLACE, r, nr, MPI_FLOAT, MPI_SUM,
                          cr->nc.comm_inter);
        }
        else
        {
            /* This is here because of the silly MPI specification
               that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
            MPI_Reduce(r, nullptr, nr, MPI_FLOAT, MPI_SUM, 0, cr->nc.comm_intra);
        }
        MPI_Bcast(r, nr, MPI_FLOAT, 0, cr->nc.comm_intra);
    }
    else
    {
        MPI_Allreduce(MPI_IN_PLACE, r, nr, MPI_FLOAT, MPI_SUM, cr->mpi_comm_mygroup);
    }
#else
    int i;

    if (nr > cr->mpb->fbuf_alloc)
    {
        cr->mpb->fbuf_alloc = nr;
        srenew(cr->mpb->fbuf, cr->mpb->fbuf_alloc);
    }
    if (cr->nc.bUse)
    {
        /* Use two step summing */
        MPI_Allreduce(r, cr->mpb->fbuf, nr, MPI_FLOAT, MPI_SUM, cr->nc.comm_intra);
        if (cr->nc.rank_intra == 0)
        {
            /* Sum with the buffers reversed */
            MPI_Allreduce(cr->mpb->fbuf, r, nr, MPI_FLOAT, MPI_SUM,
                          cr->nc.comm_inter);
        }
        MPI_Bcast(r, nr, MPI_FLOAT, 0, cr->nc.comm_intra);
    }
    else
    {
        MPI_Allreduce(r, cr->mpb->fbuf, nr, MPI_FLOAT, MPI_SUM,
                      cr->mpi_comm_mygroup);
        for (i = 0; i < nr; i++)
        {
            r[i] = cr->mpb->fbuf[i];
        }
    }
#endif
#endif
}
void gmx_sumi(int gmx_unused nr, int gmx_unused r[], const t_commrec gmx_unused *cr)
{
#if !GMX_MPI
    gmx_call("gmx_sumi");
#else
#if MPI_IN_PLACE_EXISTS
    if (cr->nc.bUse)
    {
        /* Use two step summing */
        if (cr->nc.rank_intra == 0)
        {
            MPI_Reduce(MPI_IN_PLACE, r, nr, MPI_INT, MPI_SUM, 0, cr->nc.comm_intra);
            /* Sum with the buffers reversed */
            MPI_Allreduce(MPI_IN_PLACE, r, nr, MPI_INT, MPI_SUM, cr->nc.comm_inter);
        }
        else
        {
            /* This is here because of the silly MPI specification
               that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
            MPI_Reduce(r, nullptr, nr, MPI_INT, MPI_SUM, 0, cr->nc.comm_intra);
        }
        MPI_Bcast(r, nr, MPI_INT, 0, cr->nc.comm_intra);
    }
    else
    {
        MPI_Allreduce(MPI_IN_PLACE, r, nr, MPI_INT, MPI_SUM, cr->mpi_comm_mygroup);
    }
#else
    int i;

    if (nr > cr->mpb->ibuf_alloc)
    {
        cr->mpb->ibuf_alloc = nr;
        srenew(cr->mpb->ibuf, cr->mpb->ibuf_alloc);
    }
    if (cr->nc.bUse)
    {
        /* Use two step summing */
        MPI_Allreduce(r, cr->mpb->ibuf, nr, MPI_INT, MPI_SUM, cr->nc.comm_intra);
        if (cr->nc.rank_intra == 0)
        {
            /* Sum with the buffers reversed */
            MPI_Allreduce(cr->mpb->ibuf, r, nr, MPI_INT, MPI_SUM, cr->nc.comm_inter);
        }
        MPI_Bcast(r, nr, MPI_INT, 0, cr->nc.comm_intra);
    }
    else
    {
        MPI_Allreduce(r, cr->mpb->ibuf, nr, MPI_INT, MPI_SUM, cr->mpi_comm_mygroup);
        for (i = 0; i < nr; i++)
        {
            r[i] = cr->mpb->ibuf[i];
        }
    }
#endif
#endif
}
void gmx_sumli(int gmx_unused nr, int64_t gmx_unused r[], const t_commrec gmx_unused *cr)
{
#if !GMX_MPI
    gmx_call("gmx_sumli");
#else
#if MPI_IN_PLACE_EXISTS
    if (cr->nc.bUse)
    {
        /* Use two step summing */
        if (cr->nc.rank_intra == 0)
        {
            MPI_Reduce(MPI_IN_PLACE, r, nr, MPI_INT64_T, MPI_SUM, 0,
                       cr->nc.comm_intra);
            /* Sum with the buffers reversed */
            MPI_Allreduce(MPI_IN_PLACE, r, nr, MPI_INT64_T, MPI_SUM,
                          cr->nc.comm_inter);
        }
        else
        {
            /* This is here because of the silly MPI specification
               that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
            MPI_Reduce(r, nullptr, nr, MPI_INT64_T, MPI_SUM, 0, cr->nc.comm_intra);
        }
        MPI_Bcast(r, nr, MPI_INT64_T, 0, cr->nc.comm_intra);
    }
    else
    {
        MPI_Allreduce(MPI_IN_PLACE, r, nr, MPI_INT64_T, MPI_SUM, cr->mpi_comm_mygroup);
    }
#else
    int i;

    if (nr > cr->mpb->libuf_alloc)
    {
        cr->mpb->libuf_alloc = nr;
        srenew(cr->mpb->libuf, cr->mpb->libuf_alloc);
    }
    if (cr->nc.bUse)
    {
        /* Use two step summing */
        MPI_Allreduce(r, cr->mpb->libuf, nr, MPI_INT64_T, MPI_SUM,
                      cr->nc.comm_intra);
        if (cr->nc.rank_intra == 0)
        {
            /* Sum with the buffers reversed */
            MPI_Allreduce(cr->mpb->libuf, r, nr, MPI_INT64_T, MPI_SUM,
                          cr->nc.comm_inter);
        }
        MPI_Bcast(r, nr, MPI_INT64_T, 0, cr->nc.comm_intra);
    }
    else
    {
        MPI_Allreduce(r, cr->mpb->libuf, nr, MPI_INT64_T, MPI_SUM,
                      cr->mpi_comm_mygroup);
        for (i = 0; i < nr; i++)
        {
            r[i] = cr->mpb->libuf[i];
        }
    }
#endif
#endif
}
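
/* Return the file name for option opt on the simulation master rank and
 * nullptr on all other ranks. */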
const char *opt2fn_master(const char *opt, int nfile, const t_filenm fnm[],
                          t_commrec *cr)
{
    return SIMMASTER(cr) ? opt2fn(opt, nfile, fnm) : nullptr;
}
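
/* Collective fatal error: determine whether it is safe to call MPI_Finalize
 * (i.e. whether all ranks of MPI_COMM_WORLD participate in comm) and forward
 * everything to gmx_fatal_mpi_va(). */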
void gmx_fatal_collective(int f_errno, const char *file, int line,
                          MPI_Comm comm, gmx_bool bMaster,
                          gmx_fmtstr const char *fmt, ...)
{
    va_list  ap;
    gmx_bool bFinalize;
#if GMX_MPI
    int      result;
    /* Check if we are calling on all processes in MPI_COMM_WORLD */
    MPI_Comm_compare(comm, MPI_COMM_WORLD, &result);
    /* Any result except MPI_UNEQUAL allows us to call MPI_Finalize */
    bFinalize = (result != MPI_UNEQUAL);
#else
    bFinalize = TRUE;
    GMX_UNUSED_VALUE(comm);
#endif

    va_start(ap, fmt);
    gmx_fatal_mpi_va(f_errno, file, line, bMaster, bFinalize, fmt, ap);
    va_end(ap);
}