2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements the multi-simulation support routines.
39 * \author Mark Abraham <mark.j.abraham@gmail.com>
40 * \ingroup module_mdrunutility
48 #include "gromacs/gmxlib/network.h"
49 #include "gromacs/mdtypes/commrec.h"
50 #include "gromacs/utility/fatalerror.h"
51 #include "gromacs/utility/futil.h"
52 #include "gromacs/utility/mpiinplacebuffers.h"
53 #include "gromacs/utility/smalloc.h"
/*
 * init_multisystem: sets up multi-simulation state for the simulation
 * directories in multidirs, using the ranks of comm.
 *
 * What the visible code does, in order:
 *  - checks multidirs.empty() (the body of that branch is not visible in
 *    this listing -- presumably an early return; TODO confirm);
 *  - issues gmx_fatal when GMX_LIB_MPI is false and directories were given;
 *  - issues gmx_fatal when exactly one directory was given;
 *  - issues gmx_fatal when the size of comm is not a multiple of the
 *    number of directories;
 *  - allocates a gmx_multisim_t with nsim = multidirs.size() and
 *    sim = rankWithinComm / numRanksPerSim;
 *  - builds an MPI group from ranks 0, numRanksPerSim, 2*numRanksPerSim, ...
 *    of comm and creates ms->mpi_comm_masters from it;
 *  - changes the working directory to multidirs[ms->sim].
 *
 * \param[in] comm       Communicator whose size and rank determine the split.
 * \param[in] multidirs  One directory name per simulation.
 *
 * NOTE(review): several lines (braces, preprocessor conditionals, some
 * declarations and the return statements) are not visible here, so the
 * returned value and the exact conditional-compilation structure must be
 * checked against the full file.
 */
55 gmx_multisim_t
*init_multisystem(MPI_Comm comm
,
56 gmx::ArrayRef
<const std::string
> multidirs
)
60 MPI_Group mpi_group_world
;
64 if (multidirs
.empty())
69 if (!GMX_LIB_MPI
&& !multidirs
.empty())
71 gmx_fatal(FARGS
, "mdrun -multidir is only supported when GROMACS has been "
72 "configured with a proper external MPI library.");
75 if (multidirs
.size() == 1)
77 /* NOTE: It would be nice if this special case worked, but this requires checks/tests. */
// NOTE(review): the message below says "more then one"; "than" is presumably intended.
78 gmx_fatal(FARGS
, "To run mdrun in multiple simulation mode, more then one "
79 "actual simulation is required. The single simulation case is not supported.");
// Every simulation must receive the same number of ranks of comm.
84 MPI_Comm_size(comm
, &numRanks
);
85 if (numRanks
% multidirs
.size() != 0)
87 gmx_fatal(FARGS
, "The number of ranks (%d) is not a multiple of the number of simulations (%td)", numRanks
, multidirs
.ssize());
90 int numRanksPerSim
= numRanks
/multidirs
.size();
92 MPI_Comm_rank(comm
, &rankWithinComm
);
96 fprintf(debug
, "We have %td simulations, %d ranks per simulation, local simulation is %d\n", multidirs
.ssize(), numRanksPerSim
, rankWithinComm
/numRanksPerSim
);
99 ms
= new gmx_multisim_t
;
100 ms
->nsim
= multidirs
.size();
// Consecutive blocks of numRanksPerSim ranks form one simulation.
101 ms
->sim
= rankWithinComm
/numRanksPerSim
;
102 /* Create a communicator for the master nodes */
103 snew(rank
, ms
->nsim
);
// rank[i] is the first rank (within comm) of simulation i.
104 for (int i
= 0; i
< ms
->nsim
; i
++)
106 rank
[i
] = i
*numRanksPerSim
;
// NOTE(review): the group is derived from comm, but the communicator below
// is created from MPI_COMM_WORLD. MPI_Comm_create expects the group to be a
// subset of the group of the communicator it is given -- confirm that comm
// is always MPI_COMM_WORLD here, or that comm should be passed instead.
// NOTE(review): no release of rank or of mpi_group_world is visible in this
// listing -- TODO confirm against the full file.
108 MPI_Comm_group(comm
, &mpi_group_world
);
109 MPI_Group_incl(mpi_group_world
, ms
->nsim
, rank
, &ms
->mpi_group_masters
);
111 MPI_Comm_create(MPI_COMM_WORLD
, ms
->mpi_group_masters
,
112 &ms
->mpi_comm_masters
);
114 #if !MPI_IN_PLACE_EXISTS
115 /* initialize the MPI_IN_PLACE replacement buffers */
// NOTE(review): the allocation of ms->mpb itself is not visible in this listing.
117 ms
->mpb
->ibuf
= nullptr;
118 ms
->mpb
->libuf
= nullptr;
119 ms
->mpb
->fbuf
= nullptr;
120 ms
->mpb
->dbuf
= nullptr;
121 ms
->mpb
->ibuf_alloc
= 0;
122 ms
->mpb
->libuf_alloc
= 0;
123 ms
->mpb
->fbuf_alloc
= 0;
124 ms
->mpb
->dbuf_alloc
= 0;
127 // TODO This should throw upon error
128 gmx_chdir(multidirs
[ms
->sim
].c_str());
// NOTE(review): presumably the branch for builds without MPI, where comm is
// otherwise unused; the enclosing preprocessor conditional is not visible.
130 GMX_UNUSED_VALUE(comm
);
/*
 * done_multisim: releases the MPI resources referenced by ms.
 *
 * Visible steps:
 *  - hands ms->mpb to done_mpi_in_place_buf();
 *  - frees ms->mpi_comm_masters unless it is MPI_COMM_NULL or MPI_COMM_WORLD;
 *  - frees ms->mpi_group_masters unless it is MPI_GROUP_NULL.
 *
 * \param[in,out] ms  Multi-simulation state to tear down.
 *
 * NOTE(review): any null check on ms and the release of the ms object
 * itself are not visible in this listing -- TODO confirm.
 */
137 void done_multisim(gmx_multisim_t
*ms
)
143 done_mpi_in_place_buf(ms
->mpb
);
146 // TODO This would work better if the result of MPI_Comm_split was
147 // put into an RAII-style guard, such as gmx::unique_cptr.
// The predefined world communicator must never be passed to MPI_Comm_free,
// hence the explicit exclusion below.
148 if (ms
->mpi_comm_masters
!= MPI_COMM_NULL
&&
149 ms
->mpi_comm_masters
!= MPI_COMM_WORLD
)
151 MPI_Comm_free(&ms
->mpi_comm_masters
);
153 if (ms
->mpi_group_masters
!= MPI_GROUP_NULL
)
155 MPI_Group_free(&ms
->mpi_group_masters
);
/*
 * gmx_sumd_comm: element-wise sum of nr doubles in r over all ranks of
 * mpi_comm, using MPI_Allreduce with MPI_SUM.
 *
 * \param[in]     nr        Number of elements in r.
 * \param[in,out] r         Values to sum; with MPI_IN_PLACE_EXISTS the
 *                          reduction is done in place in this array.
 * \param[in]     mpi_comm  Communicator to reduce over.
 *
 * Without MPI_IN_PLACE, the reduction goes into a separate buffer buf and a
 * loop over the nr elements follows.
 * NOTE(review): the allocation of buf, the loop body (presumably copying
 * buf back into r) and the release of buf are not visible in this listing.
 */
162 static void gmx_sumd_comm(int nr
, double r
[], MPI_Comm mpi_comm
)
164 #if MPI_IN_PLACE_EXISTS
165 MPI_Allreduce(MPI_IN_PLACE
, r
, nr
, MPI_DOUBLE
, MPI_SUM
, mpi_comm
);
167 /* this function is only used in code that is not performance critical,
168 (during setup, when comm_rec is not the appropriate communication
169 structure), so this isn't as bad as it looks. */
174 MPI_Allreduce(r
, buf
, nr
, MPI_DOUBLE
, MPI_SUM
, mpi_comm
);
175 for (i
= 0; i
< nr
; i
++)
/*
 * gmx_sumf_comm: element-wise sum of nr floats in r over all ranks of
 * mpi_comm, using MPI_Allreduce with MPI_SUM.
 *
 * \param[in]     nr        Number of elements in r.
 * \param[in,out] r         Values to sum; with MPI_IN_PLACE_EXISTS the
 *                          reduction is done in place in this array.
 * \param[in]     mpi_comm  Communicator to reduce over.
 *
 * Without MPI_IN_PLACE, the reduction goes into a separate buffer buf and a
 * loop over the nr elements follows.
 * NOTE(review): the allocation of buf, the loop body (presumably copying
 * buf back into r) and the release of buf are not visible in this listing.
 */
185 static void gmx_sumf_comm(int nr
, float r
[], MPI_Comm mpi_comm
)
187 #if MPI_IN_PLACE_EXISTS
188 MPI_Allreduce(MPI_IN_PLACE
, r
, nr
, MPI_FLOAT
, MPI_SUM
, mpi_comm
);
190 /* this function is only used in code that is not performance critical,
191 (during setup, when comm_rec is not the appropriate communication
192 structure), so this isn't as bad as it looks. */
197 MPI_Allreduce(r
, buf
, nr
, MPI_FLOAT
, MPI_SUM
, mpi_comm
);
198 for (i
= 0; i
< nr
; i
++)
/*
 * gmx_sumd_sim: sums nr doubles in r across simulations by reducing over
 * ms->mpi_comm_masters via gmx_sumd_comm().
 *
 * \param[in]     nr  Number of elements in r.
 * \param[in,out] r   Values to sum.
 * \param[in]     ms  Multi-simulation state providing the communicator.
 *
 * NOTE(review): gmx_call() and the gmx_sumd_comm() call presumably sit in
 * alternative preprocessor branches (the parameters are tagged gmx_unused);
 * the conditional itself is not visible in this listing.
 */
207 void gmx_sumd_sim(int gmx_unused nr
, double gmx_unused r
[], const gmx_multisim_t gmx_unused
*ms
)
210 gmx_call("gmx_sumd_sim");
212 gmx_sumd_comm(nr
, r
, ms
->mpi_comm_masters
);
/*
 * gmx_sumf_sim: sums nr floats in r across simulations by reducing over
 * ms->mpi_comm_masters via gmx_sumf_comm().
 *
 * \param[in]     nr  Number of elements in r.
 * \param[in,out] r   Values to sum.
 * \param[in]     ms  Multi-simulation state providing the communicator.
 *
 * NOTE(review): gmx_call() and the gmx_sumf_comm() call presumably sit in
 * alternative preprocessor branches (the parameters are tagged gmx_unused);
 * the conditional itself is not visible in this listing.
 */
216 void gmx_sumf_sim(int gmx_unused nr
, float gmx_unused r
[], const gmx_multisim_t gmx_unused
*ms
)
219 gmx_call("gmx_sumf_sim");
221 gmx_sumf_comm(nr
, r
, ms
->mpi_comm_masters
);
/*
 * gmx_sumi_sim: element-wise sum of nr ints in r over ms->mpi_comm_masters.
 *
 * \param[in]     nr  Number of elements in r.
 * \param[in,out] r   Values to sum; receives the summed values.
 * \param[in]     ms  Multi-simulation state providing the communicator and,
 *                    without MPI_IN_PLACE, the scratch buffer ms->mpb->ibuf.
 *
 * With MPI_IN_PLACE_EXISTS the reduction is done in place. Otherwise the
 * scratch buffer is grown to nr elements when needed, the reduction goes
 * into it, and the result is copied back into r.
 *
 * NOTE(review): gmx_call() is presumably the branch for builds without MPI;
 * the enclosing preprocessor conditionals are not visible in this listing.
 */
225 void gmx_sumi_sim(int gmx_unused nr
, int gmx_unused r
[], const gmx_multisim_t gmx_unused
*ms
)
228 gmx_call("gmx_sumi_sim");
230 #if MPI_IN_PLACE_EXISTS
231 MPI_Allreduce(MPI_IN_PLACE
, r
, nr
, MPI_INT
, MPI_SUM
, ms
->mpi_comm_masters
);
233 /* this is thread-unsafe, but it will do for now: */
// Grow the shared scratch buffer only when the request exceeds its size.
236 if (nr
> ms
->mpb
->ibuf_alloc
)
238 ms
->mpb
->ibuf_alloc
= nr
;
239 srenew(ms
->mpb
->ibuf
, ms
->mpb
->ibuf_alloc
);
241 MPI_Allreduce(r
, ms
->mpb
->ibuf
, nr
, MPI_INT
, MPI_SUM
, ms
->mpi_comm_masters
);
// Copy the reduced values back into the caller's array.
242 for (i
= 0; i
< nr
; i
++)
244 r
[i
] = ms
->mpb
->ibuf
[i
];
/*
 * gmx_sumli_sim: element-wise sum of nr int64_t values in r over
 * ms->mpi_comm_masters.
 *
 * \param[in]     nr  Number of elements in r.
 * \param[in,out] r   Values to sum; receives the summed values.
 * \param[in]     ms  Multi-simulation state providing the communicator and,
 *                    without MPI_IN_PLACE, the scratch buffer ms->mpb->libuf.
 *
 * With MPI_IN_PLACE_EXISTS the reduction is done in place. Otherwise the
 * scratch buffer is grown to nr elements when needed, the reduction goes
 * into it, and the result is copied back into r.
 *
 * NOTE(review): gmx_call() is presumably the branch for builds without MPI;
 * the enclosing preprocessor conditionals are not visible in this listing.
 */
250 void gmx_sumli_sim(int gmx_unused nr
, int64_t gmx_unused r
[], const gmx_multisim_t gmx_unused
*ms
)
253 gmx_call("gmx_sumli_sim");
255 #if MPI_IN_PLACE_EXISTS
256 MPI_Allreduce(MPI_IN_PLACE
, r
, nr
, MPI_INT64_T
, MPI_SUM
,
257 ms
->mpi_comm_masters
);
259 /* this is thread-unsafe, but it will do for now: */
// Grow the shared scratch buffer only when the request exceeds its size.
262 if (nr
> ms
->mpb
->libuf_alloc
)
264 ms
->mpb
->libuf_alloc
= nr
;
265 srenew(ms
->mpb
->libuf
, ms
->mpb
->libuf_alloc
);
267 MPI_Allreduce(r
, ms
->mpb
->libuf
, nr
, MPI_INT64_T
, MPI_SUM
,
268 ms
->mpi_comm_masters
);
// Copy the reduced values back into the caller's array.
269 for (i
= 0; i
< nr
; i
++)
271 r
[i
] = ms
->mpb
->libuf
[i
];
/*
 * check_multi_int: checks that an int value agrees across all simulations
 * and calls gmx_fatal if it does not.
 *
 * Visible steps:
 *  - optionally logs "Multi-checking <name> ... " (log non-null, not quiet);
 *  - allocates ibuf with ms->nsim entries and sums it over the simulations
 *    with gmx_sumi_sim();
 *  - compares neighbouring entries ibuf[p-1] and ibuf[p] for p = 1..nsim-1;
 *  - on agreement optionally logs "OK"; otherwise writes every subsystem's
 *    value to log and calls gmx_fatal.
 *
 * \param[in] log  Log file; the guards at the "Multi-checking" and "OK"
 *                 prints show it may be null.
 * \param[in] ms   Multi-simulation state (nsim and the communicator).
 * \param[in] val  Local value to compare.
 *
 * NOTE(review): the remaining parameters (name and bQuiet are used below),
 * the store of val into ibuf (presumably at index ms->sim so that the sum
 * gathers one value per simulation), the initialisation of bCompatible and
 * the release of ibuf are not visible in this listing -- TODO confirm.
 */
277 void check_multi_int(FILE *log
, const gmx_multisim_t
*ms
, int val
,
282 gmx_bool bCompatible
;
284 if (nullptr != log
&& !bQuiet
)
286 fprintf(log
, "Multi-checking %s ... ", name
);
292 "check_multi_int called with a NULL communication pointer");
295 snew(ibuf
, ms
->nsim
);
297 gmx_sumi_sim(ms
->nsim
, ibuf
, ms
);
// All entries are equal exactly when every adjacent pair is equal.
300 for (p
= 1; p
< ms
->nsim
; p
++)
302 bCompatible
= bCompatible
&& (ibuf
[p
-1] == ibuf
[p
]);
307 if (nullptr != log
&& !bQuiet
)
309 fprintf(log
, "OK\n");
316 fprintf(log
, "\n%s is not equal for all subsystems\n", name
);
317 for (p
= 0; p
< ms
->nsim
; p
++)
319 fprintf(log
, " subsystem %d: %d\n", p
, ibuf
[p
]);
322 gmx_fatal(FARGS
, "The %d subsystems are not compatible\n", ms
->nsim
);
/*
 * check_multi_int64: 64-bit counterpart of check_multi_int; checks that an
 * int64_t value agrees across all simulations and calls gmx_fatal if not.
 *
 * Visible steps:
 *  - optionally logs "Multi-checking <name> ... " (log non-null, not quiet);
 *  - allocates ibuf with ms->nsim entries and sums it over the simulations
 *    with gmx_sumli_sim();
 *  - compares neighbouring entries ibuf[p-1] and ibuf[p] for p = 1..nsim-1;
 *  - on agreement optionally logs "OK"; otherwise writes every subsystem's
 *    value to log and calls gmx_fatal.
 *
 * \param[in] log   Log file; the guards at the "Multi-checking" and "OK"
 *                  prints show it may be null.
 * \param[in] ms    Multi-simulation state (nsim and the communicator).
 * \param[in] val   Local value to compare.
 * \param[in] name  Label used in the log messages.
 *
 * NOTE(review): the remaining parameter(s) (bQuiet is used below), the
 * store of val into ibuf (presumably at index ms->sim), the initialisation
 * of bCompatible, the declaration of strbuf and the release of ibuf are not
 * visible in this listing -- TODO confirm.
 */
328 void check_multi_int64(FILE *log
, const gmx_multisim_t
*ms
,
329 int64_t val
, const char *name
,
334 gmx_bool bCompatible
;
336 if (nullptr != log
&& !bQuiet
)
338 fprintf(log
, "Multi-checking %s ... ", name
);
// NOTE(review): the message below names check_multi_int although this is
// check_multi_int64 -- presumably a copy-paste leftover.
344 "check_multi_int called with a NULL communication pointer");
347 snew(ibuf
, ms
->nsim
);
349 gmx_sumli_sim(ms
->nsim
, ibuf
, ms
);
// All entries are equal exactly when every adjacent pair is equal.
352 for (p
= 1; p
< ms
->nsim
; p
++)
354 bCompatible
= bCompatible
&& (ibuf
[p
-1] == ibuf
[p
]);
359 if (nullptr != log
&& !bQuiet
)
361 fprintf(log
, "OK\n");
366 // TODO Part of this error message would also be good to go to
367 // stderr (from one rank of one sim only)
370 fprintf(log
, "\n%s is not equal for all subsystems\n", name
);
371 for (p
= 0; p
< ms
->nsim
; p
++)
374 /* first make the format string */
// The %s is filled with an argument that is not visible in this listing
// (presumably the printf conversion for int64_t); the resulting string is
// then used as the format for the fprintf below.
375 snprintf(strbuf
, 255, " subsystem %%d: %s\n",
377 fprintf(log
, strbuf
, p
, ibuf
[p
]);
380 gmx_fatal(FARGS
, "The %d subsystems are not compatible\n", ms
->nsim
);
/*
 * isMasterSim: returns true when this is not a multi-simulation run
 * (isMultiSim(ms) is false) or when this simulation has index 0.
 *
 * \param[in] ms  Multi-simulation state; only dereferenced when
 *                isMultiSim(ms) is true.
 */
386 bool isMasterSim(const gmx_multisim_t
*ms
)
388 return !isMultiSim(ms
) || ms
->sim
== 0;
/*
 * isMasterSimMasterRank: returns true only when isMaster is true and
 * isMasterSim(ms) is true.
 *
 * \param[in] ms  Multi-simulation state, forwarded to isMasterSim().
 *
 * NOTE(review): the declaration of isMaster (presumably the second
 * parameter) is not visible in this listing -- TODO confirm.
 */
391 bool isMasterSimMasterRank(const gmx_multisim_t
*ms
,
394 return (isMaster
&& isMasterSim(ms
));