2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements the multi-simulation support routines.
39 * \author Mark Abraham <mark.j.abraham@gmail.com>
40 * \ingroup module_mdrunutility
48 #include "gromacs/gmxlib/network.h"
49 #include "gromacs/mdtypes/commrec.h"
50 #include "gromacs/utility/exceptions.h"
51 #include "gromacs/utility/fatalerror.h"
52 #include "gromacs/utility/futil.h"
53 #include "gromacs/utility/gmxassert.h"
54 #include "gromacs/utility/logger.h"
55 #include "gromacs/utility/smalloc.h"
57 std::unique_ptr
<gmx_multisim_t
> buildMultiSimulation(MPI_Comm worldComm
,
58 gmx::ArrayRef
<const std::string
> multidirs
)
60 if (multidirs
.empty())
65 if (!GMX_LIB_MPI
&& !multidirs
.empty())
67 GMX_THROW(gmx::NotImplementedError(
68 "Multi-simulations are only supported when GROMACS has been "
69 "configured with a proper external MPI library."));
72 if (multidirs
.size() == 1)
74 /* NOTE: It would be nice if this special case worked, but this requires checks/tests. */
75 GMX_THROW(gmx::NotImplementedError(
76 "To run mdrun in multi-simulation mode, more then one "
77 "actual simulation is required. The single simulation case is not supported."));
82 MPI_Comm_size(worldComm
, &numRanks
);
83 if (numRanks
% multidirs
.size() != 0)
85 auto message
= gmx::formatString(
86 "The number of ranks (%d) is not a multiple of the number of simulations (%td)",
87 numRanks
, multidirs
.ssize());
88 GMX_THROW(gmx::InconsistentInputError(message
));
91 int numRanksPerSimulation
= numRanks
/ multidirs
.size();
92 int rankWithinWorldComm
;
93 MPI_Comm_rank(worldComm
, &rankWithinWorldComm
);
97 fprintf(debug
, "We have %td simulations, %d ranks per simulation, local simulation is %d\n",
98 multidirs
.ssize(), numRanksPerSimulation
, rankWithinWorldComm
/ numRanksPerSimulation
);
101 int numSimulations
= multidirs
.size();
102 // Create a communicator for the master ranks of each simulation
103 std::vector
<int> ranksOfMasters(numSimulations
);
104 for (int i
= 0; i
< numSimulations
; i
++)
106 ranksOfMasters
[i
] = i
* numRanksPerSimulation
;
108 MPI_Group worldGroup
;
109 // No need to free worldGroup later, we didn't create it.
110 MPI_Comm_group(worldComm
, &worldGroup
);
112 MPI_Group mastersGroup
= MPI_GROUP_NULL
;
113 MPI_Group_incl(worldGroup
, numSimulations
, ranksOfMasters
.data(), &mastersGroup
);
114 MPI_Comm mastersComm
= MPI_COMM_NULL
;
115 MPI_Comm_create(worldComm
, mastersGroup
, &mastersComm
);
116 if (mastersGroup
!= MPI_GROUP_NULL
)
118 MPI_Group_free(&mastersGroup
);
121 int simulationIndex
= rankWithinWorldComm
/ numRanksPerSimulation
;
122 MPI_Comm simulationComm
= MPI_COMM_NULL
;
123 MPI_Comm_split(worldComm
, simulationIndex
, rankWithinWorldComm
, &simulationComm
);
127 gmx_chdir(multidirs
[simulationIndex
].c_str());
129 catch (gmx::GromacsException
& e
)
131 e
.prependContext("While changing directory for multi-simulation to " + multidirs
[simulationIndex
]);
134 return std::make_unique
<gmx_multisim_t
>(numSimulations
, simulationIndex
, mastersComm
, simulationComm
);
136 GMX_UNUSED_VALUE(worldComm
);
141 gmx_multisim_t::gmx_multisim_t(int numSimulations
, int simulationIndex
, MPI_Comm mastersComm
, MPI_Comm simulationComm
) :
142 numSimulations_(numSimulations
),
143 simulationIndex_(simulationIndex
),
144 mastersComm_(mastersComm
),
145 simulationComm_(simulationComm
)
149 gmx_multisim_t::~gmx_multisim_t()
152 // TODO This would work better if the result of MPI_Comm_split was
153 // put into an RAII-style guard, such as gmx::unique_cptr.
154 if (mastersComm_
!= MPI_COMM_NULL
&& mastersComm_
!= MPI_COMM_WORLD
)
156 MPI_Comm_free(&mastersComm_
);
158 if (simulationComm_
!= MPI_COMM_NULL
&& simulationComm_
!= MPI_COMM_WORLD
)
160 MPI_Comm_free(&simulationComm_
);
166 static void gmx_sumd_comm(int nr
, double r
[], MPI_Comm mpi_comm
)
168 # if MPI_IN_PLACE_EXISTS
169 MPI_Allreduce(MPI_IN_PLACE
, r
, nr
, MPI_DOUBLE
, MPI_SUM
, mpi_comm
);
171 /* this function is only used in code that is not performance critical,
172 (during setup, when comm_rec is not the appropriate communication
173 structure), so this isn't as bad as it looks. */
178 MPI_Allreduce(r
, buf
, nr
, MPI_DOUBLE
, MPI_SUM
, mpi_comm
);
179 for (i
= 0; i
< nr
; i
++)
189 static void gmx_sumf_comm(int nr
, float r
[], MPI_Comm mpi_comm
)
191 # if MPI_IN_PLACE_EXISTS
192 MPI_Allreduce(MPI_IN_PLACE
, r
, nr
, MPI_FLOAT
, MPI_SUM
, mpi_comm
);
194 /* this function is only used in code that is not performance critical,
195 (during setup, when comm_rec is not the appropriate communication
196 structure), so this isn't as bad as it looks. */
201 MPI_Allreduce(r
, buf
, nr
, MPI_FLOAT
, MPI_SUM
, mpi_comm
);
202 for (i
= 0; i
< nr
; i
++)
211 void gmx_sumd_sim(int gmx_unused nr
, double gmx_unused r
[], const gmx_multisim_t gmx_unused
* ms
)
214 GMX_RELEASE_ASSERT(false, "Invalid call to gmx_sumd_sim");
216 gmx_sumd_comm(nr
, r
, ms
->mastersComm_
);
220 void gmx_sumf_sim(int gmx_unused nr
, float gmx_unused r
[], const gmx_multisim_t gmx_unused
* ms
)
223 GMX_RELEASE_ASSERT(false, "Invalid call to gmx_sumf_sim");
225 gmx_sumf_comm(nr
, r
, ms
->mastersComm_
);
229 void gmx_sumi_sim(int gmx_unused nr
, int gmx_unused r
[], const gmx_multisim_t gmx_unused
* ms
)
232 GMX_RELEASE_ASSERT(false, "Invalid call to gmx_sumi_sim");
234 # if MPI_IN_PLACE_EXISTS
235 MPI_Allreduce(MPI_IN_PLACE
, r
, nr
, MPI_INT
, MPI_SUM
, ms
->mastersComm_
);
237 /* this is thread-unsafe, but it will do for now: */
238 ms
->intBuffer
.resize(nr
);
239 MPI_Allreduce(r
, ms
->intBuffer
.data(), ms
->intBuffer
.size(), MPI_INT
, MPI_SUM
, ms
->mastersComm_
);
240 std::copy(std::begin(ms
->intBuffer
), std::end(ms
->intBuffer
), r
);
245 void gmx_sumli_sim(int gmx_unused nr
, int64_t gmx_unused r
[], const gmx_multisim_t gmx_unused
* ms
)
248 GMX_RELEASE_ASSERT(false, "Invalid call to gmx_sumli_sim");
250 # if MPI_IN_PLACE_EXISTS
251 MPI_Allreduce(MPI_IN_PLACE
, r
, nr
, MPI_INT64_T
, MPI_SUM
, ms
->mastersComm_
);
253 /* this is thread-unsafe, but it will do for now: */
254 ms
->int64Buffer
.resize(nr
);
255 MPI_Allreduce(r
, ms
->int64Buffer
.data(), ms
->int64Buffer
.size(), MPI_INT64_T
, MPI_SUM
, ms
->mastersComm_
);
256 std::copy(std::begin(ms
->int64Buffer
), std::end(ms
->int64Buffer
), r
);
261 std::vector
<int> gatherIntFromMultiSimulation(const gmx_multisim_t
* ms
, const int localValue
)
263 std::vector
<int> valuesFromAllRanks
;
267 valuesFromAllRanks
.resize(ms
->numSimulations_
);
268 valuesFromAllRanks
[ms
->simulationIndex_
] = localValue
;
269 gmx_sumi_sim(ms
->numSimulations_
, valuesFromAllRanks
.data(), ms
);
273 valuesFromAllRanks
.emplace_back(localValue
);
276 GMX_UNUSED_VALUE(ms
);
277 valuesFromAllRanks
.emplace_back(localValue
);
279 return valuesFromAllRanks
;
282 void check_multi_int(FILE* log
, const gmx_multisim_t
* ms
, int val
, const char* name
, gmx_bool bQuiet
)
285 gmx_bool bCompatible
;
287 if (nullptr != log
&& !bQuiet
)
289 fprintf(log
, "Multi-checking %s ... ", name
);
294 gmx_fatal(FARGS
, "check_multi_int called with a NULL communication pointer");
297 snew(ibuf
, ms
->numSimulations_
);
298 ibuf
[ms
->simulationIndex_
] = val
;
299 gmx_sumi_sim(ms
->numSimulations_
, ibuf
, ms
);
302 for (p
= 1; p
< ms
->numSimulations_
; p
++)
304 bCompatible
= bCompatible
&& (ibuf
[p
- 1] == ibuf
[p
]);
309 if (nullptr != log
&& !bQuiet
)
311 fprintf(log
, "OK\n");
318 fprintf(log
, "\n%s is not equal for all subsystems\n", name
);
319 for (p
= 0; p
< ms
->numSimulations_
; p
++)
321 fprintf(log
, " subsystem %d: %d\n", p
, ibuf
[p
]);
324 gmx_fatal(FARGS
, "The %d subsystems are not compatible\n", ms
->numSimulations_
);
330 void check_multi_int64(FILE* log
, const gmx_multisim_t
* ms
, int64_t val
, const char* name
, gmx_bool bQuiet
)
334 gmx_bool bCompatible
;
336 if (nullptr != log
&& !bQuiet
)
338 fprintf(log
, "Multi-checking %s ... ", name
);
343 gmx_fatal(FARGS
, "check_multi_int called with a NULL communication pointer");
346 snew(ibuf
, ms
->numSimulations_
);
347 ibuf
[ms
->simulationIndex_
] = val
;
348 gmx_sumli_sim(ms
->numSimulations_
, ibuf
, ms
);
351 for (p
= 1; p
< ms
->numSimulations_
; p
++)
353 bCompatible
= bCompatible
&& (ibuf
[p
- 1] == ibuf
[p
]);
358 if (nullptr != log
&& !bQuiet
)
360 fprintf(log
, "OK\n");
365 // TODO Part of this error message would also be good to go to
366 // stderr (from one rank of one sim only)
369 fprintf(log
, "\n%s is not equal for all subsystems\n", name
);
370 for (p
= 0; p
< ms
->numSimulations_
; p
++)
373 /* first make the format string */
374 snprintf(strbuf
, 255, " subsystem %%d: %s\n", "%" PRId64
);
375 fprintf(log
, strbuf
, p
, ibuf
[p
]);
378 gmx_fatal(FARGS
, "The %d subsystems are not compatible\n", ms
->numSimulations_
);
384 bool findIsSimulationMasterRank(const gmx_multisim_t
* ms
, MPI_Comm communicator
)
388 // Ranks of multi-simulations know whether they are a master
389 // rank. Ranks of non-multi simulation do not know until a
390 // t_commrec is available.
391 if ((ms
!= nullptr) && (ms
->numSimulations_
> 1))
393 return ms
->mastersComm_
!= MPI_COMM_NULL
;
399 MPI_Comm_rank(communicator
, &rank
);
404 else if (GMX_THREAD_MPI
)
406 GMX_RELEASE_ASSERT(communicator
== MPI_COMM_NULL
|| communicator
== MPI_COMM_WORLD
,
407 "Invalid communicator");
408 // Spawned threads have MPI_COMM_WORLD upon creation, so if
409 // the communicator is MPI_COMM_NULL this is not a spawned thread,
410 // ie is the master thread
411 return (communicator
== MPI_COMM_NULL
);
415 // No MPI means it must be the master (and only) rank.
420 bool isMasterSim(const gmx_multisim_t
* ms
)
422 return !isMultiSim(ms
) || ms
->simulationIndex_
== 0;
425 bool isMasterSimMasterRank(const gmx_multisim_t
* ms
, const bool isMaster
)
427 return (isMaster
&& isMasterSim(ms
));
430 static bool multisim_int_all_are_equal(const gmx_multisim_t
* ms
, int64_t value
)
432 bool allValuesAreEqual
= true;
435 GMX_RELEASE_ASSERT(ms
, "Invalid use of multi-simulation pointer");
437 snew(buf
, ms
->numSimulations_
);
438 /* send our value to all other master ranks, receive all of theirs */
439 buf
[ms
->simulationIndex_
] = value
;
440 gmx_sumli_sim(ms
->numSimulations_
, buf
, ms
);
442 for (int s
= 0; s
< ms
->numSimulations_
; s
++)
446 allValuesAreEqual
= false;
453 return allValuesAreEqual
;
456 void logInitialMultisimStatus(const gmx_multisim_t
* ms
,
458 const gmx::MDLogger
& mdlog
,
459 const bool simulationsShareState
,
461 const int initialStep
)
463 if (!multisim_int_all_are_equal(ms
, numSteps
))
465 GMX_LOG(mdlog
.warning
)
467 "Note: The number of steps is not consistent across multi "
469 "but we are proceeding anyway!");
471 if (!multisim_int_all_are_equal(ms
, initialStep
))
473 if (simulationsShareState
)
478 "The initial step is not consistent across multi simulations which "
481 gmx_barrier(cr
->mpi_comm_mygroup
);
485 GMX_LOG(mdlog
.warning
)
487 "Note: The initial step is not consistent across multi "
489 "but we are proceeding anyway!");