Add replacements for pbc enumerations
[gromacs.git] / src / gromacs / taskassignment / taskassignment.cpp
blob5b77152fd3642c656ceb5a09bca1282c8a0345df
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 /*! \internal \file
36 * \brief
37 * Defines helper and factory functionality for task assignment.
39 * Note that the GPU ID assignment could potentially match many
40 * different kinds of simulation setups, including ranks from multiple
41 * simulations, ranks from the same simulation, and/or ranks with duty
42 * only for particular tasks (e.g. PME-only ranks). Which GPU ID
43 * assignments are valid will naturally depend on the other run-time
44 * options given to mdrun, and the current capabilities of the
45 * implementation.
47 * \author Mark Abraham <mark.j.abraham@gmail.com>
48 * \ingroup module_taskassignment
50 #include "gmxpre.h"
52 #include "taskassignment.h"
54 #include "config.h"
56 #include <string>
57 #include <vector>
59 #include "gromacs/hardware/hw_info.h"
60 #include "gromacs/mdrunutility/multisim.h"
61 #include "gromacs/mdtypes/commrec.h"
62 #include "gromacs/taskassignment/usergpuids.h"
63 #include "gromacs/utility/cstringutil.h"
64 #include "gromacs/utility/exceptions.h"
65 #include "gromacs/utility/fatalerror.h"
66 #include "gromacs/utility/gmxassert.h"
67 #include "gromacs/utility/gmxmpi.h"
68 #include "gromacs/utility/logger.h"
69 #include "gromacs/utility/physicalnodecommunicator.h"
70 #include "gromacs/utility/stringutil.h"
71 #include "gromacs/utility/sysinfo.h"
73 #include "findallgputasks.h"
74 #include "reportgpuusage.h"
76 namespace gmx
79 namespace
82 /*! \brief Build data structure of types of GPU tasks on a rank,
83 * together with the mapped GPU device IDs, for all GPU tasks on all
84 * the ranks of this node.
86 * \param[in] gpuTasksOnRanksOfThisNode For each rank on this node, the set of tasks
87 * that are eligible to run on GPUs.
88 * \param[in] gpuIds The user-supplied GPU IDs.
90 GpuTaskAssignments
91 buildTaskAssignment(const GpuTasksOnRanks &gpuTasksOnRanksOfThisNode,
92 ArrayRef<const int> gpuIds)
94 GpuTaskAssignments gpuTaskAssignmentOnRanksOfThisNode(gpuTasksOnRanksOfThisNode.size());
96 // Loop over the ranks on this node, and the tasks on each
97 // rank. For each task, take the next device ID from those
98 // provided by the user, to build a vector of mappings of task to
99 // ID, for each rank on this node. Note that if there have not
100 // been any GPU tasks identified, then gpuIds can be empty.
101 auto currentGpuId = gpuIds.begin();
102 auto gpuTaskAssignmentOnRank = gpuTaskAssignmentOnRanksOfThisNode.begin();
103 for (const auto &gpuTasksOnRank : gpuTasksOnRanksOfThisNode)
105 gpuTaskAssignmentOnRank->reserve(gpuTasksOnRank.size());
106 for (const auto &gpuTaskType : gpuTasksOnRank)
108 GMX_RELEASE_ASSERT(currentGpuId != gpuIds.end(), "Indexing out of range for GPU tasks");
109 gpuTaskAssignmentOnRank->push_back({gpuTaskType, *currentGpuId});
110 ++currentGpuId;
112 GMX_RELEASE_ASSERT(gpuTaskAssignmentOnRank->size() == gpuTasksOnRank.size(),
113 "Mismatch in number of GPU tasks on a rank with the number of elements in the resulting task assignment");
114 ++gpuTaskAssignmentOnRank;
117 return gpuTaskAssignmentOnRanksOfThisNode;
120 /*! \brief Return whether a GPU device is shared between any ranks.
122 * Sharing GPUs among multiple ranks is possible via either user or
123 * automated selection. */
124 bool isAnyGpuSharedBetweenRanks(const GpuTaskAssignments &gpuTaskAssignments)
126 // Loop over all ranks i, looking on all higher ranks j whether
127 // any tasks on them share GPU device IDs.
129 // TODO Should this functionality also consider whether tasks on
130 // the same rank are sharing a device?
131 for (size_t i = 0; i < gpuTaskAssignments.size(); ++i)
133 for (const auto &taskOnRankI : gpuTaskAssignments[i])
135 for (size_t j = i+1; j < gpuTaskAssignments.size(); ++j)
137 for (const auto &taskOnRankJ : gpuTaskAssignments[j])
139 if (taskOnRankI.deviceId_ == taskOnRankJ.deviceId_)
141 return true;
147 return false;
150 //! Logs to \c mdlog information that may help a user learn how to let mdrun make a task assignment that runs faster.
151 void logPerformanceHints(const MDLogger &mdlog,
152 size_t numCompatibleGpus,
153 size_t numGpuTasksOnThisNode,
154 const GpuTaskAssignments &gpuTaskAssignments)
156 if (numCompatibleGpus > numGpuTasksOnThisNode)
158 /* TODO In principle, this warning could be warranted only on
159 * some nodes, but we lack the infrastructure to do a good job
160 * of reporting that. */
161 GMX_LOG(mdlog.warning).asParagraph().
162 appendText("NOTE: You assigned the GPU tasks on a node such that some GPUs "
163 "available on that node are unused, which might not be optimal.");
166 if (isAnyGpuSharedBetweenRanks(gpuTaskAssignments))
168 GMX_LOG(mdlog.warning).asParagraph().
169 appendText("NOTE: You assigned the same GPU ID(s) to multiple ranks, which is a good idea if you have measured the performance of alternatives.");
173 //! Counts all the GPU tasks on this node.
174 size_t countGpuTasksOnThisNode(const GpuTasksOnRanks &gpuTasksOnRanksOfThisNode)
176 size_t numGpuTasksOnThisNode = 0;
177 for (const auto &gpuTasksOnRank : gpuTasksOnRanksOfThisNode)
179 numGpuTasksOnThisNode += gpuTasksOnRank.size();
181 return numGpuTasksOnThisNode;
184 } // namespace
186 GpuTaskAssignments::value_type
187 runTaskAssignment(const std::vector<int> &gpuIdsToUse,
188 const std::vector<int> &userGpuTaskAssignment,
189 const gmx_hw_info_t &hardwareInfo,
190 const MDLogger &mdlog,
191 const t_commrec *cr,
192 const gmx_multisim_t *ms,
193 const PhysicalNodeCommunicator &physicalNodeComm,
194 const std::vector<GpuTask> &gpuTasksOnThisRank,
195 bool useGpuForBonded,
196 PmeRunMode pmeRunMode)
198 /* Communicate among ranks on this node to find each task that can
199 * be executed on a GPU, on each rank. */
200 auto gpuTasksOnRanksOfThisNode = findAllGpuTasksOnThisNode(gpuTasksOnThisRank,
201 physicalNodeComm);
202 auto numGpuTasksOnThisNode = countGpuTasksOnThisNode(gpuTasksOnRanksOfThisNode);
204 GpuTaskAssignments taskAssignmentOnRanksOfThisNode;
207 // Use the GPU IDs from the user if they supplied
208 // them. Otherwise, choose from the compatible GPUs.
210 // GPU ID assignment strings, if provided, cover all the ranks
211 // on a node. If nodes or the process placement on them are
212 // heterogeneous, then the GMX_GPU_ID environment variable
213 // must be set by a user who also wishes to direct GPU ID
214 // assignment. Thus this implementation of task assignment
215 // can assume it has a GPU ID assignment appropriate for the
216 // node upon which its process is running.
218 // Valid GPU ID assignments are `an ordered set of digits that
219 // identify GPU device IDs (e.g. as understood by the GPU
220 // runtime, and subject to environment modification such as
221 // with CUDA_VISIBLE_DEVICES) that will be used for the
222 // GPU-suitable tasks on all of the ranks of that node.
223 ArrayRef<const int> gpuIdsForTaskAssignment;
224 std::vector<int> generatedGpuIds;
225 if (userGpuTaskAssignment.empty())
227 ArrayRef<const int> compatibleGpusToUse = gpuIdsToUse;
229 // enforce the single device/rank restriction
230 if (physicalNodeComm.size_ == 1 && !compatibleGpusToUse.empty())
232 compatibleGpusToUse = compatibleGpusToUse.subArray(0, 1);
235 // When doing automated assignment of GPU tasks to GPU
236 // IDs, even if we have more than one kind of GPU task, we
237 // do a simple round-robin assignment. That's not ideal,
238 // but we don't have any way to do a better job reliably.
239 generatedGpuIds = makeGpuIds(compatibleGpusToUse, numGpuTasksOnThisNode);
241 if ((numGpuTasksOnThisNode > gpuIdsToUse.size()) &&
242 (numGpuTasksOnThisNode % gpuIdsToUse.size() != 0))
244 // TODO Decorating the message with hostname should be
245 // the job of an error-reporting module.
246 char host[STRLEN];
247 gmx_gethostname(host, STRLEN);
249 GMX_THROW(InconsistentInputError
250 (formatString("There were %zu GPU tasks found on node %s, but %zu GPUs were "
251 "available. If the GPUs are equivalent, then it is usually best "
252 "to have a number of tasks that is a multiple of the number of GPUs. "
253 "You should reconsider your GPU task assignment, "
254 "number of ranks, or your use of the -nb, -pme, and -npme options, "
255 "perhaps after measuring the performance you can get.", numGpuTasksOnThisNode,
256 host, gpuIdsToUse.size())));
258 gpuIdsForTaskAssignment = generatedGpuIds;
260 else
262 if (numGpuTasksOnThisNode != userGpuTaskAssignment.size())
264 // TODO Decorating the message with hostname should be
265 // the job of an error-reporting module.
266 char host[STRLEN];
267 gmx_gethostname(host, STRLEN);
269 GMX_THROW(InconsistentInputError
270 (formatString("There were %zu GPU tasks assigned on node %s, but %zu GPU tasks were "
271 "identified, and these must match. Reconsider your GPU task assignment, "
272 "number of ranks, or your use of the -nb, -pme, and -npme options.", userGpuTaskAssignment.size(),
273 host, numGpuTasksOnThisNode)));
275 // Did the user choose compatible GPUs?
276 checkUserGpuIds(hardwareInfo.gpu_info, gpuIdsToUse, userGpuTaskAssignment);
278 gpuIdsForTaskAssignment = userGpuTaskAssignment;
280 taskAssignmentOnRanksOfThisNode =
281 buildTaskAssignment(gpuTasksOnRanksOfThisNode, gpuIdsForTaskAssignment);
284 catch (const std::exception &ex)
286 // TODO This implementation is quite similar to that of
287 // processExceptionAsFatalError (which implements
288 // GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR), but it is unclear
289 // how we should involve MPI in the implementation of error
290 // handling.
291 if (physicalNodeComm.rank_ == 0)
293 printFatalErrorMessage(stderr, ex);
296 if (PAR(cr))
298 #if GMX_MPI
299 MPI_Barrier(cr->mpi_comm_mysim);
300 #endif
302 if (isMultiSim(ms))
304 #if GMX_MPI
305 MPI_Barrier(ms->mpi_comm_masters);
306 #endif
309 gmx_exit_on_fatal_error(ExitType_Abort, 1);
312 reportGpuUsage(mdlog, taskAssignmentOnRanksOfThisNode,
313 numGpuTasksOnThisNode, physicalNodeComm.size_, cr->nnodes > 1,
314 useGpuForBonded, pmeRunMode);
316 // If the user chose a task assignment, give them some hints where appropriate.
317 if (!userGpuTaskAssignment.empty())
319 logPerformanceHints(mdlog, gpuIdsToUse.size(),
320 numGpuTasksOnThisNode,
321 taskAssignmentOnRanksOfThisNode);
324 return taskAssignmentOnRanksOfThisNode[physicalNodeComm.rank_];
326 // TODO There is no check that mdrun -nb gpu or -pme gpu or
327 // -gpu_id is actually being implemented such that nonbonded tasks
328 // are being run on compatible GPUs, on all applicable ranks. That
329 // would require communication.
332 } // namespace gmx