2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * Defines helper and factory functionality for task assignment.
39 * Note that the GPU ID assignment could potentially match many
40 * different kinds of simulation setups, including ranks from multiple
41 * simulations, ranks from the same simulation, and/or ranks with duty
42 * only for particular tasks (e.g. PME-only ranks). Which GPU ID
43 * assignments are valid will naturally depend on the other run-time
44 * options given to mdrun, and the current capabilities of the
47 * \author Mark Abraham <mark.j.abraham@gmail.com>
48 * \ingroup module_taskassignment
52 #include "taskassignment.h"
61 #include "gromacs/domdec/domdec.h"
62 #include "gromacs/gmxlib/network.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/hardware/hw_info.h"
65 #include "gromacs/mdrunutility/multisim.h"
66 #include "gromacs/mdtypes/commrec.h"
67 #include "gromacs/taskassignment/usergpuids.h"
68 #include "gromacs/utility/cstringutil.h"
69 #include "gromacs/utility/exceptions.h"
70 #include "gromacs/utility/fatalerror.h"
71 #include "gromacs/utility/gmxassert.h"
72 #include "gromacs/utility/gmxmpi.h"
73 #include "gromacs/utility/logger.h"
74 #include "gromacs/utility/physicalnodecommunicator.h"
75 #include "gromacs/utility/stringutil.h"
76 #include "gromacs/utility/sysinfo.h"
78 #include "findallgputasks.h"
79 #include "reportgpuusage.h"
87 /*! \brief Build data structure of types of GPU tasks on a rank,
88 * together with the mapped GPU device IDs, for all GPU tasks on all
89 * the ranks of this node.
91 * \param[in] gpuTasksOnRanksOfThisNode For each rank on this node, the set of tasks
92 * that are eligible to run on GPUs.
93 * \param[in] gpuIds The user-supplied GPU IDs.
95 std::vector
<GpuTaskAssignment
> buildTaskAssignment(const GpuTasksOnRanks
& gpuTasksOnRanksOfThisNode
,
96 ArrayRef
<const int> gpuIds
)
98 std::vector
<GpuTaskAssignment
> gpuTaskAssignmentOnRanksOfThisNode(gpuTasksOnRanksOfThisNode
.size());
100 // Loop over the ranks on this node, and the tasks on each
101 // rank. For each task, take the next device ID from those
102 // provided by the user, to build a vector of mappings of task to
103 // ID, for each rank on this node. Note that if there have not
104 // been any GPU tasks identified, then gpuIds can be empty.
105 auto currentGpuId
= gpuIds
.begin();
106 auto gpuTaskAssignmentOnRank
= gpuTaskAssignmentOnRanksOfThisNode
.begin();
107 for (const auto& gpuTasksOnRank
: gpuTasksOnRanksOfThisNode
)
109 gpuTaskAssignmentOnRank
->reserve(gpuTasksOnRank
.size());
110 for (const auto& gpuTaskType
: gpuTasksOnRank
)
112 GMX_RELEASE_ASSERT(currentGpuId
!= gpuIds
.end(), "Indexing out of range for GPU tasks");
113 gpuTaskAssignmentOnRank
->push_back({ gpuTaskType
, *currentGpuId
});
116 GMX_RELEASE_ASSERT(gpuTaskAssignmentOnRank
->size() == gpuTasksOnRank
.size(),
117 "Mismatch in number of GPU tasks on a rank with the number of elements "
118 "in the resulting task assignment");
119 ++gpuTaskAssignmentOnRank
;
122 return gpuTaskAssignmentOnRanksOfThisNode
;
125 /*! \brief Return whether a GPU device is shared between any ranks.
127 * Sharing GPUs among multiple ranks is possible via either user or
128 * automated selection. */
129 bool isAnyGpuSharedBetweenRanks(ArrayRef
<const GpuTaskAssignment
> gpuTaskAssignments
)
131 // Loop over all ranks i, looking on all higher ranks j whether
132 // any tasks on them share GPU device IDs.
134 // TODO Should this functionality also consider whether tasks on
135 // the same rank are sharing a device?
136 for (size_t i
= 0; i
< gpuTaskAssignments
.size(); ++i
)
138 for (const auto& taskOnRankI
: gpuTaskAssignments
[i
])
140 for (size_t j
= i
+ 1; j
< gpuTaskAssignments
.size(); ++j
)
142 for (const auto& taskOnRankJ
: gpuTaskAssignments
[j
])
144 if (taskOnRankI
.deviceId_
== taskOnRankJ
.deviceId_
)
157 void GpuTaskAssignments::logPerformanceHints(const MDLogger
& mdlog
, size_t numCompatibleGpusOnThisNode
)
159 if (numCompatibleGpusOnThisNode
> numGpuTasksOnThisNode_
)
161 /* TODO In principle, this warning could be warranted only on
162 * some nodes, but we lack the infrastructure to do a good job
163 * of reporting that. */
164 GMX_LOG(mdlog
.warning
)
167 "NOTE: You assigned the GPU tasks on a node such that some GPUs "
168 "available on that node are unused, which might not be optimal.");
171 if (isAnyGpuSharedBetweenRanks(assignmentForAllRanksOnThisNode_
))
173 GMX_LOG(mdlog
.warning
)
176 "NOTE: You assigned the same GPU ID(s) to multiple ranks, which is a good "
177 "idea if you have measured the performance of alternatives.");
184 //! Counts all the GPU tasks on this node.
185 size_t countGpuTasksOnThisNode(const GpuTasksOnRanks
& gpuTasksOnRanksOfThisNode
)
187 size_t numGpuTasksOnThisNode
= 0;
188 for (const auto& gpuTasksOnRank
: gpuTasksOnRanksOfThisNode
)
190 numGpuTasksOnThisNode
+= gpuTasksOnRank
.size();
192 return numGpuTasksOnThisNode
;
195 /*! \brief Return on each rank the total count over all ranks of all
197 int countOverAllRanks(MPI_Comm comm
, int countOnThisRank
)
202 MPI_Comm_size(comm
, &numRanks
);
205 MPI_Allreduce(&countOnThisRank
, &sum
, 1, MPI_INT
, MPI_SUM
, comm
);
209 GMX_UNUSED_VALUE(comm
);
212 sum
= countOnThisRank
;
218 /*! \brief Barrier over all rank in \p comm */
219 void barrierOverAllRanks(MPI_Comm comm
)
223 MPI_Comm_size(comm
, &numRanks
);
229 GMX_UNUSED_VALUE(comm
);
235 GpuTaskAssignmentsBuilder::GpuTaskAssignmentsBuilder() = default;
237 GpuTaskAssignments
GpuTaskAssignmentsBuilder::build(const std::vector
<int>& gpuIdsToUse
,
238 const std::vector
<int>& userGpuTaskAssignment
,
239 const gmx_hw_info_t
& hardwareInfo
,
240 MPI_Comm gromacsWorldComm
,
241 const PhysicalNodeCommunicator
& physicalNodeComm
,
242 const TaskTarget nonbondedTarget
,
243 const TaskTarget pmeTarget
,
244 const TaskTarget bondedTarget
,
245 const TaskTarget updateTarget
,
246 const bool useGpuForNonbonded
,
247 const bool useGpuForPme
,
251 size_t numRanksOnThisNode
= physicalNodeComm
.size_
;
252 std::vector
<GpuTask
> gpuTasksOnThisRank
= findGpuTasksOnThisRank(
253 !gpuIdsToUse
.empty(), nonbondedTarget
, pmeTarget
, bondedTarget
, updateTarget
,
254 useGpuForNonbonded
, useGpuForPme
, rankHasPpTask
, rankHasPmeTask
);
255 /* Communicate among ranks on this node to find each task that can
256 * be executed on a GPU, on each rank. */
257 auto gpuTasksOnRanksOfThisNode
= findAllGpuTasksOnThisNode(gpuTasksOnThisRank
, physicalNodeComm
);
258 size_t numGpuTasksOnThisNode
= countGpuTasksOnThisNode(gpuTasksOnRanksOfThisNode
);
260 std::exception_ptr exceptionPtr
;
261 std::vector
<GpuTaskAssignment
> taskAssignmentOnRanksOfThisNode
;
264 // Use the GPU IDs from the user if they supplied
265 // them. Otherwise, choose from the compatible GPUs.
267 // GPU ID assignment strings, if provided, cover all the ranks
268 // on a node. If nodes or the process placement on them are
269 // heterogeneous, then the GMX_GPU_ID environment variable
270 // must be set by a user who also wishes to direct GPU ID
271 // assignment. Thus this implementation of task assignment
272 // can assume it has a GPU ID assignment appropriate for the
273 // node upon which its process is running.
275 // Valid GPU ID assignments are `an ordered set of digits that
276 // identify GPU device IDs (e.g. as understood by the GPU
277 // runtime, and subject to environment modification such as
278 // with CUDA_VISIBLE_DEVICES) that will be used for the
279 // GPU-suitable tasks on all of the ranks of that node.
280 ArrayRef
<const int> gpuIdsForTaskAssignment
;
281 std::vector
<int> generatedGpuIds
;
282 if (userGpuTaskAssignment
.empty())
284 ArrayRef
<const int> compatibleGpusToUse
= gpuIdsToUse
;
286 // enforce the single device/rank restriction
287 if (numRanksOnThisNode
== 1 && !compatibleGpusToUse
.empty())
289 compatibleGpusToUse
= compatibleGpusToUse
.subArray(0, 1);
292 // When doing automated assignment of GPU tasks to GPU
293 // IDs, even if we have more than one kind of GPU task, we
294 // do a simple round-robin assignment. That's not ideal,
295 // but we don't have any way to do a better job reliably.
296 generatedGpuIds
= makeGpuIds(compatibleGpusToUse
, numGpuTasksOnThisNode
);
298 if ((numGpuTasksOnThisNode
> gpuIdsToUse
.size())
299 && (numGpuTasksOnThisNode
% gpuIdsToUse
.size() != 0))
301 // TODO Decorating the message with hostname should be
302 // the job of an error-reporting module.
304 gmx_gethostname(host
, STRLEN
);
306 GMX_THROW(InconsistentInputError(formatString(
307 "There were %zu GPU tasks found on node %s, but %zu GPUs were "
308 "available. If the GPUs are equivalent, then it is usually best "
309 "to have a number of tasks that is a multiple of the number of GPUs. "
310 "You should reconsider your GPU task assignment, "
311 "number of ranks, or your use of the -nb, -pme, and -npme options, "
312 "perhaps after measuring the performance you can get.",
313 numGpuTasksOnThisNode
, host
, gpuIdsToUse
.size())));
315 gpuIdsForTaskAssignment
= generatedGpuIds
;
319 if (numGpuTasksOnThisNode
!= userGpuTaskAssignment
.size())
321 // TODO Decorating the message with hostname should be
322 // the job of an error-reporting module.
324 gmx_gethostname(host
, STRLEN
);
326 GMX_THROW(InconsistentInputError(formatString(
327 "There were %zu GPU tasks assigned on node %s, but %zu GPU tasks were "
328 "identified, and these must match. Reconsider your GPU task assignment, "
329 "number of ranks, or your use of the -nb, -pme, and -npme options.",
330 userGpuTaskAssignment
.size(), host
, numGpuTasksOnThisNode
)));
332 // Did the user choose compatible GPUs?
333 checkUserGpuIds(hardwareInfo
.gpu_info
, gpuIdsToUse
, userGpuTaskAssignment
);
335 gpuIdsForTaskAssignment
= userGpuTaskAssignment
;
337 taskAssignmentOnRanksOfThisNode
=
338 buildTaskAssignment(gpuTasksOnRanksOfThisNode
, gpuIdsForTaskAssignment
);
342 exceptionPtr
= std::current_exception();
344 int countOfExceptionsOnThisRank
= int(bool(exceptionPtr
));
345 int countOfExceptionsOverAllRanks
= countOverAllRanks(gromacsWorldComm
, countOfExceptionsOnThisRank
);
347 // Avoid all ranks spamming the error stream
349 // TODO improve this so that unique errors on different ranks
350 // are all reported on one rank.
351 if (countOfExceptionsOnThisRank
> 0 && physicalNodeComm
.rank_
== 0)
357 std::rethrow_exception(exceptionPtr
);
360 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
362 // TODO Global barrier so that MPI runtimes can
363 // organize an orderly shutdown if one of the ranks has had to
364 // issue a fatal error above. When we have MPI-aware error
365 // handling and reporting, this should be improved (perhaps
366 // centralized there).
367 barrierOverAllRanks(gromacsWorldComm
);
368 if (countOfExceptionsOverAllRanks
> 0)
371 "Exiting because task assignment failed. If there is no descriptive error "
372 "message in the terminal output, please report this failure as a bug.");
375 // TODO There is no check that mdrun -nb gpu or -pme gpu or
376 // -gpu_id is actually being implemented such that nonbonded tasks
377 // are being run on compatible GPUs, on all applicable ranks. That
378 // would require communication.
380 GpuTaskAssignments
gpuTaskAssignments(hardwareInfo
);
381 gpuTaskAssignments
.assignmentForAllRanksOnThisNode_
= taskAssignmentOnRanksOfThisNode
;
382 gpuTaskAssignments
.indexOfThisRank_
= physicalNodeComm
.rank_
;
383 gpuTaskAssignments
.numGpuTasksOnThisNode_
= numGpuTasksOnThisNode
;
384 gpuTaskAssignments
.numRanksOnThisNode_
= numRanksOnThisNode
;
385 return gpuTaskAssignments
;
388 GpuTaskAssignments::GpuTaskAssignments(const gmx_hw_info_t
& hardwareInfo
) :
389 hardwareInfo_(hardwareInfo
)
393 void GpuTaskAssignments::reportGpuUsage(const MDLogger
& mdlog
,
395 bool useGpuForBonded
,
396 PmeRunMode pmeRunMode
,
397 bool useGpuForUpdate
)
399 gmx::reportGpuUsage(mdlog
, assignmentForAllRanksOnThisNode_
, numGpuTasksOnThisNode_
,
400 numRanksOnThisNode_
, printHostName
, useGpuForBonded
, pmeRunMode
, useGpuForUpdate
);
403 DeviceInformation
* GpuTaskAssignments::initNonbondedDevice(const t_commrec
* cr
) const
405 DeviceInformation
* deviceInfo
= nullptr;
406 const GpuTaskAssignment
& gpuTaskAssignment
= assignmentForAllRanksOnThisNode_
[indexOfThisRank_
];
408 // This works because only one task of each type per rank is currently permitted.
409 auto nbGpuTaskMapping
= std::find_if(gpuTaskAssignment
.begin(), gpuTaskAssignment
.end(),
410 hasTaskType
<GpuTask::Nonbonded
>);
411 if (nbGpuTaskMapping
!= gpuTaskAssignment
.end())
413 int deviceId
= nbGpuTaskMapping
->deviceId_
;
414 deviceInfo
= getDeviceInfo(hardwareInfo_
.gpu_info
, deviceId
);
415 init_gpu(deviceInfo
);
417 // TODO Setting up this sharing should probably part of
418 // init_domain_decomposition after further refactoring.
419 if (DOMAINDECOMP(cr
))
421 /* When we share GPUs over ranks, we need to know this for the DLB */
422 dd_setup_dlb_resource_sharing(cr
, deviceId
);
428 DeviceInformation
* GpuTaskAssignments::initPmeDevice() const
430 DeviceInformation
* deviceInfo
= nullptr;
431 const GpuTaskAssignment
& gpuTaskAssignment
= assignmentForAllRanksOnThisNode_
[indexOfThisRank_
];
433 // This works because only one task of each type is currently permitted.
434 auto pmeGpuTaskMapping
= std::find_if(gpuTaskAssignment
.begin(), gpuTaskAssignment
.end(),
435 hasTaskType
<GpuTask::Pme
>);
436 const bool thisRankHasPmeGpuTask
= (pmeGpuTaskMapping
!= gpuTaskAssignment
.end());
437 if (thisRankHasPmeGpuTask
)
439 deviceInfo
= getDeviceInfo(hardwareInfo_
.gpu_info
, pmeGpuTaskMapping
->deviceId_
);
440 init_gpu(deviceInfo
);
445 bool GpuTaskAssignments::thisRankHasPmeGpuTask() const
447 const GpuTaskAssignment
& gpuTaskAssignment
= assignmentForAllRanksOnThisNode_
[indexOfThisRank_
];
449 auto pmeGpuTaskMapping
= std::find_if(gpuTaskAssignment
.begin(), gpuTaskAssignment
.end(),
450 hasTaskType
<GpuTask::Pme
>);
451 const bool thisRankHasPmeGpuTask
= (pmeGpuTaskMapping
!= gpuTaskAssignment
.end());
453 return thisRankHasPmeGpuTask
;
456 bool GpuTaskAssignments::thisRankHasAnyGpuTask() const
458 const GpuTaskAssignment
& gpuTaskAssignment
= assignmentForAllRanksOnThisNode_
[indexOfThisRank_
];
460 const bool thisRankHasAnyGpuTask
= !gpuTaskAssignment
.empty();
461 return thisRankHasAnyGpuTask
;