Move DeviceInfo into GPU traits
gromacs.git: src/gromacs/taskassignment/taskassignment.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief
 * Defines helper and factory functionality for task assignment.
 *
 * Note that the GPU ID assignment could potentially match many
 * different kinds of simulation setups, including ranks from multiple
 * simulations, ranks from the same simulation, and/or ranks with duty
 * only for particular tasks (e.g. PME-only ranks). Which GPU ID
 * assignments are valid will naturally depend on the other run-time
 * options given to mdrun, and the current capabilities of the
 * implementation.
 *
 * \author Mark Abraham <mark.j.abraham@gmail.com>
 * \ingroup module_taskassignment
 */
#include "gmxpre.h"

#include "taskassignment.h"

#include "config.h"

#include <algorithm>
#include <exception>
#include <string>
#include <vector>

#include "gromacs/domdec/domdec.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdrunutility/multisim.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/taskassignment/usergpuids.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/physicalnodecommunicator.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/utility/sysinfo.h"

#include "findallgputasks.h"
#include "reportgpuusage.h"
namespace gmx
{

namespace
{
/*! \brief Build data structure of types of GPU tasks on a rank,
 * together with the mapped GPU device IDs, for all GPU tasks on all
 * the ranks of this node.
 *
 * \param[in]   gpuTasksOnRanksOfThisNode  For each rank on this node, the set of tasks
 *                                         that are eligible to run on GPUs.
 * \param[in]   gpuIds                     The user-supplied GPU IDs.
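 *
 * As an illustrative example (not normative): if rank 0 on the node
 * has a Nonbonded task, rank 1 has Nonbonded and PME tasks, and
 * \c gpuIds is {0, 0, 1}, then the IDs are consumed in order, so
 * rank 0's task maps to device 0 and rank 1's tasks map to devices
 * 0 and 1, respectively.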
 */
std::vector<GpuTaskAssignment> buildTaskAssignment(const GpuTasksOnRanks& gpuTasksOnRanksOfThisNode,
                                                   ArrayRef<const int>    gpuIds)
{
    std::vector<GpuTaskAssignment> gpuTaskAssignmentOnRanksOfThisNode(gpuTasksOnRanksOfThisNode.size());

    // Loop over the ranks on this node, and the tasks on each
    // rank. For each task, take the next device ID from those
    // provided by the user, to build a vector of mappings of task to
    // ID, for each rank on this node. Note that if there have not
    // been any GPU tasks identified, then gpuIds can be empty.
    auto currentGpuId            = gpuIds.begin();
    auto gpuTaskAssignmentOnRank = gpuTaskAssignmentOnRanksOfThisNode.begin();
    for (const auto& gpuTasksOnRank : gpuTasksOnRanksOfThisNode)
    {
        gpuTaskAssignmentOnRank->reserve(gpuTasksOnRank.size());
        for (const auto& gpuTaskType : gpuTasksOnRank)
        {
            GMX_RELEASE_ASSERT(currentGpuId != gpuIds.end(), "Indexing out of range for GPU tasks");
            gpuTaskAssignmentOnRank->push_back({ gpuTaskType, *currentGpuId });
            ++currentGpuId;
        }
        GMX_RELEASE_ASSERT(gpuTaskAssignmentOnRank->size() == gpuTasksOnRank.size(),
                           "Mismatch in number of GPU tasks on a rank with the number of elements "
                           "in the resulting task assignment");
        ++gpuTaskAssignmentOnRank;
    }

    return gpuTaskAssignmentOnRanksOfThisNode;
}
/*! \brief Return whether a GPU device is shared between any ranks.
 *
 * Sharing GPUs among multiple ranks is possible via either user or
 * automated selection. */
bool isAnyGpuSharedBetweenRanks(ArrayRef<const GpuTaskAssignment> gpuTaskAssignments)
{
    // Loop over all ranks i, looking on all higher ranks j whether
    // any tasks on them share GPU device IDs.
    //
    // TODO Should this functionality also consider whether tasks on
    // the same rank are sharing a device?
    for (size_t i = 0; i < gpuTaskAssignments.size(); ++i)
    {
        for (const auto& taskOnRankI : gpuTaskAssignments[i])
        {
            for (size_t j = i + 1; j < gpuTaskAssignments.size(); ++j)
            {
                for (const auto& taskOnRankJ : gpuTaskAssignments[j])
                {
                    if (taskOnRankI.deviceId_ == taskOnRankJ.deviceId_)
                    {
                        return true;
                    }
                }
            }
        }
    }
    return false;
}

} // namespace
void GpuTaskAssignments::logPerformanceHints(const MDLogger& mdlog, size_t numCompatibleGpusOnThisNode)
{
    if (numCompatibleGpusOnThisNode > numGpuTasksOnThisNode_)
    {
        /* TODO In principle, this warning could be warranted only on
         * some nodes, but we lack the infrastructure to do a good job
         * of reporting that. */
        GMX_LOG(mdlog.warning)
                .asParagraph()
                .appendText(
                        "NOTE: You assigned the GPU tasks on a node such that some GPUs "
                        "available on that node are unused, which might not be optimal.");
    }

    if (isAnyGpuSharedBetweenRanks(assignmentForAllRanksOnThisNode_))
    {
        GMX_LOG(mdlog.warning)
                .asParagraph()
                .appendText(
                        "NOTE: You assigned the same GPU ID(s) to multiple ranks, which is a good "
                        "idea if you have measured the performance of alternatives.");
    }
}
namespace
{

//! Counts all the GPU tasks on this node.
size_t countGpuTasksOnThisNode(const GpuTasksOnRanks& gpuTasksOnRanksOfThisNode)
{
    size_t numGpuTasksOnThisNode = 0;
    for (const auto& gpuTasksOnRank : gpuTasksOnRanksOfThisNode)
    {
        numGpuTasksOnThisNode += gpuTasksOnRank.size();
    }
    return numGpuTasksOnThisNode;
}
/*! \brief Return on each rank the total count over all ranks of all
 * simulations. */
int countOverAllRanks(MPI_Comm comm, int countOnThisRank)
{
    int sum;
#if GMX_MPI
    int numRanks;
    MPI_Comm_size(comm, &numRanks);
    if (numRanks > 1)
    {
        MPI_Allreduce(&countOnThisRank, &sum, 1, MPI_INT, MPI_SUM, comm);
    }
    else
#else
    GMX_UNUSED_VALUE(comm);
#endif
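    // In an MPI build, the braced statement below is the body of the
    // dangling "else" above, so the fallback to the local count runs
    // only when there is a single rank; in a non-MPI build it is a
    // free-standing block that always runs.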
    {
        sum = countOnThisRank;
    }

    return sum;
}
/*! \brief Barrier over all ranks in \p comm */
void barrierOverAllRanks(MPI_Comm comm)
{
#if GMX_MPI
    int numRanks;
    MPI_Comm_size(comm, &numRanks);
    if (numRanks > 1)
    {
        MPI_Barrier(comm);
    }
#else
    GMX_UNUSED_VALUE(comm);
#endif
}

} // namespace
GpuTaskAssignmentsBuilder::GpuTaskAssignmentsBuilder() = default;

GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const std::vector<int>&         gpuIdsToUse,
                                                    const std::vector<int>&         userGpuTaskAssignment,
                                                    const gmx_hw_info_t&            hardwareInfo,
                                                    MPI_Comm                        gromacsWorldComm,
                                                    const PhysicalNodeCommunicator& physicalNodeComm,
                                                    const TaskTarget                nonbondedTarget,
                                                    const TaskTarget                pmeTarget,
                                                    const TaskTarget                bondedTarget,
                                                    const TaskTarget                updateTarget,
                                                    const bool                      useGpuForNonbonded,
                                                    const bool                      useGpuForPme,
                                                    bool                            rankHasPpTask,
                                                    bool                            rankHasPmeTask)
{
    size_t               numRanksOnThisNode = physicalNodeComm.size_;
    std::vector<GpuTask> gpuTasksOnThisRank = findGpuTasksOnThisRank(
            !gpuIdsToUse.empty(), nonbondedTarget, pmeTarget, bondedTarget, updateTarget,
            useGpuForNonbonded, useGpuForPme, rankHasPpTask, rankHasPmeTask);
    /* Communicate among ranks on this node to find each task that can
     * be executed on a GPU, on each rank. */
    auto gpuTasksOnRanksOfThisNode = findAllGpuTasksOnThisNode(gpuTasksOnThisRank, physicalNodeComm);
    size_t numGpuTasksOnThisNode   = countGpuTasksOnThisNode(gpuTasksOnRanksOfThisNode);

    std::exception_ptr             exceptionPtr;
    std::vector<GpuTaskAssignment> taskAssignmentOnRanksOfThisNode;
    try
    {
        // Use the GPU IDs from the user if they supplied
        // them. Otherwise, choose from the compatible GPUs.
        //
        // GPU ID assignment strings, if provided, cover all the ranks
        // on a node. If nodes or the process placement on them are
        // heterogeneous, then the GMX_GPU_ID environment variable
        // must be set by a user who also wishes to direct GPU ID
        // assignment. Thus this implementation of task assignment
        // can assume it has a GPU ID assignment appropriate for the
        // node upon which its process is running.
        //
        // Valid GPU ID assignments are an ordered set of digits that
        // identify GPU device IDs (e.g. as understood by the GPU
        // runtime, and subject to environment modification such as
        // with CUDA_VISIBLE_DEVICES) that will be used for the
        // GPU-suitable tasks on all of the ranks of that node.
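        //
        // For example, an assignment of "0011" on a node with four
        // GPU tasks would put the first two tasks found on the node
        // on device 0 and the remaining two on device 1, in the order
        // in which the tasks are discovered across the node's ranks.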
        ArrayRef<const int> gpuIdsForTaskAssignment;
        std::vector<int>    generatedGpuIds;
        if (userGpuTaskAssignment.empty())
        {
            ArrayRef<const int> compatibleGpusToUse = gpuIdsToUse;

            // enforce the single device/rank restriction
            if (numRanksOnThisNode == 1 && !compatibleGpusToUse.empty())
            {
                compatibleGpusToUse = compatibleGpusToUse.subArray(0, 1);
            }

            // When doing automated assignment of GPU tasks to GPU
            // IDs, even if we have more than one kind of GPU task, we
            // do a simple round-robin assignment. That's not ideal,
            // but we don't have any way to do a better job reliably.
            generatedGpuIds = makeGpuIds(compatibleGpusToUse, numGpuTasksOnThisNode);
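            // For example, assuming the round-robin behavior described
            // above, compatible devices {0, 1} and three GPU tasks on
            // this node would yield the ID sequence {0, 1, 0}.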

            if ((numGpuTasksOnThisNode > gpuIdsToUse.size())
                && (numGpuTasksOnThisNode % gpuIdsToUse.size() != 0))
            {
                // TODO Decorating the message with hostname should be
                // the job of an error-reporting module.
                char host[STRLEN];
                gmx_gethostname(host, STRLEN);

                GMX_THROW(InconsistentInputError(formatString(
                        "There were %zu GPU tasks found on node %s, but %zu GPUs were "
                        "available. If the GPUs are equivalent, then it is usually best "
                        "to have a number of tasks that is a multiple of the number of GPUs. "
                        "You should reconsider your GPU task assignment, "
                        "number of ranks, or your use of the -nb, -pme, and -npme options, "
                        "perhaps after measuring the performance you can get.",
                        numGpuTasksOnThisNode, host, gpuIdsToUse.size())));
            }
            gpuIdsForTaskAssignment = generatedGpuIds;
        }
        else
        {
            if (numGpuTasksOnThisNode != userGpuTaskAssignment.size())
            {
                // TODO Decorating the message with hostname should be
                // the job of an error-reporting module.
                char host[STRLEN];
                gmx_gethostname(host, STRLEN);

                GMX_THROW(InconsistentInputError(formatString(
                        "There were %zu GPU tasks assigned on node %s, but %zu GPU tasks were "
                        "identified, and these must match. Reconsider your GPU task assignment, "
                        "number of ranks, or your use of the -nb, -pme, and -npme options.",
                        userGpuTaskAssignment.size(), host, numGpuTasksOnThisNode)));
            }
            // Did the user choose compatible GPUs?
            checkUserGpuIds(hardwareInfo.gpu_info, gpuIdsToUse, userGpuTaskAssignment);

            gpuIdsForTaskAssignment = userGpuTaskAssignment;
        }
        taskAssignmentOnRanksOfThisNode =
                buildTaskAssignment(gpuTasksOnRanksOfThisNode, gpuIdsForTaskAssignment);
    }
    catch (...)
    {
        exceptionPtr = std::current_exception();
    }
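    // Count exceptions over all ranks so that every rank learns whether
    // task assignment failed anywhere, and all ranks can take part in
    // the collective shutdown below even when the failure occurred on
    // another rank.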
    int countOfExceptionsOnThisRank   = int(bool(exceptionPtr));
    int countOfExceptionsOverAllRanks = countOverAllRanks(gromacsWorldComm, countOfExceptionsOnThisRank);

    // Avoid all ranks spamming the error stream
    //
    // TODO improve this so that unique errors on different ranks
    // are all reported on one rank.
    if (countOfExceptionsOnThisRank > 0 && physicalNodeComm.rank_ == 0)
    {
        try
        {
            if (exceptionPtr)
            {
                std::rethrow_exception(exceptionPtr);
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
    }
    // TODO Global barrier so that MPI runtimes can
    // organize an orderly shutdown if one of the ranks has had to
    // issue a fatal error above. When we have MPI-aware error
    // handling and reporting, this should be improved (perhaps
    // centralized there).
    barrierOverAllRanks(gromacsWorldComm);
    if (countOfExceptionsOverAllRanks > 0)
    {
        gmx_fatal(FARGS,
                  "Exiting because task assignment failed. If there is no descriptive error "
                  "message in the terminal output, please report this failure as a bug.");
    }

    // TODO There is no check that mdrun -nb gpu or -pme gpu or
    // -gpu_id is actually being implemented such that nonbonded tasks
    // are being run on compatible GPUs, on all applicable ranks. That
    // would require communication.

    GpuTaskAssignments gpuTaskAssignments(hardwareInfo);
    gpuTaskAssignments.assignmentForAllRanksOnThisNode_ = taskAssignmentOnRanksOfThisNode;
    gpuTaskAssignments.indexOfThisRank_                 = physicalNodeComm.rank_;
    gpuTaskAssignments.numGpuTasksOnThisNode_           = numGpuTasksOnThisNode;
    gpuTaskAssignments.numRanksOnThisNode_              = numRanksOnThisNode;
    return gpuTaskAssignments;
}
GpuTaskAssignments::GpuTaskAssignments(const gmx_hw_info_t& hardwareInfo) :
    hardwareInfo_(hardwareInfo)
{
}
void GpuTaskAssignments::reportGpuUsage(const MDLogger& mdlog,
                                        bool            printHostName,
                                        bool            useGpuForBonded,
                                        PmeRunMode      pmeRunMode,
                                        bool            useGpuForUpdate)
{
    gmx::reportGpuUsage(mdlog, assignmentForAllRanksOnThisNode_, numGpuTasksOnThisNode_,
                        numRanksOnThisNode_, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
}
DeviceInformation* GpuTaskAssignments::initNonbondedDevice(const t_commrec* cr) const
{
    DeviceInformation*       deviceInfo        = nullptr;
    const GpuTaskAssignment& gpuTaskAssignment = assignmentForAllRanksOnThisNode_[indexOfThisRank_];

    // This works because only one task of each type per rank is currently permitted.
    auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
                                         hasTaskType<GpuTask::Nonbonded>);
    if (nbGpuTaskMapping != gpuTaskAssignment.end())
    {
        int deviceId = nbGpuTaskMapping->deviceId_;
        deviceInfo   = getDeviceInfo(hardwareInfo_.gpu_info, deviceId);
        init_gpu(deviceInfo);

        // TODO Setting up this sharing should probably be part of
        // init_domain_decomposition after further refactoring.
        if (DOMAINDECOMP(cr))
        {
            /* When we share GPUs over ranks, we need to know this for the DLB */
            dd_setup_dlb_resource_sharing(cr, deviceId);
        }
    }
    return deviceInfo;
}
DeviceInformation* GpuTaskAssignments::initPmeDevice() const
{
    DeviceInformation*       deviceInfo        = nullptr;
    const GpuTaskAssignment& gpuTaskAssignment = assignmentForAllRanksOnThisNode_[indexOfThisRank_];

    // This works because only one task of each type is currently permitted.
    auto pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
                                          hasTaskType<GpuTask::Pme>);
    const bool thisRankHasPmeGpuTask = (pmeGpuTaskMapping != gpuTaskAssignment.end());
    if (thisRankHasPmeGpuTask)
    {
        deviceInfo = getDeviceInfo(hardwareInfo_.gpu_info, pmeGpuTaskMapping->deviceId_);
        init_gpu(deviceInfo);
    }
    return deviceInfo;
}
bool GpuTaskAssignments::thisRankHasPmeGpuTask() const
{
    const GpuTaskAssignment& gpuTaskAssignment = assignmentForAllRanksOnThisNode_[indexOfThisRank_];

    auto pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
                                          hasTaskType<GpuTask::Pme>);
    const bool thisRankHasPmeGpuTask = (pmeGpuTaskMapping != gpuTaskAssignment.end());

    return thisRankHasPmeGpuTask;
}
bool GpuTaskAssignments::thisRankHasAnyGpuTask() const
{
    const GpuTaskAssignment& gpuTaskAssignment = assignmentForAllRanksOnThisNode_[indexOfThisRank_];

    const bool thisRankHasAnyGpuTask = !gpuTaskAssignment.empty();
    return thisRankHasAnyGpuTask;
}

} // namespace gmx