src/gromacs/taskassignment/taskassignment.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief
  37  * Defines helper and factory functionality for task assignment.
  38  *
  39  * Note that the GPU ID assignment could potentially match many
  40  * different kinds of simulation setups, including ranks from multiple
  41  * simulations, ranks from the same simulation, and/or ranks with duty
  42  * only for particular tasks (e.g. PME-only ranks). Which GPU ID
  43  * assignments are valid will naturally depend on the other run-time
  44  * options given to mdrun, and the current capabilities of the
  45  * implementation.
  46  *
  47  * \author Mark Abraham <mark.j.abraham@gmail.com>
  48  * \ingroup module_taskassignment
  49  */
  50 #include "gmxpre.h"
  51
  52 #include "taskassignment.h"
  53
  54 #include "config.h"
  55
  56 #include <string>
  57 #include <vector>
  58
  59 #include "gromacs/hardware/hw_info.h"
  60 #include "gromacs/mdrunutility/multisim.h"
  61 #include "gromacs/mdtypes/commrec.h"
  62 #include "gromacs/taskassignment/usergpuids.h"
  63 #include "gromacs/utility/cstringutil.h"
  64 #include "gromacs/utility/exceptions.h"
  65 #include "gromacs/utility/fatalerror.h"
  66 #include "gromacs/utility/gmxassert.h"
  67 #include "gromacs/utility/gmxmpi.h"
  68 #include "gromacs/utility/logger.h"
  69 #include "gromacs/utility/physicalnodecommunicator.h"
  70 #include "gromacs/utility/stringutil.h"
  71 #include "gromacs/utility/sysinfo.h"
  72
  73 #include "findallgputasks.h"
  74 #include "reportgpuusage.h"
  75
  76 namespace gmx
  77 {
  78
  79 namespace
  80 {
  81
  82 /*! \brief Build data structure of types of GPU tasks on a rank,
  83  * together with the mapped GPU device IDs, for all GPU tasks on all
  84  * the ranks of this node.
  85  *
  86  * \param[in]   gpuTasksOnRanksOfThisNode  For each rank on this node, the set of tasks
  87  *                                         that are eligible to run on GPUs.
  88  * \param[in]   gpuIds                     The user-supplied GPU IDs.
  89  */
  90 GpuTaskAssignments
  91 buildTaskAssignment(const GpuTasksOnRanks  &gpuTasksOnRanksOfThisNode,
  92                     ArrayRef<const int>     gpuIds)
  93 {
  94     GpuTaskAssignments gpuTaskAssignmentOnRanksOfThisNode(gpuTasksOnRanksOfThisNode.size());
  95
  96     // Loop over the ranks on this node, and the tasks on each
  97     // rank. For each task, take the next device ID from those
  98     // provided by the user, to build a vector of mappings of task to
  99     // ID, for each rank on this node. Note that if there have not
 100     // been any GPU tasks identified, then gpuIds can be empty.
 101     auto               currentGpuId            = gpuIds.begin();
 102     auto               gpuTaskAssignmentOnRank = gpuTaskAssignmentOnRanksOfThisNode.begin();
 103     for (const auto &gpuTasksOnRank : gpuTasksOnRanksOfThisNode)
 104     {
 105         gpuTaskAssignmentOnRank->reserve(gpuTasksOnRank.size());
 106         for (const auto &gpuTaskType : gpuTasksOnRank)
 107         {
 108             GMX_RELEASE_ASSERT(currentGpuId != gpuIds.end(), "Indexing out of range for GPU tasks");
 109             gpuTaskAssignmentOnRank->push_back({gpuTaskType, *currentGpuId});
 110             ++currentGpuId;
 111         }
 112         GMX_RELEASE_ASSERT(gpuTaskAssignmentOnRank->size() == gpuTasksOnRank.size(),
 113                            "Mismatch in number of GPU tasks on a rank with the number of elements in the resulting task assignment");
 114         ++gpuTaskAssignmentOnRank;
 115     }
 116
 117     return gpuTaskAssignmentOnRanksOfThisNode;
 118 }
 119
 120 /*! \brief Return whether a GPU device is shared between any ranks.
 121  *
 122  * Sharing GPUs among multiple ranks is possible via either user or
 123  * automated selection. */
 124 bool isAnyGpuSharedBetweenRanks(const GpuTaskAssignments &gpuTaskAssignments)
 125 {
 126     // Loop over all ranks i, looking on all higher ranks j whether
 127     // any tasks on them share GPU device IDs.
 128     //
 129     // TODO Should this functionality also consider whether tasks on
 130     // the same rank are sharing a device?
 131     for (size_t i = 0; i < gpuTaskAssignments.size(); ++i)
 132     {
 133         for (const auto &taskOnRankI : gpuTaskAssignments[i])
 134         {
 135             for (size_t j = i+1; j < gpuTaskAssignments.size(); ++j)
 136             {
 137                 for (const auto &taskOnRankJ : gpuTaskAssignments[j])
 138                 {
 139                     if (taskOnRankI.deviceId_ == taskOnRankJ.deviceId_)
 140                     {
 141                         return true;
 142                     }
 143                 }
 144             }
 145         }
 146     }
 147     return false;
 148 }
 149
 150 //! Logs to \c mdlog information that may help a user learn how to let mdrun make a task assignment that runs faster.
 151 void logPerformanceHints(const MDLogger           &mdlog,
 152                          size_t                    numCompatibleGpus,
 153                          size_t                    numGpuTasksOnThisNode,
 154                          const GpuTaskAssignments &gpuTaskAssignments)
 155 {
 156     if (numCompatibleGpus > numGpuTasksOnThisNode)
 157     {
 158         /* TODO In principle, this warning could be warranted only on
 159          * some nodes, but we lack the infrastructure to do a good job
 160          * of reporting that. */
 161         GMX_LOG(mdlog.warning).asParagraph().
 162             appendText("NOTE: You assigned the GPU tasks on a node such that some GPUs "
 163                        "available on that node are unused, which might not be optimal.");
 164     }
 165
 166     if (isAnyGpuSharedBetweenRanks(gpuTaskAssignments))
 167     {
 168         GMX_LOG(mdlog.warning).asParagraph().
 169             appendText("NOTE: You assigned the same GPU ID(s) to multiple ranks, which is a good idea if you have measured the performance of alternatives.");
 170     }
 171 }
 172
 173 //! Counts all the GPU tasks on this node.
 174 size_t countGpuTasksOnThisNode(const GpuTasksOnRanks &gpuTasksOnRanksOfThisNode)
 175 {
 176     size_t numGpuTasksOnThisNode = 0;
 177     for (const auto &gpuTasksOnRank : gpuTasksOnRanksOfThisNode)
 178     {
 179         numGpuTasksOnThisNode += gpuTasksOnRank.size();
 180     }
 181     return numGpuTasksOnThisNode;
 182 }
 183
 184 }   // namespace
 185
 186 GpuTaskAssignments::value_type
 187 runTaskAssignment(const std::vector<int>         &gpuIdsToUse,
 188                   const std::vector<int>         &userGpuTaskAssignment,
 189                   const gmx_hw_info_t            &hardwareInfo,
 190                   const MDLogger                 &mdlog,
 191                   const t_commrec                *cr,
 192                   const gmx_multisim_t           *ms,
 193                   const PhysicalNodeCommunicator &physicalNodeComm,
 194                   const std::vector<GpuTask>     &gpuTasksOnThisRank,
 195                   bool                            useGpuForBonded,
 196                   PmeRunMode                      pmeRunMode)
 197 {
 198     /* Communicate among ranks on this node to find each task that can
 199      * be executed on a GPU, on each rank. */
 200     auto               gpuTasksOnRanksOfThisNode = findAllGpuTasksOnThisNode(gpuTasksOnThisRank,
 201                                                                              physicalNodeComm);
 202     auto               numGpuTasksOnThisNode = countGpuTasksOnThisNode(gpuTasksOnRanksOfThisNode);
 203
 204     GpuTaskAssignments taskAssignmentOnRanksOfThisNode;
 205     try
 206     {
 207         // Use the GPU IDs from the user if they supplied
 208         // them. Otherwise, choose from the compatible GPUs.
 209         //
 210         // GPU ID assignment strings, if provided, cover all the ranks
 211         // on a node. If nodes or the process placement on them are
 212         // heterogeneous, then the GMX_GPU_ID environment variable
 213         // must be set by a user who also wishes to direct GPU ID
 214         // assignment.  Thus this implementation of task assignment
 215         // can assume it has a GPU ID assignment appropriate for the
 216         // node upon which its process is running.
 217         //
 218         // Valid GPU ID assignments are `an ordered set of digits that
 219         // identify GPU device IDs (e.g. as understood by the GPU
 220         // runtime, and subject to environment modification such as
 221         // with CUDA_VISIBLE_DEVICES) that will be used for the
 222         // GPU-suitable tasks on all of the ranks of that node.
 223         ArrayRef<const int> gpuIdsForTaskAssignment;
 224         std::vector<int>    generatedGpuIds;
 225         if (userGpuTaskAssignment.empty())
 226         {
 227             ArrayRef<const int> compatibleGpusToUse = gpuIdsToUse;
 228
 229             // enforce the single device/rank restriction
 230             if (physicalNodeComm.size_ == 1 && !compatibleGpusToUse.empty())
 231             {
 232                 compatibleGpusToUse = compatibleGpusToUse.subArray(0, 1);
 233             }
 234
 235             // When doing automated assignment of GPU tasks to GPU
 236             // IDs, even if we have more than one kind of GPU task, we
 237             // do a simple round-robin assignment. That's not ideal,
 238             // but we don't have any way to do a better job reliably.
 239             generatedGpuIds         = makeGpuIds(compatibleGpusToUse, numGpuTasksOnThisNode);
 240
 241             if ((numGpuTasksOnThisNode > gpuIdsToUse.size()) &&
 242                 (numGpuTasksOnThisNode % gpuIdsToUse.size() != 0))
 243             {
 244                 // TODO Decorating the message with hostname should be
 245                 // the job of an error-reporting module.
 246                 char host[STRLEN];
 247                 gmx_gethostname(host, STRLEN);
 248
 249                 GMX_THROW(InconsistentInputError
 250                               (formatString("There were %zu GPU tasks found on node %s, but %zu GPUs were "
 251                                             "available. If the GPUs are equivalent, then it is usually best "
 252                                             "to have a number of tasks that is a multiple of the number of GPUs. "
 253                                             "You should reconsider your GPU task assignment, "
 254                                             "number of ranks, or your use of the -nb, -pme, and -npme options, "
 255                                             "perhaps after measuring the performance you can get.", numGpuTasksOnThisNode,
 256                                             host, gpuIdsToUse.size())));
 257             }
 258             gpuIdsForTaskAssignment = generatedGpuIds;
 259         }
 260         else
 261         {
 262             if (numGpuTasksOnThisNode != userGpuTaskAssignment.size())
 263             {
 264                 // TODO Decorating the message with hostname should be
 265                 // the job of an error-reporting module.
 266                 char host[STRLEN];
 267                 gmx_gethostname(host, STRLEN);
 268
 269                 GMX_THROW(InconsistentInputError
 270                               (formatString("There were %zu GPU tasks assigned on node %s, but %zu GPU tasks were "
 271                                             "identified, and these must match. Reconsider your GPU task assignment, "
 272                                             "number of ranks, or your use of the -nb, -pme, and -npme options.", userGpuTaskAssignment.size(),
 273                                             host, numGpuTasksOnThisNode)));
 274             }
 275             // Did the user choose compatible GPUs?
 276             checkUserGpuIds(hardwareInfo.gpu_info, gpuIdsToUse, userGpuTaskAssignment);
 277
 278             gpuIdsForTaskAssignment = userGpuTaskAssignment;
 279         }
 280         taskAssignmentOnRanksOfThisNode =
 281             buildTaskAssignment(gpuTasksOnRanksOfThisNode, gpuIdsForTaskAssignment);
 282
 283     }
 284     catch (const std::exception &ex)
 285     {
 286         // TODO This implementation is quite similar to that of
 287         // processExceptionAsFatalError (which implements
 288         // GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR), but it is unclear
 289         // how we should involve MPI in the implementation of error
 290         // handling.
 291         if (physicalNodeComm.rank_ == 0)
 292         {
 293             printFatalErrorMessage(stderr, ex);
 294         }
 295
 296         if (PAR(cr))
 297         {
 298 #if GMX_MPI
 299             MPI_Barrier(cr->mpi_comm_mysim);
 300 #endif
 301         }
 302         if (isMultiSim(ms))
 303         {
 304 #if GMX_MPI
 305             MPI_Barrier(ms->mpi_comm_masters);
 306 #endif
 307         }
 308
 309         gmx_exit_on_fatal_error(ExitType_Abort, 1);
 310     }
 311
 312     reportGpuUsage(mdlog, taskAssignmentOnRanksOfThisNode,
 313                    numGpuTasksOnThisNode, physicalNodeComm.size_, cr->nnodes > 1,
 314                    useGpuForBonded, pmeRunMode);
 315
 316     // If the user chose a task assignment, give them some hints where appropriate.
 317     if (!userGpuTaskAssignment.empty())
 318     {
 319         logPerformanceHints(mdlog, gpuIdsToUse.size(),
 320                             numGpuTasksOnThisNode,
 321                             taskAssignmentOnRanksOfThisNode);
 322     }
 323
 324     return taskAssignmentOnRanksOfThisNode[physicalNodeComm.rank_];
 325
 326     // TODO There is no check that mdrun -nb gpu or -pme gpu or
 327     // -gpu_id is actually being implemented such that nonbonded tasks
 328     // are being run on compatible GPUs, on all applicable ranks. That
 329     // would require communication.
 330 }
 331
 332 }  // namespace gmx