src/gromacs/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010-2018, The GROMACS development team.
   5  * Copyright (c) 2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \file
  37  *  \brief Define functions for detection and initialization for CUDA devices.
  38  *
  39  *  \author Szilard Pall <pall.szilard@gmail.com>
  40  */
  41
  42 #include "gmxpre.h"
  43
  44 #include "gpu_utils.h"
  45
  46 #include <assert.h>
  47 #include <stdio.h>
  48 #include <stdlib.h>
  49
  50 #include <cuda_profiler_api.h>
  51
  52 #include "gromacs/gpu_utils/cudautils.cuh"
  53 #include "gromacs/gpu_utils/pmalloc_cuda.h"
  54 #include "gromacs/hardware/gpu_hw_info.h"
  55 #include "gromacs/utility/basedefinitions.h"
  56 #include "gromacs/utility/cstringutil.h"
  57 #include "gromacs/utility/exceptions.h"
  58 #include "gromacs/utility/fatalerror.h"
  59 #include "gromacs/utility/gmxassert.h"
  60 #include "gromacs/utility/logger.h"
  61 #include "gromacs/utility/programcontext.h"
  62 #include "gromacs/utility/smalloc.h"
  63 #include "gromacs/utility/snprintf.h"
  64 #include "gromacs/utility/stringutil.h"
  65
  66 /*! \internal \brief
  67  * Max number of devices supported by CUDA (for consistency checking).
  68  *
  69  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  70  */
  71 static int cuda_max_device_count = 32;
  72
  73 static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr));
  74
  75 /** Dummy kernel used for sanity checking. */
  76 static __global__ void k_dummy_test(void) {}
  77
  78 static void checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
  79 {
  80     cudaFuncAttributes attributes;
  81     cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
  82
  83     if (cudaErrorInvalidDeviceFunction == stat)
  84     {
  85         gmx_fatal(FARGS,
  86                   "The %s binary does not include support for the CUDA architecture of a "
  87                   "detected GPU: %s, ID #%d (compute capability %d.%d). "
  88                   "By default, GROMACS supports all architectures of compute "
  89                   "capability >= 3.0, so your GPU "
  90                   "might be rare, or some architectures were disabled in the build. "
  91                   "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
  92                   "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. "
  93                   "To work around this error, use the CUDA_VISIBLE_DEVICES environment"
  94                   "variable to pass a list of GPUs that excludes the ID %d.",
  95                   gmx::getProgramContext().displayName(), deviceProp.name, deviceId,
  96                   deviceProp.major, deviceProp.minor, deviceId);
  97     }
  98
  99     CU_RET_ERR(stat, "cudaFuncGetAttributes failed");
 100 }
 101
 102 bool isHostMemoryPinned(const void* h_ptr)
 103 {
 104     cudaPointerAttributes memoryAttributes;
 105     cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
 106
 107     bool result = false;
 108     switch (stat)
 109     {
 110         case cudaSuccess: result = true; break;
 111
 112         case cudaErrorInvalidValue:
 113             // If the buffer was not pinned, then it will not be recognized by CUDA at all
 114             result = false;
 115             // Reset the last error status
 116             cudaGetLastError();
 117             break;
 118
 119         default: CU_RET_ERR(stat, "Unexpected CUDA error");
 120     }
 121     return result;
 122 }
 123
 124 /*!
 125  * \brief Runs GPU sanity checks.
 126  *
 127  * Runs a series of checks to determine that the given GPU and underlying CUDA
 128  * driver/runtime functions properly.
 129  *
 130  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 131  * \param[in]  dev_prop    The device properties structure
 132  * \returns                0 if the device looks OK
 133  *
 134  * TODO: introduce errors codes and handle errors more smoothly.
 135  */
 136 static int do_sanity_checks(int dev_id, const cudaDeviceProp& dev_prop)
 137 {
 138     cudaError_t cu_err;
 139     int         dev_count, id;
 140
 141     cu_err = cudaGetDeviceCount(&dev_count);
 142     if (cu_err != cudaSuccess)
 143     {
 144         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err));
 145         return -1;
 146     }
 147
 148     /* no CUDA compatible device at all */
 149     if (dev_count == 0)
 150     {
 151         return -1;
 152     }
 153
 154     /* things might go horribly wrong if cudart is not compatible with the driver */
 155     if (dev_count < 0 || dev_count > cuda_max_device_count)
 156     {
 157         return -1;
 158     }
 159
 160     if (dev_id == -1) /* device already selected let's not destroy the context */
 161     {
 162         cu_err = cudaGetDevice(&id);
 163         if (cu_err != cudaSuccess)
 164         {
 165             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err));
 166             return -1;
 167         }
 168     }
 169     else
 170     {
 171         id = dev_id;
 172         if (id > dev_count - 1) /* pfff there's no such device */
 173         {
 174             fprintf(stderr,
 175                     "The requested device with id %d does not seem to exist (device count=%d)\n",
 176                     dev_id, dev_count);
 177             return -1;
 178         }
 179     }
 180
 181     /* both major & minor is 9999 if no CUDA capable devices are present */
 182     if (dev_prop.major == 9999 && dev_prop.minor == 9999)
 183     {
 184         return -1;
 185     }
 186     /* we don't care about emulation mode */
 187     if (dev_prop.major == 0)
 188     {
 189         return -1;
 190     }
 191
 192     if (id != -1)
 193     {
 194         cu_err = cudaSetDevice(id);
 195         if (cu_err != cudaSuccess)
 196         {
 197             fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id,
 198                     cudaGetErrorString(cu_err));
 199             return -1;
 200         }
 201     }
 202
 203     /* try to execute a dummy kernel */
 204     checkCompiledTargetCompatibility(dev_id, dev_prop);
 205
 206     KernelLaunchConfig config;
 207     config.blockSize[0]       = 512;
 208     const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
 209     launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
 210     if (cudaDeviceSynchronize() != cudaSuccess)
 211     {
 212         return -1;
 213     }
 214
 215     /* destroy context if we created one */
 216     if (id != -1)
 217     {
 218         cu_err = cudaDeviceReset();
 219         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 220     }
 221
 222     return 0;
 223 }
 224
 225 void init_gpu(const DeviceInformation* deviceInfo)
 226 {
 227     cudaError_t stat;
 228
 229     assert(deviceInfo);
 230
 231     stat = cudaSetDevice(deviceInfo->id);
 232     if (stat != cudaSuccess)
 233     {
 234         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
 235         CU_RET_ERR(stat, message.c_str());
 236     }
 237
 238     if (debug)
 239     {
 240         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
 241     }
 242 }
 243
 244 void free_gpu(const DeviceInformation* deviceInfo)
 245 {
 246     // One should only attempt to clear the device context when
 247     // it has been used, but currently the only way to know that a GPU
 248     // device was used is that deviceInfo will be non-null.
 249     if (deviceInfo == nullptr)
 250     {
 251         return;
 252     }
 253
 254     cudaError_t stat;
 255
 256     if (debug)
 257     {
 258         int gpuid;
 259         stat = cudaGetDevice(&gpuid);
 260         CU_RET_ERR(stat, "cudaGetDevice failed");
 261         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 262     }
 263
 264     stat = cudaDeviceReset();
 265     if (stat != cudaSuccess)
 266     {
 267         gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
 268     }
 269 }
 270
 271 DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId)
 272 {
 273     if (deviceId < 0 || deviceId >= gpu_info.n_dev)
 274     {
 275         gmx_incons("Invalid GPU deviceId requested");
 276     }
 277     return &gpu_info.deviceInfo[deviceId];
 278 }
 279
 280 /*! \brief Returns true if the gpu characterized by the device properties is
 281  *  supported by the native gpu acceleration.
 282  *
 283  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 284  * \returns             true if the GPU properties passed indicate a compatible
 285  *                      GPU, otherwise false.
 286  */
 287 static bool is_gmx_supported_gpu(const cudaDeviceProp& dev_prop)
 288 {
 289     return (dev_prop.major >= 3);
 290 }
 291
 292 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 293  *
 294  *  Returns a status value which indicates compatibility or one of the following
 295  *  errors: incompatibility or insanity (=unexpected behavior).
 296  *
 297  *  As the error handling only permits returning the state of the GPU, this function
 298  *  does not clear the CUDA runtime API status allowing the caller to inspect the error
 299  *  upon return. Note that this also means it is the caller's responsibility to
 300  *  reset the CUDA runtime state.
 301  *
 302  *  \param[in]  deviceId   the ID of the GPU to check.
 303  *  \param[in]  deviceProp the CUDA device properties of the device checked.
 304  *  \returns               the status of the requested device
 305  */
 306 static int is_gmx_supported_gpu_id(int deviceId, const cudaDeviceProp& deviceProp)
 307 {
 308     if (!is_gmx_supported_gpu(deviceProp))
 309     {
 310         return egpuIncompatible;
 311     }
 312
 313     /* TODO: currently we do not make a distinction between the type of errors
 314      * that can appear during sanity checks. This needs to be improved, e.g if
 315      * the dummy test kernel fails to execute with a "device busy message" we
 316      * should appropriately report that the device is busy instead of insane.
 317      */
 318     if (do_sanity_checks(deviceId, deviceProp) != 0)
 319     {
 320         return egpuInsane;
 321     }
 322
 323     return egpuCompatible;
 324 }
 325
 326 bool isGpuDetectionFunctional(std::string* errorMessage)
 327 {
 328     cudaError_t stat;
 329     int         driverVersion = -1;
 330     stat                      = cudaDriverGetVersion(&driverVersion);
 331     GMX_ASSERT(stat != cudaErrorInvalidValue,
 332                "An impossible null pointer was passed to cudaDriverGetVersion");
 333     GMX_RELEASE_ASSERT(
 334             stat == cudaSuccess,
 335             gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
 336                               cudaGetErrorName(stat), cudaGetErrorString(stat))
 337                     .c_str());
 338     bool foundDriver = (driverVersion > 0);
 339     if (!foundDriver)
 340     {
 341         // Can't detect GPUs if there is no driver
 342         if (errorMessage != nullptr)
 343         {
 344             errorMessage->assign("No valid CUDA driver found");
 345         }
 346         return false;
 347     }
 348
 349     int numDevices;
 350     stat = cudaGetDeviceCount(&numDevices);
 351     if (stat != cudaSuccess)
 352     {
 353         if (errorMessage != nullptr)
 354         {
 355             /* cudaGetDeviceCount failed which means that there is
 356              * something wrong with the machine: driver-runtime
 357              * mismatch, all GPUs being busy in exclusive mode,
 358              * invalid CUDA_VISIBLE_DEVICES, or some other condition
 359              * which should result in GROMACS issuing at least a
 360              * warning. */
 361             errorMessage->assign(cudaGetErrorString(stat));
 362         }
 363
 364         // Consume the error now that we have prepared to handle
 365         // it. This stops it reappearing next time we check for
 366         // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
 367         // valid devices, then cudaGetLastError returns the
 368         // (undocumented) cudaErrorNoDevice, but this should not be a
 369         // problem as there should be no future CUDA API calls.
 370         // NVIDIA bug report #2038718 has been filed.
 371         cudaGetLastError();
 372         // Can't detect GPUs
 373         return false;
 374     }
 375
 376     // We don't actually use numDevices here, that's not the job of
 377     // this function.
 378     return true;
 379 }
 380
 381 void findGpus(gmx_gpu_info_t* gpu_info)
 382 {
 383     assert(gpu_info);
 384
 385     gpu_info->n_dev_compatible = 0;
 386
 387     int         ndev;
 388     cudaError_t stat = cudaGetDeviceCount(&ndev);
 389     if (stat != cudaSuccess)
 390     {
 391         GMX_THROW(gmx::InternalError(
 392                 "Invalid call of findGpus() when CUDA API returned an error, perhaps "
 393                 "canDetectGpus() was not called appropriately beforehand."));
 394     }
 395
 396     // We expect to start device support/sanity checks with a clean runtime error state
 397     gmx::ensureNoPendingCudaError("");
 398
 399     DeviceInformation* devs;
 400     snew(devs, ndev);
 401     for (int i = 0; i < ndev; i++)
 402     {
 403         cudaDeviceProp prop;
 404         memset(&prop, 0, sizeof(cudaDeviceProp));
 405         stat = cudaGetDeviceProperties(&prop, i);
 406         int checkResult;
 407         if (stat != cudaSuccess)
 408         {
 409             // Will handle the error reporting below
 410             checkResult = egpuInsane;
 411         }
 412         else
 413         {
 414             checkResult = is_gmx_supported_gpu_id(i, prop);
 415         }
 416
 417         devs[i].id   = i;
 418         devs[i].prop = prop;
 419         devs[i].stat = checkResult;
 420
 421         if (checkResult == egpuCompatible)
 422         {
 423             gpu_info->n_dev_compatible++;
 424         }
 425         else
 426         {
 427             // TODO:
 428             //  - we inspect the CUDA API state to retrieve and record any
 429             //    errors that occurred during is_gmx_supported_gpu_id() here,
 430             //    but this would be more elegant done within is_gmx_supported_gpu_id()
 431             //    and only return a string with the error if one was encountered.
 432             //  - we'll be reporting without rank information which is not ideal.
 433             //  - we'll end up warning also in cases where users would already
 434             //    get an error before mdrun aborts.
 435             //
 436             // Here we also clear the CUDA API error state so potential
 437             // errors during sanity checks don't propagate.
 438             if ((stat = cudaGetLastError()) != cudaSuccess)
 439             {
 440                 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
 441                             devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
 442             }
 443         }
 444     }
 445
 446     stat = cudaPeekAtLastError();
 447     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 448                        gmx::formatString("We promise to return with clean CUDA state, but "
 449                                          "non-success state encountered: %s: %s",
 450                                          cudaGetErrorName(stat), cudaGetErrorString(stat))
 451                                .c_str());
 452
 453     gpu_info->n_dev      = ndev;
 454     gpu_info->deviceInfo = devs;
 455 }
 456
 457 void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index)
 458 {
 459     assert(s);
 460
 461     if (index < 0 && index >= gpu_info.n_dev)
 462     {
 463         return;
 464     }
 465
 466     DeviceInformation* dinfo = &gpu_info.deviceInfo[index];
 467
 468     bool bGpuExists = (dinfo->stat != egpuNonexistent && dinfo->stat != egpuInsane);
 469
 470     if (!bGpuExists)
 471     {
 472         sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", gpu_detect_res_str[dinfo->stat]);
 473     }
 474     else
 475     {
 476         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s", dinfo->id,
 477                 dinfo->prop.name, dinfo->prop.major, dinfo->prop.minor,
 478                 dinfo->prop.ECCEnabled ? "yes" : " no", gpu_detect_res_str[dinfo->stat]);
 479     }
 480 }
 481
 482 int get_current_cuda_gpu_device_id(void)
 483 {
 484     int gpuid;
 485     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 486
 487     return gpuid;
 488 }
 489
 490 size_t sizeof_gpu_dev_info(void)
 491 {
 492     return sizeof(DeviceInformation);
 493 }
 494
 495 void startGpuProfiler(void)
 496 {
 497     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 498        mdrun is executed in the CUDA profiler.
 499        If nvprof was run is with "--profile-from-start off", the profiler will
 500        be started here. This way we can avoid tracing the CUDA events from the
 501        first part of the run. Starting the profiler again does nothing.
 502      */
 503     if (cudaProfilerRun)
 504     {
 505         cudaError_t stat;
 506         stat = cudaProfilerStart();
 507         CU_RET_ERR(stat, "cudaProfilerStart failed");
 508     }
 509 }
 510
 511 void stopGpuProfiler(void)
 512 {
 513     /* Stopping the nvidia here allows us to eliminate the subsequent
 514        API calls from the trace, e.g. uninitialization and cleanup. */
 515     if (cudaProfilerRun)
 516     {
 517         cudaError_t stat;
 518         stat = cudaProfilerStop();
 519         CU_RET_ERR(stat, "cudaProfilerStop failed");
 520     }
 521 }
 522
 523 void resetGpuProfiler(void)
 524 {
 525     /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
 526      *  the profiling here (can't stop it) which will achieve the desired effect if
 527      *  the run was started with the profiling disabled.
 528      *
 529      * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
 530      * stopGpuProfiler();
 531      */
 532     if (cudaProfilerRun)
 533     {
 534         startGpuProfiler();
 535     }
 536 }
 537
 538 int gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
 539 {
 540     return info.deviceInfo[index].stat;
 541 }
 542
 543 /*! \brief Check status returned from peer access CUDA call, and error out or warn appropriately
 544  * \param[in] stat           CUDA call return status
 545  * \param[in] gpuA           ID for GPU initiating peer access call
 546  * \param[in] gpuB           ID for remote GPU
 547  * \param[in] mdlog          Logger object
 548  * \param[in] cudaCallName   name of CUDA peer access call
 549  */
 550 static void peerAccessCheckStat(const cudaError_t    stat,
 551                                 const int            gpuA,
 552                                 const int            gpuB,
 553                                 const gmx::MDLogger& mdlog,
 554                                 const char*          cudaCallName)
 555 {
 556     if ((stat == cudaErrorInvalidDevice) || (stat == cudaErrorInvalidValue))
 557     {
 558         std::string errorString =
 559                 gmx::formatString("%s from GPU %d to GPU %d failed", cudaCallName, gpuA, gpuB);
 560         CU_RET_ERR(stat, errorString.c_str());
 561     }
 562     if (stat != cudaSuccess)
 563     {
 564         GMX_LOG(mdlog.warning)
 565                 .asParagraph()
 566                 .appendTextFormatted(
 567                         "GPU peer access not enabled between GPUs %d and %d due to unexpected "
 568                         "return value from %s: %s",
 569                         gpuA, gpuB, cudaCallName, cudaGetErrorString(stat));
 570     }
 571 }
 572
 573 void setupGpuDevicePeerAccess(const std::vector<int>& gpuIdsToUse, const gmx::MDLogger& mdlog)
 574 {
 575     cudaError_t stat;
 576
 577     // take a note of currently-set GPU
 578     int currentGpu;
 579     stat = cudaGetDevice(&currentGpu);
 580     CU_RET_ERR(stat, "cudaGetDevice in setupGpuDevicePeerAccess failed");
 581
 582     std::string message = gmx::formatString(
 583             "Note: Peer access enabled between the following GPU pairs in the node:\n ");
 584     bool peerAccessEnabled = false;
 585
 586     for (unsigned int i = 0; i < gpuIdsToUse.size(); i++)
 587     {
 588         int gpuA = gpuIdsToUse[i];
 589         stat     = cudaSetDevice(gpuA);
 590         if (stat != cudaSuccess)
 591         {
 592             GMX_LOG(mdlog.warning)
 593                     .asParagraph()
 594                     .appendTextFormatted(
 595                             "GPU peer access not enabled due to unexpected return value from "
 596                             "cudaSetDevice(%d): %s",
 597                             gpuA, cudaGetErrorString(stat));
 598             return;
 599         }
 600         for (unsigned int j = 0; j < gpuIdsToUse.size(); j++)
 601         {
 602             if (j != i)
 603             {
 604                 int gpuB          = gpuIdsToUse[j];
 605                 int canAccessPeer = 0;
 606                 stat              = cudaDeviceCanAccessPeer(&canAccessPeer, gpuA, gpuB);
 607                 peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceCanAccessPeer");
 608
 609                 if (canAccessPeer)
 610                 {
 611                     stat = cudaDeviceEnablePeerAccess(gpuB, 0);
 612                     peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceEnablePeerAccess");
 613
 614                     message           = gmx::formatString("%s%d->%d ", message.c_str(), gpuA, gpuB);
 615                     peerAccessEnabled = true;
 616                 }
 617             }
 618         }
 619     }
 620
 621     // re-set GPU to that originally set
 622     stat = cudaSetDevice(currentGpu);
 623     if (stat != cudaSuccess)
 624     {
 625         CU_RET_ERR(stat, "cudaSetDevice in setupGpuDevicePeerAccess failed");
 626         return;
 627     }
 628
 629     if (peerAccessEnabled)
 630     {
 631         GMX_LOG(mdlog.info).asParagraph().appendTextFormatted("%s", message.c_str());
 632     }
 633 }