/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define functions for detection and initialization of CUDA devices.
 *
 *  \author Szilard Pall <pall.szilard@gmail.com>
 */
#include "gmxpre.h"

#include "gpu_utils.h"

#include "config.h"

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cuda_profiler_api.h>

#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/pmalloc_cuda.h"
#include "gromacs/hardware/gpu_hw_info.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/smalloc.h"
#if HAVE_NVML
#include <nvml.h>
#define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
#else  /* HAVE_NVML */
#define HAVE_NVML_APPLICATION_CLOCKS 0
#endif /* HAVE_NVML */
#if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
/*! Check for NVML error on the return status of an NVML API call. */
#  define HANDLE_NVML_RET_ERR(status, msg) \
    do { \
        if (status != NVML_SUCCESS) \
        { \
            gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
        } \
    } while (0)
#else /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
#  define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
#endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
#if HAVE_NVML_APPLICATION_CLOCKS
static const gmx_bool            bCompiledWithApplicationClockSupport = true;
#else
static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
#endif
/*! \internal \brief
 * Max number of devices supported by CUDA (for consistency checking).
 *
 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
 */
static int  cuda_max_device_count = 32;
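
/* nvprof sets the NVPROF_ID environment variable in the process it profiles,
 * so its presence is used here to detect whether mdrun is running under the
 * CUDA profiler (see startGpuProfiler/stopGpuProfiler below). */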
static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != NULL));
/** Dummy kernel used for sanity checking. */
__global__ void k_dummy_test()
{
}
/*!
 * \brief Runs GPU sanity checks.
 *
 * Runs a series of checks to determine that the given GPU and underlying CUDA
 * driver/runtime functions properly.
 * Returns properties of a device with the given ID or of the one that has
 * already been initialized earlier in the case of \c dev_id == -1.
 *
 * \param[in]  dev_id   the device ID of the GPU or -1 if the device has already been initialized
 * \param[out] dev_prop pointer to the structure in which the device properties will be returned
 * \returns             0 if the device looks OK
 *
 * TODO: introduce error codes and handle errors more smoothly.
 */
static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
{
    cudaError_t cu_err;
    int         dev_count, id;

    cu_err = cudaGetDeviceCount(&dev_count);
    if (cu_err != cudaSuccess)
    {
        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
                cudaGetErrorString(cu_err));
        return -1;
    }

    /* no CUDA compatible device at all */
    if (dev_count == 0)
    {
        return -1;
    }

    /* things might go horribly wrong if cudart is not compatible with the driver */
    if (dev_count < 0 || dev_count > cuda_max_device_count)
    {
        return -1;
    }
    if (dev_id == -1) /* device was already selected; let's not destroy the context */
    {
        cu_err = cudaGetDevice(&id);
        if (cu_err != cudaSuccess)
        {
            fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
                    cudaGetErrorString(cu_err));
            return -1;
        }
    }
    else
    {
        id = dev_id;
        if (id > dev_count - 1) /* no such device */
        {
            fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
                    dev_id, dev_count);
            return -1;
        }
    }
    memset(dev_prop, 0, sizeof(cudaDeviceProp));
    cu_err = cudaGetDeviceProperties(dev_prop, id);
    if (cu_err != cudaSuccess)
    {
        fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
                cudaGetErrorString(cu_err));
        return -1;
    }

    /* both major & minor will be 9999 if no CUDA capable devices are present */
    if (dev_prop->major == 9999 && dev_prop->minor == 9999)
    {
        return -1;
    }
    /* we don't care about emulation mode */
    if (dev_prop->major == 0)
    {
        return -1;
    }

    if (dev_id != -1)
    {
        cu_err = cudaSetDevice(id);
        if (cu_err != cudaSuccess)
        {
            fprintf(stderr, "Error %d while switching to device #%d: %s\n",
                    cu_err, id, cudaGetErrorString(cu_err));
            return -1;
        }
    }
    /* try to execute a dummy kernel */
    k_dummy_test<<< 1, 512>>> ();
    if (cudaDeviceSynchronize() != cudaSuccess)
    {
        return -1;
    }

    /* destroy context if we created one */
    if (dev_id != -1)
    {
        cu_err = cudaDeviceReset();
        CU_RET_ERR(cu_err, "cudaDeviceReset failed");
    }

    return 0;
}
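
/* Illustrative (hypothetical) caller of the sanity check above:
 *
 *     cudaDeviceProp prop;
 *     if (do_sanity_checks(0, &prop) == 0)
 *     {
 *         fprintf(stderr, "GPU #0 (%s) passed the sanity checks\n", prop.name);
 *     }
 *
 * A return value of 0 means that both the device property query and the dummy
 * kernel launch succeeded on the requested device. */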
#if HAVE_NVML_APPLICATION_CLOCKS
/*! \brief Determines and adds the NVML device ID to the passed \c cuda_dev.
 *
 * This is done by matching PCI-E information from \c cuda_dev with the
 * available NVML devices.
 *
 * \param[in,out] cuda_dev  CUDA device information to enrich with NVML device info
 * \returns                 true if \c cuda_dev could be enriched with matching NVML device information.
 */
static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
{
    nvmlDevice_t nvml_device_id;
    unsigned int nvml_device_count  = 0;
    nvmlReturn_t nvml_stat          = nvmlDeviceGetCount(&nvml_device_count);
    bool         nvmlWasInitialized = false;
    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetCount failed");
    for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
    {
        nvml_stat = nvmlDeviceGetHandleByIndex(nvml_device_idx, &nvml_device_id);
        HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetHandleByIndex failed");
        if (nvml_stat != NVML_SUCCESS)
        {
            break;
        }

        nvmlPciInfo_t nvml_pci_info;
        nvml_stat = nvmlDeviceGetPciInfo(nvml_device_id, &nvml_pci_info);
        HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetPciInfo failed");
        if (nvml_stat != NVML_SUCCESS)
        {
            break;
        }
        if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
            static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
            static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
        {
            nvmlWasInitialized       = true;
            cuda_dev->nvml_device_id = nvml_device_id;
            break;
        }
    }
    return nvmlWasInitialized;
}
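
/* Note: matching by PCI domain/bus/device is needed because CUDA and NVML can
 * enumerate the same physical devices in different orders, so a CUDA device
 * index cannot simply be reused as an NVML device index. */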
/*! \brief Reads and returns the application clocks for the passed device.
 *
 * \param[in]  cuda_dev      The GPU device
 * \param[out] app_sm_clock  The current application SM clock
 * \param[out] app_mem_clock The current application memory clock
 * \returns                  true if application clocks are supported
 */
static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
                                 unsigned int            *app_sm_clock,
                                 unsigned int            *app_mem_clock)
{
    nvmlReturn_t nvml_stat;

    nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
    if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
    {
        return false;
    }
    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");
    nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed");

    return true;
}
#endif /* HAVE_NVML_APPLICATION_CLOCKS */
/*! \brief Tries to set application clocks for the GPU with the given index.
 *
 * The variable \c gpuid is the index of the GPU in the gpu_info.gpu_dev array
 * to handle the application clocks for. Application clocks are set to the
 * max supported value to increase performance if application clock permissions
 * allow this. For future GPU architectures a more sophisticated scheme might be
 * required.
 *
 * \todo Refactor this into a detection phase and a work phase. Also
 *       refactor to remove compile-time dependence on logging header.
 *
 * \param     mdlog    log file to write to
 * \param[in] gpuid    index of the GPU to set application clocks for
 * \param[in] gpu_info GPU info of all detected devices in the system.
 * \returns            true if no error occurs during application clocks handling.
 */
static gmx_bool init_gpu_application_clocks(
        const gmx::MDLogger &mdlog, int gmx_unused gpuid,
        const gmx_gpu_info_t gmx_unused *gpu_info)
{
    const cudaDeviceProp *prop               = &gpu_info->gpu_dev[gpuid].prop;
    int                   compute_capability = prop->major * 10 + prop->minor;
    gmx_bool              bGpuCanUseApplicationClocks =
        ((0 == gmx_wcmatch("*Tesla*", prop->name) && compute_capability >= 35) ||
         (0 == gmx_wcmatch("*Quadro*", prop->name) && compute_capability >= 52));
    if (!bGpuCanUseApplicationClocks)
    {
        return true;
    }
#if !HAVE_NVML
    GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
            "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
            "      application clocks of the detected %s GPU to improve performance.\n"
            "      Recompile with the NVML library (compatible with the driver used) or set application clocks manually.",
            prop->name);
    return true;
#else
    if (!bCompiledWithApplicationClockSupport)
    {
        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
                "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
                "      managing application clocks of the detected %s GPU to improve performance.\n"
                "      If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.",
                prop->name);
        return true;
    }

    /* We've compiled with NVML application clocks support, and have a GPU that can use it */
    nvmlReturn_t nvml_stat = NVML_SUCCESS;
    char        *env;
    //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
    //      this variable can be later used to give a user more fine grained control.
    env = getenv("GMX_GPU_APPLICATION_CLOCKS");
    if (env != NULL && (strcmp(env, "0") == 0 ||
                        gmx_strcasecmp(env, "OFF") == 0 ||
                        gmx_strcasecmp(env, "DISABLE") == 0))
    {
        return true;
    }
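
    /* From here on NVML is actually used; the nvmlInit() call below has to be
     * balanced by the nvmlShutdown() call in reset_gpu_application_clocks(). */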
    nvml_stat = nvmlInit();
    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlInit failed.");
    if (nvml_stat != NVML_SUCCESS)
    {
        return false;
    }

    gmx_device_info_t *cuda_dev = &(gpu_info->gpu_dev[gpuid]);

    if (!addNVMLDeviceId(cuda_dev))
    {
        return false;
    }
    //get current application clocks setting
    if (!getApplicationClocks(cuda_dev,
                              &cuda_dev->nvml_orig_app_sm_clock,
                              &cuda_dev->nvml_orig_app_mem_clock))
    {
        return false;
    }
    //get max application clocks
    unsigned int max_sm_clock  = 0;
    unsigned int max_mem_clock = 0;
    nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetMaxClockInfo failed");
    nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetMaxClockInfo failed");

    cuda_dev->nvml_is_restricted      = NVML_FEATURE_ENABLED;
    cuda_dev->nvml_app_clocks_changed = false;

    nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetAPIRestriction failed");
    if (nvml_stat != NVML_SUCCESS)
    {
        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
                "Cannot change GPU application clocks to optimal values due to NVML error (%d): %s.",
                nvml_stat, nvmlErrorString(nvml_stat));
        return false;
    }

    if (cuda_dev->nvml_is_restricted != NVML_FEATURE_DISABLED)
    {
        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
                "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.",
                cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
        return true;
    }

    if (cuda_dev->nvml_orig_app_sm_clock >= max_sm_clock)
    {
        //TODO: This should probably be integrated into the GPU Properties table.
        GMX_LOG(mdlog.warning).appendTextFormatted(
                "Application clocks (GPU clocks) for %s are (%d,%d)",
                cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
        return true;
    }

    /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
       e.g. if max application clocks should not be used for certain GPUs. */
    GMX_LOG(mdlog.warning).appendTextFormatted(
            "Changing GPU application clocks for %s to (%d,%d)",
            cuda_dev->prop.name, max_mem_clock, max_sm_clock);
    nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceSetApplicationsClocks failed");
    cuda_dev->nvml_app_clocks_changed = true;
    cuda_dev->nvml_set_app_sm_clock   = max_sm_clock;
    cuda_dev->nvml_set_app_mem_clock  = max_mem_clock;

    return true;
#endif /* HAVE_NVML */
}
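
/* Life cycle of the application-clock handling, for orientation:
 * init_gpu() -> init_gpu_application_clocks() raises the clocks if permitted,
 * and free_cuda_gpu() -> reset_gpu_application_clocks() restores the original
 * values, but only if the clocks still hold the values set above. */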
/*! \brief Resets application clocks if changed and cleans up NVML for the passed \c cuda_dev.
 *
 * \param[in] cuda_dev  CUDA device information
 */
static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
{
#if !HAVE_NVML_APPLICATION_CLOCKS
    GMX_UNUSED_VALUE(cuda_dev);
    return true;
#else /* HAVE_NVML_APPLICATION_CLOCKS */
    nvmlReturn_t nvml_stat = NVML_SUCCESS;
    if (cuda_dev &&
        cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
        cuda_dev->nvml_app_clocks_changed)
    {
        /* Check if the clocks are still what we set them to.
         * If so, set them back to the state we originally found them in.
         * If not, don't touch them, because something else set them later.
         */
        unsigned int app_sm_clock, app_mem_clock;
        getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
        if (app_sm_clock == cuda_dev->nvml_set_app_sm_clock &&
            app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
        {
            nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
            HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceSetApplicationsClocks failed");
        }
    }
    nvml_stat = nvmlShutdown();
    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlShutdown failed");
    return (nvml_stat == NVML_SUCCESS);
#endif /* HAVE_NVML_APPLICATION_CLOCKS */
}
gmx_bool init_gpu(const gmx::MDLogger &mdlog, int mygpu, char *result_str,
                  const struct gmx_gpu_info_t *gpu_info,
                  const struct gmx_gpu_opt_t *gpu_opt)
{
    cudaError_t stat;
    char        sbuf[STRLEN];
    int         gpuid;

    assert(gpu_info);
    assert(result_str);

    if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
    {
        sprintf(sbuf, "Trying to initialize a non-existent GPU: "
                "there are %d %s-selected GPU(s), but #%d was requested.",
                gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
        gmx_incons(sbuf);
    }

    gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;

    stat = cudaSetDevice(gpuid);
    strncpy(result_str, cudaGetErrorString(stat), STRLEN);

    if (debug)
    {
        fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name);
    }

    //Ignoring return value as NVML errors should not be treated as critical.
    if (stat == cudaSuccess)
    {
        init_gpu_application_clocks(mdlog, gpuid, gpu_info);
    }
    return (stat == cudaSuccess);
}
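
/* Illustrative (hypothetical) call sequence for the public GPU API in this
 * file, assuming gmx_gpu_info_t and gmx_gpu_opt_t structures set up by the
 * caller:
 *
 *     char err_str[STRLEN], result_str[STRLEN];
 *     if (detect_gpus(&gpu_info, err_str) == 0 &&
 *         init_gpu(mdlog, 0, result_str, &gpu_info, &gpu_opt))
 *     {
 *         // ... run GPU work on the selected device ...
 *         free_cuda_gpu(0, result_str, &gpu_info, &gpu_opt);
 *     }
 *     free_gpu_info(&gpu_info);
 */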
gmx_bool free_cuda_gpu(
        int gmx_unused mygpu, char *result_str,
        const gmx_gpu_info_t gmx_unused *gpu_info,
        const gmx_gpu_opt_t gmx_unused *gpu_opt
        )
{
    cudaError_t stat;
    gmx_bool    reset_gpu_application_clocks_status = true;
    int         gpuid;

    assert(result_str);

    if (debug)
    {
        stat = cudaGetDevice(&gpuid);
        CU_RET_ERR(stat, "cudaGetDevice failed");
        fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
    }

    gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1;
    if (gpuid != -1)
    {
        reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) );
    }

    stat = cudaDeviceReset();
    strncpy(result_str, cudaGetErrorString(stat), STRLEN);
    return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
}
/*! \brief Returns true if the GPU characterized by the device properties is
 * supported by the native GPU acceleration.
 *
 * \param[in] dev_prop  the CUDA device properties of the GPU to test.
 * \returns             true if the GPU properties passed indicate a compatible
 *                      GPU, otherwise false.
 */
static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
{
    return (dev_prop->major >= 2);
}
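
/* Compute capability 2.0 (Fermi) is the minimum the native GROMACS CUDA
 * kernels require; older (1.x) devices are reported as incompatible by the
 * check below rather than as insane. */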
/*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 *
 *  Returns a status value which indicates compatibility or one of the following
 *  errors: incompatibility, nonexistence, or insanity (=unexpected behavior).
 *  It also returns the respective device's properties in \c dev_prop (if applicable).
 *
 *  \param[in]  dev_id   the ID of the GPU to check.
 *  \param[out] dev_prop the CUDA device properties of the device checked.
 *  \returns             the status of the requested device
 */
static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
{
    cudaError_t stat;
    int         ndev;

    stat = cudaGetDeviceCount(&ndev);
    if (stat != cudaSuccess)
    {
        return egpuInsane;
    }

    if (dev_id > ndev - 1)
    {
        return egpuNonexistent;
    }

    /* TODO: currently we do not make a distinction between the type of errors
     * that can appear during sanity checks. This needs to be improved, e.g. if
     * the dummy test kernel fails to execute with a "device busy message" we
     * should appropriately report that the device is busy instead of insane.
     */
    if (do_sanity_checks(dev_id, dev_prop) == 0)
    {
        if (is_gmx_supported_gpu(dev_prop))
        {
            return egpuCompatible;
        }
        else
        {
            return egpuIncompatible;
        }
    }
    else
    {
        return egpuInsane;
    }
}
int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
{
    int                i, ndev, checkres, retval;
    cudaError_t        stat;
    cudaDeviceProp     prop;
    gmx_device_info_t *devs;

    assert(gpu_info);
    assert(err_str);

    gpu_info->n_dev_compatible = 0;

    ndev = 0;
    devs = NULL;

    stat = cudaGetDeviceCount(&ndev);
    if (stat != cudaSuccess)
    {
        const char *s;

        /* cudaGetDeviceCount failed which means that there is something
         * wrong with the machine: driver-runtime mismatch, all GPUs being
         * busy in exclusive mode, or some other condition which should
         * result in us issuing a warning and falling back to CPUs. */
        retval = -1;
        s      = cudaGetErrorString(stat);
        strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
        goto done;
    }

    snew(devs, ndev);
    for (i = 0; i < ndev; i++)
    {
        checkres = is_gmx_supported_gpu_id(i, &prop);

        devs[i].id   = i;
        devs[i].prop = prop;
        devs[i].stat = checkres;

        if (checkres == egpuCompatible)
        {
            gpu_info->n_dev_compatible++;
        }
    }
    retval = 0;

done:
    gpu_info->n_dev   = ndev;
    gpu_info->gpu_dev = devs;

    return retval;
}
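
/* Note: err_str is only written on failure and the caller is expected to pass
 * a buffer of at least STRLEN characters; detect_gpus() returns 0 on success
 * and -1 if the CUDA device count query itself failed. */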
int getGpuCompatibilityStatus(const gmx_gpu_info_t *gpu_info,
                              int                   index)
{
    assert(gpu_info);

    return (index >= gpu_info->n_dev) ? egpuNonexistent : gpu_info->gpu_dev[index].stat;
}
void free_gpu_info(const gmx_gpu_info_t *gpu_info)
{
    if (gpu_info == NULL)
    {
        return;
    }

    sfree(gpu_info->gpu_dev);
}
void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
{
    assert(s);
    assert(gpu_info);

    if (index < 0 || index >= gpu_info->n_dev)
    {
        return;
    }

    gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];

    bool               bGpuExists =
        dinfo->stat == egpuCompatible ||
        dinfo->stat == egpuIncompatible;

    if (!bGpuExists)
    {
        sprintf(s, "#%d: %s, stat: %s",
                dinfo->id, "N/A",
                gpu_detect_res_str[dinfo->stat]);
    }
    else
    {
        sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
                dinfo->id, dinfo->prop.name,
                dinfo->prop.major, dinfo->prop.minor,
                dinfo->prop.ECCEnabled ? "yes" : " no",
                gpu_detect_res_str[dinfo->stat]);
    }
}
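
/* Example of the string produced above for a compatible device (with
 * illustrative values): "#0: NVIDIA Tesla K40c, compute cap.: 3.5, ECC: yes,
 * stat: compatible". */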
int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
                      const gmx_gpu_opt_t  *gpu_opt,
                      int                   idx)
{
    assert(gpu_info);
    assert(gpu_opt);
    assert(idx >= 0 && idx < gpu_opt->n_dev_use);

    return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
}
int get_current_cuda_gpu_device_id(void)
{
    int gpuid;
    CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");

    return gpuid;
}
size_t sizeof_gpu_dev_info(void)
{
    return sizeof(gmx_device_info_t);
}
void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
                                  gmx_host_alloc_t **nb_alloc,
                                  gmx_host_free_t  **nb_free)
{
    if (bUseGpuKernels)
    {
        *nb_alloc = &pmalloc;
        *nb_free  = &pfree;
    }
    else
    {
        *nb_alloc = NULL;
        *nb_free  = NULL;
    }
}
void startGpuProfiler(void)
{
    /* The NVPROF_ID environment variable is set by nvprof and indicates that
       mdrun is executed in the CUDA profiler.
       If nvprof was run with "--profile-from-start off", the profiler will
       be started here. This way we can avoid tracing the CUDA events from the
       first part of the run. Starting the profiler again does nothing.
     */
    if (cudaProfilerRun)
    {
        cudaError_t stat;
        stat = cudaProfilerStart();
        CU_RET_ERR(stat, "cudaProfilerStart failed");
    }
}
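
/* Illustrative nvprof invocation that relies on the start/stop calls in this
 * file to limit tracing to the production part of the run:
 *
 *     nvprof --profile-from-start off gmx mdrun ...
 */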
void stopGpuProfiler(void)
{
    /* Stopping the NVIDIA profiler here allows us to eliminate the subsequent
       API calls from the trace, e.g. uninitialization and cleanup. */
    if (cudaProfilerRun)
    {
        cudaError_t stat;
        stat = cudaProfilerStop();
        CU_RET_ERR(stat, "cudaProfilerStop failed");
    }
}
void resetGpuProfiler(void)
{
    /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
     * the profiling here (can't stop it) which will achieve the desired effect if
     * the run was started with the profiling disabled.
     *
     * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
     */
    if (cudaProfilerRun)
    {
        startGpuProfiler();
    }
}