2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2010-2018, The GROMACS development team.
5 * Copyright (c) 2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Define functions for detection and initialization for CUDA devices.
39 * \author Szilard Pall <pall.szilard@gmail.com>
44 #include "gpu_utils.h"
50 #include <cuda_profiler_api.h>
52 #include "gromacs/gpu_utils/cudautils.cuh"
53 #include "gromacs/gpu_utils/pmalloc_cuda.h"
54 #include "gromacs/hardware/gpu_hw_info.h"
55 #include "gromacs/utility/basedefinitions.h"
56 #include "gromacs/utility/cstringutil.h"
57 #include "gromacs/utility/exceptions.h"
58 #include "gromacs/utility/fatalerror.h"
59 #include "gromacs/utility/gmxassert.h"
60 #include "gromacs/utility/logger.h"
61 #include "gromacs/utility/programcontext.h"
62 #include "gromacs/utility/smalloc.h"
63 #include "gromacs/utility/snprintf.h"
64 #include "gromacs/utility/stringutil.h"
67 * Max number of devices supported by CUDA (for consistency checking).
69 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
71 static int cuda_max_device_count = 32;
73 static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr));
75 /** Dummy kernel used for sanity checking. */
76 static __global__ void k_dummy_test(void) {}
78 static void checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
80 cudaFuncAttributes attributes;
81 cudaError_t stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
83 if (cudaErrorInvalidDeviceFunction == stat)
86 "The %s binary does not include support for the CUDA architecture of a "
87 "detected GPU: %s, ID #%d (compute capability %d.%d). "
88 "By default, GROMACS supports all architectures of compute "
89 "capability >= 3.0, so your GPU "
90 "might be rare, or some architectures were disabled in the build. "
91 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
92 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. "
93 "To work around this error, use the CUDA_VISIBLE_DEVICES environment"
94 "variable to pass a list of GPUs that excludes the ID %d.",
95 gmx::getProgramContext().displayName(), deviceProp.name, deviceId,
96 deviceProp.major, deviceProp.minor, deviceId);
99 CU_RET_ERR(stat, "cudaFuncGetAttributes failed");
102 bool isHostMemoryPinned(const void* h_ptr)
104 cudaPointerAttributes memoryAttributes;
105 cudaError_t stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
110 case cudaSuccess: result = true; break;
112 case cudaErrorInvalidValue:
113 // If the buffer was not pinned, then it will not be recognized by CUDA at all
115 // Reset the last error status
119 default: CU_RET_ERR(stat, "Unexpected CUDA error");
125 * \brief Runs GPU sanity checks.
127 * Runs a series of checks to determine that the given GPU and underlying CUDA
128 * driver/runtime functions properly.
130 * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized
131 * \param[in] dev_prop The device properties structure
132 * \returns 0 if the device looks OK
134 * TODO: introduce errors codes and handle errors more smoothly.
136 static int do_sanity_checks(int dev_id, const cudaDeviceProp& dev_prop)
141 cu_err = cudaGetDeviceCount(&dev_count);
142 if (cu_err != cudaSuccess)
144 fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err));
148 /* no CUDA compatible device at all */
154 /* things might go horribly wrong if cudart is not compatible with the driver */
155 if (dev_count < 0 || dev_count > cuda_max_device_count)
160 if (dev_id == -1) /* device already selected let's not destroy the context */
162 cu_err = cudaGetDevice(&id);
163 if (cu_err != cudaSuccess)
165 fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err));
172 if (id > dev_count - 1) /* pfff there's no such device */
175 "The requested device with id %d does not seem to exist (device count=%d)\n",
181 /* both major & minor is 9999 if no CUDA capable devices are present */
182 if (dev_prop.major == 9999 && dev_prop.minor == 9999)
186 /* we don't care about emulation mode */
187 if (dev_prop.major == 0)
194 cu_err = cudaSetDevice(id);
195 if (cu_err != cudaSuccess)
197 fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id,
198 cudaGetErrorString(cu_err));
203 /* try to execute a dummy kernel */
204 checkCompiledTargetCompatibility(dev_id, dev_prop);
206 KernelLaunchConfig config;
207 config.blockSize[0] = 512;
208 const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
209 launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
210 if (cudaDeviceSynchronize() != cudaSuccess)
215 /* destroy context if we created one */
218 cu_err = cudaDeviceReset();
219 CU_RET_ERR(cu_err, "cudaDeviceReset failed");
225 void init_gpu(const DeviceInformation* deviceInfo)
231 stat = cudaSetDevice(deviceInfo->id);
232 if (stat != cudaSuccess)
234 auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
235 CU_RET_ERR(stat, message.c_str());
240 fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
244 void free_gpu(const DeviceInformation* deviceInfo)
246 // One should only attempt to clear the device context when
247 // it has been used, but currently the only way to know that a GPU
248 // device was used is that deviceInfo will be non-null.
249 if (deviceInfo == nullptr)
259 stat = cudaGetDevice(&gpuid);
260 CU_RET_ERR(stat, "cudaGetDevice failed");
261 fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
264 stat = cudaDeviceReset();
265 if (stat != cudaSuccess)
267 gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
271 DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId)
273 if (deviceId < 0 || deviceId >= gpu_info.n_dev)
275 gmx_incons("Invalid GPU deviceId requested");
277 return &gpu_info.deviceInfo[deviceId];
280 /*! \brief Returns true if the gpu characterized by the device properties is
281 * supported by the native gpu acceleration.
283 * \param[in] dev_prop the CUDA device properties of the gpus to test.
284 * \returns true if the GPU properties passed indicate a compatible
285 * GPU, otherwise false.
287 static bool is_gmx_supported_gpu(const cudaDeviceProp& dev_prop)
289 return (dev_prop.major >= 3);
292 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
294 * Returns a status value which indicates compatibility or one of the following
295 * errors: incompatibility or insanity (=unexpected behavior).
297 * As the error handling only permits returning the state of the GPU, this function
298 * does not clear the CUDA runtime API status allowing the caller to inspect the error
299 * upon return. Note that this also means it is the caller's responsibility to
300 * reset the CUDA runtime state.
302 * \param[in] deviceId the ID of the GPU to check.
303 * \param[in] deviceProp the CUDA device properties of the device checked.
304 * \returns the status of the requested device
306 static int is_gmx_supported_gpu_id(int deviceId, const cudaDeviceProp& deviceProp)
308 if (!is_gmx_supported_gpu(deviceProp))
310 return egpuIncompatible;
313 /* TODO: currently we do not make a distinction between the type of errors
314 * that can appear during sanity checks. This needs to be improved, e.g if
315 * the dummy test kernel fails to execute with a "device busy message" we
316 * should appropriately report that the device is busy instead of insane.
318 if (do_sanity_checks(deviceId, deviceProp) != 0)
323 return egpuCompatible;
326 bool isGpuDetectionFunctional(std::string* errorMessage)
329 int driverVersion = -1;
330 stat = cudaDriverGetVersion(&driverVersion);
331 GMX_ASSERT(stat != cudaErrorInvalidValue,
332 "An impossible null pointer was passed to cudaDriverGetVersion");
335 gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
336 cudaGetErrorName(stat), cudaGetErrorString(stat))
338 bool foundDriver = (driverVersion > 0);
341 // Can't detect GPUs if there is no driver
342 if (errorMessage != nullptr)
344 errorMessage->assign("No valid CUDA driver found");
350 stat = cudaGetDeviceCount(&numDevices);
351 if (stat != cudaSuccess)
353 if (errorMessage != nullptr)
355 /* cudaGetDeviceCount failed which means that there is
356 * something wrong with the machine: driver-runtime
357 * mismatch, all GPUs being busy in exclusive mode,
358 * invalid CUDA_VISIBLE_DEVICES, or some other condition
359 * which should result in GROMACS issuing at least a
361 errorMessage->assign(cudaGetErrorString(stat));
364 // Consume the error now that we have prepared to handle
365 // it. This stops it reappearing next time we check for
366 // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
367 // valid devices, then cudaGetLastError returns the
368 // (undocumented) cudaErrorNoDevice, but this should not be a
369 // problem as there should be no future CUDA API calls.
370 // NVIDIA bug report #2038718 has been filed.
376 // We don't actually use numDevices here, that's not the job of
381 void findGpus(gmx_gpu_info_t* gpu_info)
385 gpu_info->n_dev_compatible = 0;
388 cudaError_t stat = cudaGetDeviceCount(&ndev);
389 if (stat != cudaSuccess)
391 GMX_THROW(gmx::InternalError(
392 "Invalid call of findGpus() when CUDA API returned an error, perhaps "
393 "canDetectGpus() was not called appropriately beforehand."));
396 // We expect to start device support/sanity checks with a clean runtime error state
397 gmx::ensureNoPendingCudaError("");
399 DeviceInformation* devs;
401 for (int i = 0; i < ndev; i++)
404 memset(&prop, 0, sizeof(cudaDeviceProp));
405 stat = cudaGetDeviceProperties(&prop, i);
407 if (stat != cudaSuccess)
409 // Will handle the error reporting below
410 checkResult = egpuInsane;
414 checkResult = is_gmx_supported_gpu_id(i, prop);
419 devs[i].stat = checkResult;
421 if (checkResult == egpuCompatible)
423 gpu_info->n_dev_compatible++;
428 // - we inspect the CUDA API state to retrieve and record any
429 // errors that occurred during is_gmx_supported_gpu_id() here,
430 // but this would be more elegant done within is_gmx_supported_gpu_id()
431 // and only return a string with the error if one was encountered.
432 // - we'll be reporting without rank information which is not ideal.
433 // - we'll end up warning also in cases where users would already
434 // get an error before mdrun aborts.
436 // Here we also clear the CUDA API error state so potential
437 // errors during sanity checks don't propagate.
438 if ((stat = cudaGetLastError()) != cudaSuccess)
440 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
441 devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
446 stat = cudaPeekAtLastError();
447 GMX_RELEASE_ASSERT(stat == cudaSuccess,
448 gmx::formatString("We promise to return with clean CUDA state, but "
449 "non-success state encountered: %s: %s",
450 cudaGetErrorName(stat), cudaGetErrorString(stat))
453 gpu_info->n_dev = ndev;
454 gpu_info->deviceInfo = devs;
457 void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index)
461 if (index < 0 && index >= gpu_info.n_dev)
466 DeviceInformation* dinfo = &gpu_info.deviceInfo[index];
468 bool bGpuExists = (dinfo->stat != egpuNonexistent && dinfo->stat != egpuInsane);
472 sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", gpu_detect_res_str[dinfo->stat]);
476 sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s", dinfo->id,
477 dinfo->prop.name, dinfo->prop.major, dinfo->prop.minor,
478 dinfo->prop.ECCEnabled ? "yes" : " no", gpu_detect_res_str[dinfo->stat]);
482 int get_current_cuda_gpu_device_id(void)
485 CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
490 size_t sizeof_gpu_dev_info(void)
492 return sizeof(DeviceInformation);
495 void startGpuProfiler(void)
497 /* The NVPROF_ID environment variable is set by nvprof and indicates that
498 mdrun is executed in the CUDA profiler.
499 If nvprof was run is with "--profile-from-start off", the profiler will
500 be started here. This way we can avoid tracing the CUDA events from the
501 first part of the run. Starting the profiler again does nothing.
506 stat = cudaProfilerStart();
507 CU_RET_ERR(stat, "cudaProfilerStart failed");
511 void stopGpuProfiler(void)
513 /* Stopping the nvidia here allows us to eliminate the subsequent
514 API calls from the trace, e.g. uninitialization and cleanup. */
518 stat = cudaProfilerStop();
519 CU_RET_ERR(stat, "cudaProfilerStop failed");
523 void resetGpuProfiler(void)
525 /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
526 * the profiling here (can't stop it) which will achieve the desired effect if
527 * the run was started with the profiling disabled.
529 * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
538 int gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
540 return info.deviceInfo[index].stat;
543 /*! \brief Check status returned from peer access CUDA call, and error out or warn appropriately
544 * \param[in] stat CUDA call return status
545 * \param[in] gpuA ID for GPU initiating peer access call
546 * \param[in] gpuB ID for remote GPU
547 * \param[in] mdlog Logger object
548 * \param[in] cudaCallName name of CUDA peer access call
550 static void peerAccessCheckStat(const cudaError_t stat,
553 const gmx::MDLogger& mdlog,
554 const char* cudaCallName)
556 if ((stat == cudaErrorInvalidDevice) || (stat == cudaErrorInvalidValue))
558 std::string errorString =
559 gmx::formatString("%s from GPU %d to GPU %d failed", cudaCallName, gpuA, gpuB);
560 CU_RET_ERR(stat, errorString.c_str());
562 if (stat != cudaSuccess)
564 GMX_LOG(mdlog.warning)
566 .appendTextFormatted(
567 "GPU peer access not enabled between GPUs %d and %d due to unexpected "
568 "return value from %s: %s",
569 gpuA, gpuB, cudaCallName, cudaGetErrorString(stat));
573 void setupGpuDevicePeerAccess(const std::vector<int>& gpuIdsToUse, const gmx::MDLogger& mdlog)
577 // take a note of currently-set GPU
579 stat = cudaGetDevice(¤tGpu);
580 CU_RET_ERR(stat, "cudaGetDevice in setupGpuDevicePeerAccess failed");
582 std::string message = gmx::formatString(
583 "Note: Peer access enabled between the following GPU pairs in the node:\n ");
584 bool peerAccessEnabled = false;
586 for (unsigned int i = 0; i < gpuIdsToUse.size(); i++)
588 int gpuA = gpuIdsToUse[i];
589 stat = cudaSetDevice(gpuA);
590 if (stat != cudaSuccess)
592 GMX_LOG(mdlog.warning)
594 .appendTextFormatted(
595 "GPU peer access not enabled due to unexpected return value from "
596 "cudaSetDevice(%d): %s",
597 gpuA, cudaGetErrorString(stat));
600 for (unsigned int j = 0; j < gpuIdsToUse.size(); j++)
604 int gpuB = gpuIdsToUse[j];
605 int canAccessPeer = 0;
606 stat = cudaDeviceCanAccessPeer(&canAccessPeer, gpuA, gpuB);
607 peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceCanAccessPeer");
611 stat = cudaDeviceEnablePeerAccess(gpuB, 0);
612 peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceEnablePeerAccess");
614 message = gmx::formatString("%s%d->%d ", message.c_str(), gpuA, gpuB);
615 peerAccessEnabled = true;
621 // re-set GPU to that originally set
622 stat = cudaSetDevice(currentGpu);
623 if (stat != cudaSuccess)
625 CU_RET_ERR(stat, "cudaSetDevice in setupGpuDevicePeerAccess failed");
629 if (peerAccessEnabled)
631 GMX_LOG(mdlog.info).asParagraph().appendTextFormatted("%s", message.c_str());