2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Defines the CUDA implementations of the device management.
39 * \author Anca Hamuraru <anca@streamcomputing.eu>
40 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41 * \author Teemu Virolainen <teemu@streamcomputing.eu>
42 * \author Mark Abraham <mark.j.abraham@gmail.com>
43 * \author Szilárd Páll <pall.szilard@gmail.com>
44 * \author Artem Zhmurov <zhmurov@gmail.com>
46 * \ingroup module_hardware
50 #include "device_management.h"
54 #include "gromacs/gpu_utils/cudautils.cuh"
55 #include "gromacs/gpu_utils/device_context.h"
56 #include "gromacs/gpu_utils/device_stream.h"
57 #include "gromacs/utility/exceptions.h"
58 #include "gromacs/utility/programcontext.h"
59 #include "gromacs/utility/smalloc.h"
60 #include "gromacs/utility/stringutil.h"
62 #include "device_information.h"
65 * Max number of devices supported by CUDA (for consistency checking).
67 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
69 static int c_cudaMaxDeviceCount = 32;
71 /** Dummy kernel used for sanity checking. */
72 static __global__ void dummy_kernel(void) {}
74 static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
76 cudaFuncAttributes attributes;
77 cudaError_t stat = cudaFuncGetAttributes(&attributes, dummy_kernel);
79 if (cudaErrorInvalidDeviceFunction == stat)
82 "\nWARNING: The %s binary does not include support for the CUDA architecture of "
83 "the GPU ID #%d (compute capability %d.%d) detected during detection. "
84 "By default, GROMACS supports all architectures of compute "
85 "capability >= 3.0, so your GPU "
86 "might be rare, or some architectures were disabled in the build. \n"
87 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
88 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
89 gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor);
96 * \brief Runs GPU sanity checks.
98 * Runs a series of checks to determine that the given GPU and underlying CUDA
99 * driver/runtime functions properly.
101 * \todo Currently we do not make a distinction between the type of errors
102 * that can appear during functionality checks. This needs to be improved,
103 * e.g if the dummy test kernel fails to execute with a "device busy message"
104 * we should appropriately report that the device is busy instead of NonFunctional.
106 * \todo Introduce errors codes and handle errors more smoothly.
109 * \param[in] deviceInfo Device information on the device to check.
110 * \returns The status enumeration value for the checked device:
112 static DeviceStatus isDeviceFunctional(const DeviceInformation& deviceInfo)
116 /* both major & minor is 9999 if no CUDA capable devices are present */
117 if (deviceInfo.prop.major == 9999 && deviceInfo.prop.minor == 9999)
119 return DeviceStatus::NonFunctional;
121 /* we don't care about emulation mode */
122 if (deviceInfo.prop.major == 0)
124 return DeviceStatus::NonFunctional;
127 cu_err = cudaSetDevice(deviceInfo.id);
128 if (cu_err != cudaSuccess)
130 fprintf(stderr, "Error while switching to device #%d. %s\n", deviceInfo.id,
131 gmx::getDeviceErrorString(cu_err).c_str());
132 return DeviceStatus::NonFunctional;
135 cu_err = checkCompiledTargetCompatibility(deviceInfo.id, deviceInfo.prop);
136 // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
137 // it is enough to check for cudaErrorDevicesUnavailable only here because
138 // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
139 if (cu_err == cudaErrorDevicesUnavailable)
141 return DeviceStatus::Unavailable;
143 else if (cu_err != cudaSuccess)
145 return DeviceStatus::NonFunctional;
148 /* try to execute a dummy kernel */
151 KernelLaunchConfig config;
152 config.blockSize[0] = 512;
153 const auto dummyArguments = prepareGpuKernelArguments(dummy_kernel, config);
154 const DeviceContext deviceContext(deviceInfo);
155 const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
156 launchGpuKernel(dummy_kernel, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
158 catch (gmx::GromacsException& ex)
160 // launchGpuKernel error is not fatal and should continue with marking the device bad
162 "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n",
163 deviceInfo.id, formatExceptionMessageToString(ex).c_str());
164 return DeviceStatus::NonFunctional;
167 if (cudaDeviceSynchronize() != cudaSuccess)
169 return DeviceStatus::NonFunctional;
172 cu_err = cudaDeviceReset();
173 CU_RET_ERR(cu_err, "cudaDeviceReset failed");
175 return DeviceStatus::Compatible;
178 /*! \brief Returns true if the gpu characterized by the device properties is supported
179 * by the native gpu acceleration.
181 * \param[in] deviceProperties The CUDA device properties of the gpus to test.
182 * \returns True if the GPU properties passed indicate a compatible
183 * GPU, otherwise false.
185 static bool isDeviceGenerationSupported(const cudaDeviceProp& deviceProperties)
187 return (deviceProperties.major >= 3);
190 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
192 * Returns a status value which indicates compatibility or one of the following
193 * errors: incompatibility or insanity (=unexpected behavior).
195 * As the error handling only permits returning the state of the GPU, this function
196 * does not clear the CUDA runtime API status allowing the caller to inspect the error
197 * upon return. Note that this also means it is the caller's responsibility to
198 * reset the CUDA runtime state.
200 * \param[in] deviceInfo The device information on the device to check.
201 * \returns the status of the requested device
203 static DeviceStatus checkDeviceStatus(const DeviceInformation& deviceInfo)
205 if (!isDeviceGenerationSupported(deviceInfo.prop))
207 return DeviceStatus::Incompatible;
209 return isDeviceFunctional(deviceInfo);
212 bool isDeviceDetectionFunctional(std::string* errorMessage)
215 int driverVersion = -1;
216 stat = cudaDriverGetVersion(&driverVersion);
217 GMX_ASSERT(stat != cudaErrorInvalidValue,
218 "An impossible null pointer was passed to cudaDriverGetVersion");
219 GMX_RELEASE_ASSERT(stat == cudaSuccess,
220 ("An unexpected value was returned from cudaDriverGetVersion. "
221 + gmx::getDeviceErrorString(stat))
223 bool foundDriver = (driverVersion > 0);
226 // Can't detect GPUs if there is no driver
227 if (errorMessage != nullptr)
229 errorMessage->assign("No valid CUDA driver found");
235 stat = cudaGetDeviceCount(&numDevices);
236 if (stat != cudaSuccess)
238 if (errorMessage != nullptr)
240 /* cudaGetDeviceCount failed which means that there is
241 * something wrong with the machine: driver-runtime
242 * mismatch, all GPUs being busy in exclusive mode,
243 * invalid CUDA_VISIBLE_DEVICES, or some other condition
244 * which should result in GROMACS issuing at least a
246 errorMessage->assign(cudaGetErrorString(stat));
249 // Consume the error now that we have prepared to handle
250 // it. This stops it reappearing next time we check for
251 // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
252 // valid devices, then cudaGetLastError returns the
253 // (undocumented) cudaErrorNoDevice, but this should not be a
254 // problem as there should be no future CUDA API calls.
255 // NVIDIA bug report #2038718 has been filed.
261 // We don't actually use numDevices here, that's not the job of
266 std::vector<std::unique_ptr<DeviceInformation>> findDevices()
269 cudaError_t stat = cudaGetDeviceCount(&numDevices);
270 gmx::checkDeviceError(stat,
271 "Invalid call of findDevices() when CUDA API returned an error, perhaps "
272 "canPerformDeviceDetection() was not called appropriately beforehand.");
274 /* things might go horribly wrong if cudart is not compatible with the driver */
275 numDevices = std::min(numDevices, c_cudaMaxDeviceCount);
277 // We expect to start device support/sanity checks with a clean runtime error state
278 gmx::ensureNoPendingDeviceError("Trying to find available CUDA devices.");
280 std::vector<std::unique_ptr<DeviceInformation>> deviceInfoList(numDevices);
281 for (int i = 0; i < numDevices; i++)
284 memset(&prop, 0, sizeof(cudaDeviceProp));
285 stat = cudaGetDeviceProperties(&prop, i);
287 deviceInfoList[i] = std::make_unique<DeviceInformation>();
288 deviceInfoList[i]->id = i;
289 deviceInfoList[i]->prop = prop;
290 deviceInfoList[i]->deviceVendor = DeviceVendor::Nvidia;
292 const DeviceStatus checkResult = (stat != cudaSuccess) ? DeviceStatus::NonFunctional
293 : checkDeviceStatus(*deviceInfoList[i]);
295 deviceInfoList[i]->status = checkResult;
297 if (checkResult != DeviceStatus::Compatible)
300 // - we inspect the CUDA API state to retrieve and record any
301 // errors that occurred during is_gmx_supported_gpu_id() here,
302 // but this would be more elegant done within is_gmx_supported_gpu_id()
303 // and only return a string with the error if one was encountered.
304 // - we'll be reporting without rank information which is not ideal.
305 // - we'll end up warning also in cases where users would already
306 // get an error before mdrun aborts.
308 // Here we also clear the CUDA API error state so potential
309 // errors during sanity checks don't propagate.
310 const std::string errorMessage = gmx::formatString(
311 "An error occurred while sanity checking device #%d.", deviceInfoList[i]->id);
312 gmx::ensureNoPendingDeviceError(errorMessage);
316 stat = cudaPeekAtLastError();
319 ("We promise to return with clean CUDA state, but non-success state encountered. "
320 + gmx::getDeviceErrorString(stat))
323 return deviceInfoList;
326 void setActiveDevice(const DeviceInformation& deviceInfo)
328 int deviceId = deviceInfo.id;
331 stat = cudaSetDevice(deviceId);
332 if (stat != cudaSuccess)
334 auto message = gmx::formatString("Failed to initialize GPU #%d", deviceId);
335 CU_RET_ERR(stat, message.c_str());
340 fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceId, deviceInfo.prop.name);
344 void releaseDevice(DeviceInformation* deviceInfo)
346 // device was used is that deviceInfo will be non-null.
347 if (deviceInfo != nullptr)
352 stat = cudaGetDevice(&gpuid);
353 if (stat == cudaSuccess)
357 fprintf(stderr, "Cleaning up context on GPU ID #%d.\n", gpuid);
360 stat = cudaDeviceReset();
361 if (stat != cudaSuccess)
363 gmx_warning("Failed to free GPU #%d. %s", gpuid, gmx::getDeviceErrorString(stat).c_str());
369 std::string getDeviceInformationString(const DeviceInformation& deviceInfo)
371 bool gpuExists = (deviceInfo.status != DeviceStatus::Nonexistent
372 && deviceInfo.status != DeviceStatus::NonFunctional);
376 return gmx::formatString("#%d: %s, stat: %s", deviceInfo.id, "N/A",
377 c_deviceStateString[deviceInfo.status]);
381 return gmx::formatString("#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
382 deviceInfo.id, deviceInfo.prop.name, deviceInfo.prop.major,
383 deviceInfo.prop.minor, deviceInfo.prop.ECCEnabled ? "yes" : " no",
384 c_deviceStateString[deviceInfo.status]);