2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 #include "cudautils.cuh"
43 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
44 #include "gromacs/utility/smalloc.h"
46 /*** Generic CUDA data operation wrappers ***/
48 /*! Launches synchronous or asynchronous host to device memory copy.
50 * The copy is launched in stream s or if not specified, in stream 0.
52 static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
53 bool bAsync = false, cudaStream_t s = 0)
57 if (h_dest == NULL || d_src == NULL || bytes == 0)
64 stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
65 CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
70 stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
71 CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
77 int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes)
79 return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
83 * The copy is launched in stream s or if not specified, in stream 0.
85 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
87 return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
90 /*! Launches synchronous or asynchronous device to host memory copy.
92 * The copy is launched in stream s or if not specified, in stream 0.
94 static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
95 bool bAsync = false, cudaStream_t s = 0)
99 if (d_dest == NULL || h_src == NULL || bytes == 0)
106 stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
107 CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
111 stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
112 CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
118 int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
120 return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
124 * The copy is launched in stream s or if not specified, in stream 0.
126 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
128 return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
131 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
134 * If the pointers to the size variables are NULL no resetting happens.
136 void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
142 stat = cudaFree(d_ptr);
143 CU_RET_ERR(stat, "cudaFree failed");
158 * Reallocation of the memory pointed by d_ptr and copying of the data from
159 * the location pointed by h_src host-side pointer is done. Allocation is
160 * buffered and therefore freeing is only needed if the previously allocated
161 * space is not enough.
162 * The H2D copy is launched in stream s and can be done synchronously or
163 * asynchronously (the default is the latter).
165 void cu_realloc_buffered(void **d_dest, void *h_src,
167 int *curr_size, int *curr_alloc_size,
174 if (d_dest == NULL || req_size < 0)
179 /* reallocate only if the data does not fit = allocation size is smaller
180 than the current requested size */
181 if (req_size > *curr_alloc_size)
183 /* only free if the array has already been initialized */
184 if (*curr_alloc_size >= 0)
186 cu_free_buffered(*d_dest, curr_size, curr_alloc_size);
189 *curr_alloc_size = over_alloc_large(req_size);
191 stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
192 CU_RET_ERR(stat, "cudaMalloc failed in cu_free_buffered");
195 /* size could have changed without actual reallocation */
196 *curr_size = req_size;
198 /* upload to device */
203 cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
207 cu_copy_H2D_sync(*d_dest, h_src, *curr_size * type_size);
212 /*! \brief Return whether texture objects are used on this device.
214 * \param[in] pointer to the GPU device info structure to inspect for texture objects support
215 * \return true if texture objects are used on this device
217 static inline bool use_texobj(const gmx_device_info_t *dev_info)
219 assert(!c_disableCudaTextures);
220 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
221 return (dev_info->prop.major >= 3);
224 /*! \brief Set up texture object for an array of type T.
226 * Set up texture object for an array of type T and bind it to the device memory
227 * \p d_ptr points to.
229 * \tparam[in] T Raw data type
230 * \param[out] texObj texture object to initialize
231 * \param[in] d_ptr pointer to device global memory to bind \p texObj to
232 * \param[in] sizeInBytes size of memory area to bind \p texObj to
234 template <typename T>
235 static void setup1DTexture(cudaTextureObject_t &texObj,
239 assert(!c_disableCudaTextures);
245 memset(&rd, 0, sizeof(rd));
246 rd.resType = cudaResourceTypeLinear;
247 rd.res.linear.devPtr = d_ptr;
248 rd.res.linear.desc = cudaCreateChannelDesc<T>();
249 rd.res.linear.sizeInBytes = sizeInBytes;
251 memset(&td, 0, sizeof(td));
252 td.readMode = cudaReadModeElementType;
253 stat = cudaCreateTextureObject(&texObj, &rd, &td, NULL);
254 CU_RET_ERR(stat, "cudaCreateTextureObject failed");
257 /*! \brief Set up texture reference for an array of type T.
259 * Set up texture object for an array of type T and bind it to the device memory
260 * \p d_ptr points to.
262 * \tparam[in] T Raw data type
263 * \param[out] texObj texture reference to initialize
264 * \param[in] d_ptr pointer to device global memory to bind \p texObj to
265 * \param[in] sizeInBytes size of memory area to bind \p texObj to
267 template <typename T>
268 static void setup1DTexture(const struct texture<T, 1, cudaReadModeElementType> *texRef,
272 assert(!c_disableCudaTextures);
275 cudaChannelFormatDesc cd;
277 cd = cudaCreateChannelDesc<T>();
278 stat = cudaBindTexture(nullptr, texRef, d_ptr, &cd, sizeInBytes);
279 CU_RET_ERR(stat, "cudaBindTexture failed");
282 template <typename T>
283 void initParamLookupTable(T * &d_ptr,
284 cudaTextureObject_t &texObj,
285 const struct texture<T, 1, cudaReadModeElementType> *texRef,
288 const gmx_device_info_t *devInfo)
290 const size_t sizeInBytes = numElem * sizeof(*d_ptr);
291 cudaError_t stat = cudaMalloc((void **)&d_ptr, sizeInBytes);
292 CU_RET_ERR(stat, "cudaMalloc failed in initParamLookupTable");
293 cu_copy_H2D_sync(d_ptr, (void *)h_ptr, sizeInBytes);
295 if (!c_disableCudaTextures)
297 if (use_texobj(devInfo))
299 setup1DTexture<T>(texObj, d_ptr, sizeInBytes);
303 setup1DTexture<T>(texRef, d_ptr, sizeInBytes);
308 template <typename T>
309 void destroyParamLookupTable(T *d_ptr,
310 cudaTextureObject_t texObj,
311 const struct texture<T, 1, cudaReadModeElementType> *texRef,
312 const gmx_device_info_t *devInfo)
314 if (!c_disableCudaTextures)
316 if (use_texobj(devInfo))
318 CU_RET_ERR(cudaDestroyTextureObject(texObj), "cudaDestroyTextureObject on texObj failed");
322 CU_RET_ERR(cudaUnbindTexture(texRef), "cudaUnbindTexture on texRef failed");
325 CU_RET_ERR(cudaFree(d_ptr), "cudaFree failed");
328 /*! \brief Add explicit instantiations of init/destroyParamLookupTable() here as needed.
329 * One should also verify that the result of cudaCreateChannelDesc<T>() during texture setup
330 * looks reasonable, when instantiating the templates for new types - just in case.
332 template void initParamLookupTable<float>(float * &, cudaTextureObject_t &, const texture<float, 1, cudaReadModeElementType> *, const float *, int, const gmx_device_info_t *);
333 template void destroyParamLookupTable<float>(float *, cudaTextureObject_t, const texture<float, 1, cudaReadModeElementType> *, const gmx_device_info_t *);
334 template void initParamLookupTable<int>(int * &, cudaTextureObject_t &, const texture<int, 1, cudaReadModeElementType> *, const int *, int, const gmx_device_info_t *);
335 template void destroyParamLookupTable<int>(int *, cudaTextureObject_t, const texture<int, 1, cudaReadModeElementType> *, const gmx_device_info_t *);