Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / openmp / libomptarget / plugins-nextgen / cuda / dynamic_cuda / cuda.cpp
blob3d0de0d5b2caff68dad59a4b43613247ddaa4c79
1 //===--- cuda/dynamic_cuda/cuda.cpp ------------------------------ C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implement subset of cuda api by calling into cuda library via dlopen
10 // Does the dlopen/dlsym calls as part of the call to cuInit
12 //===----------------------------------------------------------------------===//
14 #include "llvm/Support/DynamicLibrary.h"
16 #include "Debug.h"
17 #include "cuda.h"
18 #include "dlwrap.h"
20 #include <memory>
21 #include <string>
22 #include <unordered_map>
24 DLWRAP_INITIALIZE()
26 DLWRAP_INTERNAL(cuInit, 1)
28 DLWRAP(cuCtxGetDevice, 1)
29 DLWRAP(cuDeviceGet, 2)
30 DLWRAP(cuDeviceGetAttribute, 3)
31 DLWRAP(cuDeviceGetCount, 1)
32 DLWRAP(cuFuncGetAttribute, 3)
34 // Device info
35 DLWRAP(cuDeviceGetName, 3)
36 DLWRAP(cuDeviceTotalMem, 2)
37 DLWRAP(cuDriverGetVersion, 1)
39 DLWRAP(cuGetErrorString, 2)
40 DLWRAP(cuLaunchKernel, 11)
42 DLWRAP(cuMemAlloc, 2)
43 DLWRAP(cuMemAllocHost, 2)
44 DLWRAP(cuMemAllocManaged, 3)
46 DLWRAP(cuMemcpyDtoDAsync, 4)
47 DLWRAP(cuMemcpyDtoH, 3)
48 DLWRAP(cuMemcpyDtoHAsync, 4)
49 DLWRAP(cuMemcpyHtoD, 3)
50 DLWRAP(cuMemcpyHtoDAsync, 4)
52 DLWRAP(cuMemFree, 1)
53 DLWRAP(cuMemFreeHost, 1)
54 DLWRAP(cuModuleGetFunction, 3)
55 DLWRAP(cuModuleGetGlobal, 4)
57 DLWRAP(cuModuleUnload, 1)
58 DLWRAP(cuStreamCreate, 2)
59 DLWRAP(cuStreamDestroy, 1)
60 DLWRAP(cuStreamSynchronize, 1)
61 DLWRAP(cuStreamQuery, 1)
62 DLWRAP(cuCtxSetCurrent, 1)
63 DLWRAP(cuDevicePrimaryCtxRelease, 1)
64 DLWRAP(cuDevicePrimaryCtxGetState, 3)
65 DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
66 DLWRAP(cuDevicePrimaryCtxRetain, 2)
67 DLWRAP(cuModuleLoadDataEx, 5)
69 DLWRAP(cuDeviceCanAccessPeer, 3)
70 DLWRAP(cuCtxEnablePeerAccess, 2)
71 DLWRAP(cuMemcpyPeerAsync, 6)
73 DLWRAP(cuCtxGetLimit, 2)
74 DLWRAP(cuCtxSetLimit, 2)
76 DLWRAP(cuEventCreate, 2)
77 DLWRAP(cuEventRecord, 2)
78 DLWRAP(cuStreamWaitEvent, 3)
79 DLWRAP(cuEventSynchronize, 1)
80 DLWRAP(cuEventDestroy, 1)
82 DLWRAP_FINALIZE()
84 DLWRAP(cuMemUnmap, 2)
85 DLWRAP(cuMemRelease, 1)
86 DLWRAP(cuMemAddressFree, 2)
87 DLWRAP(cuMemGetInfo, 2)
88 DLWRAP(cuMemAddressReserve, 5)
89 DLWRAP(cuMemMap, 5)
90 DLWRAP(cuMemCreate, 4)
91 DLWRAP(cuMemSetAccess, 4)
92 DLWRAP(cuMemGetAllocationGranularity, 3)
// Library name handed to the dynamic loader. It is only a fallback: a build
// that already defines DYNAMIC_CUDA_PATH keeps its own value.
#if !defined(DYNAMIC_CUDA_PATH)
#define DYNAMIC_CUDA_PATH "libcuda.so"
#endif

// Fallback target name and debug prefix, likewise overridable by the build.
// GETNAME is not defined in this file (presumably provided by Debug.h).
#if !defined(TARGET_NAME)
#define TARGET_NAME CUDA
#endif
#if !defined(DEBUG_PREFIX)
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
#endif
105 static bool checkForCUDA() {
106 // return true if dlopen succeeded and all functions found
108 // Prefer _v2 versions of functions if found in the library
109 std::unordered_map<std::string, const char *> TryFirst = {
110 {"cuMemAlloc", "cuMemAlloc_v2"},
111 {"cuMemFree", "cuMemFree_v2"},
112 {"cuMemcpyDtoH", "cuMemcpyDtoH_v2"},
113 {"cuMemcpyHtoD", "cuMemcpyHtoD_v2"},
114 {"cuStreamDestroy", "cuStreamDestroy_v2"},
115 {"cuModuleGetGlobal", "cuModuleGetGlobal_v2"},
116 {"cuMemcpyDtoHAsync", "cuMemcpyDtoHAsync_v2"},
117 {"cuMemcpyDtoDAsync", "cuMemcpyDtoDAsync_v2"},
118 {"cuMemcpyHtoDAsync", "cuMemcpyHtoDAsync_v2"},
119 {"cuDevicePrimaryCtxRelease", "cuDevicePrimaryCtxRelease_v2"},
120 {"cuDevicePrimaryCtxSetFlags", "cuDevicePrimaryCtxSetFlags_v2"},
123 const char *CudaLib = DYNAMIC_CUDA_PATH;
124 std::string ErrMsg;
125 auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
126 llvm::sys::DynamicLibrary::getPermanentLibrary(CudaLib, &ErrMsg));
127 if (!DynlibHandle->isValid()) {
128 DP("Unable to load library '%s': %s!\n", CudaLib, ErrMsg.c_str());
129 return false;
132 for (size_t I = 0; I < dlwrap::size(); I++) {
133 const char *Sym = dlwrap::symbol(I);
135 auto It = TryFirst.find(Sym);
136 if (It != TryFirst.end()) {
137 const char *First = It->second;
138 void *P = DynlibHandle->getAddressOfSymbol(First);
139 if (P) {
140 DP("Implementing %s with dlsym(%s) -> %p\n", Sym, First, P);
141 *dlwrap::pointer(I) = P;
142 continue;
146 void *P = DynlibHandle->getAddressOfSymbol(Sym);
147 if (P == nullptr) {
148 DP("Unable to find '%s' in '%s'!\n", Sym, CudaLib);
149 return false;
151 DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
153 *dlwrap::pointer(I) = P;
156 return true;
159 CUresult cuInit(unsigned X) {
160 // Note: Called exactly once from cuda rtl.cpp in a global constructor so
161 // does not need to handle being called repeatedly or concurrently
162 if (!checkForCUDA()) {
163 return CUDA_ERROR_INVALID_HANDLE;
165 return dlwrap_cuInit(X);