[C++20] [Modules] Fix may-be incorrect ADL for module local entities (#123931)
[llvm-project.git] / libc / utils / gpu / loader / nvptx / nvptx-loader.cpp
blob7d6c176c6f3601dd6c43818d2e0a53fc7ebac9d5
1 //===-- Loader Implementation for NVPTX devices --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a simple loader to run images supporting the NVPTX
10 // architecture. The file launches the '_start' kernel which should be provided
11 // by the device application start code and will ultimately call the 'main'
12 // function.
14 //===----------------------------------------------------------------------===//
16 #include "Loader.h"
18 #include "cuda.h"
20 #include "llvm/Object/ELF.h"
21 #include "llvm/Object/ELFObjectFile.h"
23 #include <atomic>
24 #include <cstddef>
25 #include <cstdio>
26 #include <cstdlib>
27 #include <cstring>
28 #include <thread>
29 #include <vector>
31 using namespace llvm;
32 using namespace object;
34 static void handle_error_impl(const char *file, int32_t line, CUresult err) {
35 if (err == CUDA_SUCCESS)
36 return;
38 const char *err_str = nullptr;
39 CUresult result = cuGetErrorString(err, &err_str);
40 if (result != CUDA_SUCCESS)
41 fprintf(stderr, "%s:%d:0: Unknown Error\n", file, line);
42 else
43 fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, err_str);
44 exit(1);
47 // Gets the names of all the globals that contain functions to initialize or
48 // deinitialize. We need to do this manually because the NVPTX toolchain does
49 // not contain the necessary binary manipulation tools.
50 template <typename Alloc>
51 Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
52 Alloc allocator, CUmodule binary) {
53 auto mem_buffer = MemoryBuffer::getMemBuffer(
54 StringRef(reinterpret_cast<const char *>(image), size), "image",
55 /*RequiresNullTerminator=*/false);
56 Expected<ELF64LEObjectFile> elf_or_err =
57 ELF64LEObjectFile::create(*mem_buffer);
58 if (!elf_or_err)
59 handle_error(toString(elf_or_err.takeError()).c_str());
61 std::vector<std::pair<const char *, uint16_t>> ctors;
62 std::vector<std::pair<const char *, uint16_t>> dtors;
63 // CUDA has no way to iterate over all the symbols so we need to inspect the
64 // ELF directly using the LLVM libraries.
65 for (const auto &symbol : elf_or_err->symbols()) {
66 auto name_or_err = symbol.getName();
67 if (!name_or_err)
68 handle_error(toString(name_or_err.takeError()).c_str());
70 // Search for all symbols that contain a constructor or destructor.
71 if (!name_or_err->starts_with("__init_array_object_") &&
72 !name_or_err->starts_with("__fini_array_object_"))
73 continue;
75 uint16_t priority;
76 if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
77 handle_error("Invalid priority for constructor or destructor");
79 if (name_or_err->starts_with("__init"))
80 ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
81 else
82 dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
84 // Lower priority constructors are run before higher ones. The reverse is true
85 // for destructors.
86 llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
87 llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; });
89 // Allocate host pinned memory to make these arrays visible to the GPU.
90 CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
91 ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
92 uint64_t global_size = 0;
94 // Get the address of the global and then store the address of the constructor
95 // function to call in the constructor array.
96 CUdeviceptr *dev_ctors_start = dev_memory;
97 CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
98 for (uint64_t i = 0; i < ctors.size(); ++i) {
99 CUdeviceptr dev_ptr;
100 if (CUresult err =
101 cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
102 handle_error(err);
103 if (CUresult err =
104 cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
105 handle_error(err);
108 // Get the address of the global and then store the address of the destructor
109 // function to call in the destructor array.
110 CUdeviceptr *dev_dtors_start = dev_ctors_end;
111 CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
112 for (uint64_t i = 0; i < dtors.size(); ++i) {
113 CUdeviceptr dev_ptr;
114 if (CUresult err =
115 cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
116 handle_error(err);
117 if (CUresult err =
118 cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
119 handle_error(err);
122 // Obtain the address of the pointers the startup implementation uses to
123 // iterate the constructors and destructors.
124 CUdeviceptr init_start;
125 if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
126 "__init_array_start"))
127 handle_error(err);
128 CUdeviceptr init_end;
129 if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
130 "__init_array_end"))
131 handle_error(err);
132 CUdeviceptr fini_start;
133 if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
134 "__fini_array_start"))
135 handle_error(err);
136 CUdeviceptr fini_end;
137 if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
138 "__fini_array_end"))
139 handle_error(err);
141 // Copy the pointers to the newly written array to the symbols so the startup
142 // implementation can iterate them.
143 if (CUresult err =
144 cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
145 handle_error(err);
146 if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
147 handle_error(err);
148 if (CUresult err =
149 cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
150 handle_error(err);
151 if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
152 handle_error(err);
154 return dev_memory;
157 void print_kernel_resources(CUmodule binary, const char *kernel_name) {
158 CUfunction function;
159 if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
160 handle_error(err);
161 int num_regs;
162 if (CUresult err =
163 cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function))
164 handle_error(err);
165 printf("Executing kernel %s:\n", kernel_name);
166 printf("%6s registers: %d\n", kernel_name, num_regs);
169 template <typename args_t>
170 CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server,
171 const LaunchParameters &params, const char *kernel_name,
172 args_t kernel_args, bool print_resource_usage) {
173 // look up the '_start' kernel in the loaded module.
174 CUfunction function;
175 if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
176 handle_error(err);
178 // Set up the arguments to the '_start' kernel on the GPU.
179 uint64_t args_size = sizeof(args_t);
180 void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
181 CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
182 CU_LAUNCH_PARAM_END};
183 if (print_resource_usage)
184 print_kernel_resources(binary, kernel_name);
186 // Initialize a non-blocking CUDA stream to allocate memory if needed.
187 // This needs to be done on a separate stream or else it will deadlock
188 // with the executing kernel.
189 CUstream memory_stream;
190 if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
191 handle_error(err);
193 std::atomic<bool> finished = false;
194 std::thread server_thread(
195 [](std::atomic<bool> *finished, rpc::Server *server,
196 CUstream memory_stream) {
197 auto malloc_handler = [&](size_t size) -> void * {
198 CUdeviceptr dev_ptr;
199 if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
200 dev_ptr = 0UL;
202 // Wait until the memory allocation is complete.
203 while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
205 return reinterpret_cast<void *>(dev_ptr);
208 auto free_handler = [&](void *ptr) -> void {
209 if (CUresult err = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr),
210 memory_stream))
211 handle_error(err);
214 uint32_t index = 0;
215 while (!*finished) {
216 index =
217 handle_server<32>(*server, index, malloc_handler, free_handler);
220 &finished, &server, memory_stream);
222 // Call the kernel with the given arguments.
223 if (CUresult err = cuLaunchKernel(
224 function, params.num_blocks_x, params.num_blocks_y,
225 params.num_blocks_z, params.num_threads_x, params.num_threads_y,
226 params.num_threads_z, 0, stream, nullptr, args_config))
227 handle_error(err);
229 if (CUresult err = cuStreamSynchronize(stream))
230 handle_error(err);
232 finished = true;
233 if (server_thread.joinable())
234 server_thread.join();
236 return CUDA_SUCCESS;
239 int load(int argc, const char **argv, const char **envp, void *image,
240 size_t size, const LaunchParameters &params,
241 bool print_resource_usage) {
242 if (CUresult err = cuInit(0))
243 handle_error(err);
244 // Obtain the first device found on the system.
245 uint32_t device_id = 0;
246 CUdevice device;
247 if (CUresult err = cuDeviceGet(&device, device_id))
248 handle_error(err);
250 // Initialize the CUDA context and claim it for this execution.
251 CUcontext context;
252 if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
253 handle_error(err);
254 if (CUresult err = cuCtxSetCurrent(context))
255 handle_error(err);
257 // Increase the stack size per thread.
258 // TODO: We should allow this to be passed in so only the tests that require a
259 // larger stack can specify it to save on memory usage.
260 if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024))
261 handle_error(err);
263 // Initialize a non-blocking CUDA stream to execute the kernel.
264 CUstream stream;
265 if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
266 handle_error(err);
268 // Load the image into a CUDA module.
269 CUmodule binary;
270 if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
271 handle_error(err);
273 // Allocate pinned memory on the host to hold the pointer array for the
274 // copied argv and allow the GPU device to access it.
275 auto allocator = [&](uint64_t size) -> void * {
276 void *dev_ptr;
277 if (CUresult err = cuMemAllocHost(&dev_ptr, size))
278 handle_error(err);
279 return dev_ptr;
282 auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
283 if (!memory_or_err)
284 handle_error(toString(memory_or_err.takeError()).c_str());
286 void *dev_argv = copy_argument_vector(argc, argv, allocator);
287 if (!dev_argv)
288 handle_error("Failed to allocate device argv");
290 // Allocate pinned memory on the host to hold the pointer array for the
291 // copied environment array and allow the GPU device to access it.
292 void *dev_envp = copy_environment(envp, allocator);
293 if (!dev_envp)
294 handle_error("Failed to allocate device environment");
296 // Allocate space for the return pointer and initialize it to zero.
297 CUdeviceptr dev_ret;
298 if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
299 handle_error(err);
300 if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
301 handle_error(err);
303 uint32_t warp_size = 32;
304 void *rpc_buffer = nullptr;
305 if (CUresult err = cuMemAllocHost(
306 &rpc_buffer,
307 rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
308 handle_error(err);
309 rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
310 rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
312 // Initialize the RPC client on the device by copying the local data to the
313 // device's internal pointer.
314 CUdeviceptr rpc_client_dev = 0;
315 uint64_t client_ptr_size = sizeof(void *);
316 if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
317 binary, "__llvm_rpc_client"))
318 handle_error(err);
320 if (CUresult err = cuMemcpyHtoD(rpc_client_dev, &client, sizeof(rpc::Client)))
321 handle_error(err);
323 LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
324 begin_args_t init_args = {argc, dev_argv, dev_envp};
325 if (CUresult err =
326 launch_kernel(binary, stream, server, single_threaded_params,
327 "_begin", init_args, print_resource_usage))
328 handle_error(err);
330 start_args_t args = {argc, dev_argv, dev_envp,
331 reinterpret_cast<void *>(dev_ret)};
332 if (CUresult err = launch_kernel(binary, stream, server, params, "_start",
333 args, print_resource_usage))
334 handle_error(err);
336 // Copy the return value back from the kernel and wait.
337 int host_ret = 0;
338 if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
339 handle_error(err);
341 if (CUresult err = cuStreamSynchronize(stream))
342 handle_error(err);
344 end_args_t fini_args = {host_ret};
345 if (CUresult err =
346 launch_kernel(binary, stream, server, single_threaded_params, "_end",
347 fini_args, print_resource_usage))
348 handle_error(err);
350 // Free the memory allocated for the device.
351 if (CUresult err = cuMemFreeHost(*memory_or_err))
352 handle_error(err);
353 if (CUresult err = cuMemFree(dev_ret))
354 handle_error(err);
355 if (CUresult err = cuMemFreeHost(dev_argv))
356 handle_error(err);
357 if (CUresult err = cuMemFreeHost(rpc_buffer))
358 handle_error(err);
360 // Destroy the context and the loaded binary.
361 if (CUresult err = cuModuleUnload(binary))
362 handle_error(err);
363 if (CUresult err = cuDevicePrimaryCtxRelease(device))
364 handle_error(err);
365 return host_ret;