//===-- Loader Implementation for NVPTX devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the NVPTX
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application's start code and will ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//
16 #include "Loader.h"
18 #include "cuda.h"
20 #include "llvm/Object/ELF.h"
21 #include "llvm/Object/ELFObjectFile.h"
23 #include <cstddef>
24 #include <cstdio>
25 #include <cstdlib>
26 #include <cstring>
27 #include <vector>
29 using namespace llvm;
30 using namespace object;

static void handle_error(CUresult err) {
  if (err == CUDA_SUCCESS)
    return;

  const char *err_str = nullptr;
  CUresult result = cuGetErrorString(err, &err_str);
  if (result != CUDA_SUCCESS)
    fprintf(stderr, "Unknown Error\n");
  else
    fprintf(stderr, "%s\n", err_str);
  exit(1);
}

// Gets the names of all the globals that contain functions to initialize or
// deinitialize. We need to do this manually because the NVPTX toolchain does
// not contain the necessary binary manipulation tools.
template <typename Alloc>
Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
                                     Alloc allocator, CUmodule binary) {
  auto mem_buffer = MemoryBuffer::getMemBuffer(
      StringRef(reinterpret_cast<const char *>(image), size), "image",
      /*RequiresNullTerminator=*/false);
  Expected<ELF64LEObjectFile> elf_or_err =
      ELF64LEObjectFile::create(*mem_buffer);
  if (!elf_or_err)
    handle_error(toString(elf_or_err.takeError()).c_str());

  std::vector<std::pair<const char *, uint16_t>> ctors;
  std::vector<std::pair<const char *, uint16_t>> dtors;
  // CUDA has no way to iterate over all the symbols so we need to inspect the
  // ELF directly using the LLVM libraries.
  for (const auto &symbol : elf_or_err->symbols()) {
    auto name_or_err = symbol.getName();
    if (!name_or_err)
      handle_error(toString(name_or_err.takeError()).c_str());

    // Search for all symbols that contain a constructor or destructor.
    if (!name_or_err->starts_with("__init_array_object_") &&
        !name_or_err->starts_with("__fini_array_object_"))
      continue;
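
    // The constructor or destructor priority is encoded as the last
    // '_'-separated field of the symbol name.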
    uint16_t priority;
    if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
      handle_error("Invalid priority for constructor or destructor");

    if (name_or_err->starts_with("__init"))
      ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
    else
      dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
  }

  // Lower priority constructors are run before higher ones. The reverse is
  // true for destructors.
  llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
  llvm::sort(dtors, [](auto x, auto y) { return x.second > y.second; });

  // Allocate host pinned memory to make these arrays visible to the GPU.
  CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
      ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
  uint64_t global_size = 0;

  // Get the address of the global and then store the address of the
  // constructor function to call in the constructor array.
  CUdeviceptr *dev_ctors_start = dev_memory;
  CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
  for (uint64_t i = 0; i < ctors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Get the address of the global and then store the address of the
  // destructor function to call in the destructor array.
  CUdeviceptr *dev_dtors_start = dev_ctors_end;
  CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
  for (uint64_t i = 0; i < dtors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Obtain the address of the pointers the startup implementation uses to
  // iterate the constructors and destructors.
  CUdeviceptr init_start;
  if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
                                       "__init_array_start"))
    handle_error(err);
  CUdeviceptr init_end;
  if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
                                       "__init_array_end"))
    handle_error(err);
  CUdeviceptr fini_start;
  if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
                                       "__fini_array_start"))
    handle_error(err);
  CUdeviceptr fini_end;
  if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
                                       "__fini_array_end"))
    handle_error(err);

  // Copy the pointers to the newly written array to the symbols so the startup
  // implementation can iterate them.
  if (CUresult err =
          cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err =
          cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
    handle_error(err);

  return dev_memory;
}

template <typename args_t>
CUresult launch_kernel(CUmodule binary, CUstream stream,
                       const LaunchParameters &params, const char *kernel_name,
                       args_t kernel_args) {
  // Look up the requested kernel in the loaded module.
  CUfunction function;
  if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
    handle_error(err);

  // Set up the arguments to the kernel on the GPU.
  uint64_t args_size = sizeof(args_t);
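  // The argument struct is passed as a single packed buffer through the
  // 'extra' parameter of cuLaunchKernel rather than as individual kernel
  // parameters.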
  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
                         CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                         CU_LAUNCH_PARAM_END};

  // Initialize a non-blocking CUDA stream to allocate memory if needed. This
  // needs to be done on a separate stream or else it will deadlock with the
  // executing kernel.
  CUstream memory_stream;
  if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  // Register RPC callbacks for the malloc and free functions used by the
  // device.
  uint32_t device_id = 0;
  register_rpc_callbacks<32>(device_id);

  rpc_register_callback(
      device_id, RPC_MALLOC,
      [](rpc_port_t port, void *data) {
        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
          CUstream memory_stream = *static_cast<CUstream *>(data);
          uint64_t size = buffer->data[0];
          CUdeviceptr dev_ptr;
          if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
            handle_error(err);

          // Wait until the memory allocation is complete.
          while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
            ;
          buffer->data[0] = static_cast<uintptr_t>(dev_ptr);
        };
        rpc_recv_and_send(port, malloc_handler, data);
      },
      &memory_stream);
  rpc_register_callback(
      device_id, RPC_FREE,
      [](rpc_port_t port, void *data) {
        auto free_handler = [](rpc_buffer_t *buffer, void *data) {
          CUstream memory_stream = *static_cast<CUstream *>(data);
          if (CUresult err = cuMemFreeAsync(
                  static_cast<CUdeviceptr>(buffer->data[0]), memory_stream))
            handle_error(err);
        };
        rpc_recv_and_send(port, free_handler, data);
      },
      &memory_stream);

  // Call the kernel with the given arguments.
  if (CUresult err = cuLaunchKernel(
          function, params.num_blocks_x, params.num_blocks_y,
          params.num_blocks_z, params.num_threads_x, params.num_threads_y,
          params.num_threads_z, 0, stream, nullptr, args_config))
    handle_error(err);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
    if (rpc_status_t err = rpc_handle_server(device_id))
      handle_error(err);

  // Handle the server one more time in case the kernel exited with a pending
  // send still in flight.
  if (rpc_status_t err = rpc_handle_server(device_id))
    handle_error(err);

  return CUDA_SUCCESS;
}

int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params) {
  if (CUresult err = cuInit(0))
    handle_error(err);
  // Obtain the first device found on the system.
  uint32_t num_devices = 1;
  uint32_t device_id = 0;
  CUdevice device;
  if (CUresult err = cuDeviceGet(&device, device_id))
    handle_error(err);

  // Initialize the CUDA context and claim it for this execution.
  CUcontext context;
  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
    handle_error(err);
  if (CUresult err = cuCtxSetCurrent(context))
    handle_error(err);

  // Increase the stack size per thread.
  // TODO: We should allow this to be passed in so only the tests that require
  // a larger stack can specify it to save on memory usage.
  if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024))
    handle_error(err);

  // Initialize a non-blocking CUDA stream to execute the kernel.
  CUstream stream;
  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  // Load the image into a CUDA module.
  CUmodule binary;
  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
    handle_error(err);

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied argv and allow the GPU device to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr;
    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
      handle_error(err);
    return dev_ptr;
  };

  auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
  if (!memory_or_err)
    handle_error(toString(memory_or_err.takeError()).c_str());

  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU device to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  CUdeviceptr dev_ret;
  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
    handle_error(err);
  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
    handle_error(err);

  if (rpc_status_t err = rpc_init(num_devices))
    handle_error(err);
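
  // All NVIDIA architectures use a fixed warp size of 32 lanes.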
  uint32_t warp_size = 32;
  auto rpc_alloc = [](uint64_t size, void *) -> void * {
    void *dev_ptr;
    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
      handle_error(err);
    return dev_ptr;
  };
  if (rpc_status_t err = rpc_server_init(device_id, RPC_MAXIMUM_PORT_COUNT,
                                         warp_size, rpc_alloc, nullptr))
    handle_error(err);

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  CUdeviceptr rpc_client_dev = 0;
  uint64_t client_ptr_size = sizeof(void *);
  if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
                                       binary, rpc_client_symbol_name))
    handle_error(err);
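
  // The module global only holds a pointer to the client's buffer, so first
  // read that pointer back to the host, then write the host-side client state
  // through it.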
  CUdeviceptr rpc_client_host = 0;
  if (CUresult err =
          cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *)))
    handle_error(err);
  if (CUresult err =
          cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(device_id),
                       rpc_get_client_size()))
    handle_error(err);
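
  // The '_begin' and '_end' bookkeeping kernels only need a single thread;
  // only '_start' is launched with the user-requested dimensions.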
  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
                                   "_begin", init_args))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp,
                       reinterpret_cast<void *>(dev_ret)};
  if (CUresult err = launch_kernel(binary, stream, params, "_start", args))
    handle_error(err);

  // Copy the return value back from the kernel and wait.
  int host_ret = 0;
  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
    handle_error(err);

  if (CUresult err = cuStreamSynchronize(stream))
    handle_error(err);
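
  // Launch the '_end' kernel with the program's return value for device-side
  // cleanup.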
  end_args_t fini_args = {host_ret};
  if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
                                   "_end", fini_args))
    handle_error(err);

  // Free the memory allocated for the device.
  if (CUresult err = cuMemFreeHost(*memory_or_err))
    handle_error(err);
  if (CUresult err = cuMemFree(dev_ret))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(dev_argv))
    handle_error(err);
  if (rpc_status_t err = rpc_server_shutdown(
          device_id, [](void *ptr, void *) { cuMemFreeHost(ptr); }, nullptr))
    handle_error(err);

  // Destroy the context and the loaded binary.
  if (CUresult err = cuModuleUnload(binary))
    handle_error(err);
  if (CUresult err = cuDevicePrimaryCtxRelease(device))
    handle_error(err);
  if (rpc_status_t err = rpc_shutdown())
    handle_error(err);
  return host_ret;
}