//===-- Loader Implementation for NVPTX devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the NVPTX
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application start code and ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//
20 #include "llvm/Object/ELF.h"
21 #include "llvm/Object/ELFObjectFile.h"
30 using namespace object
;

static void handle_error(CUresult err) {
  if (err == CUDA_SUCCESS)
    return;

  const char *err_str = nullptr;
  CUresult result = cuGetErrorString(err, &err_str);
  if (result != CUDA_SUCCESS)
    fprintf(stderr, "Unknown Error\n");
  else
    fprintf(stderr, "%s\n", err_str);
  exit(EXIT_FAILURE);
}

// Gets the names of all the globals that contain functions to initialize or
// deinitialize. We need to do this manually because the NVPTX toolchain does
// not contain the necessary binary manipulation tools.
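// Constructors and destructors are expected to appear as symbols named
// '__init_array_object_<priority>' and '__fini_array_object_<priority>', with
// the numeric suffix encoding the init priority parsed below.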
template <typename Alloc>
Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
                                     Alloc allocator, CUmodule binary) {
  auto mem_buffer = MemoryBuffer::getMemBuffer(
      StringRef(reinterpret_cast<const char *>(image), size), "image",
      /*RequiresNullTerminator=*/false);
  Expected<ELF64LEObjectFile> elf_or_err =
      ELF64LEObjectFile::create(*mem_buffer);
  if (!elf_or_err)
    handle_error(toString(elf_or_err.takeError()).c_str());

  std::vector<std::pair<const char *, uint16_t>> ctors;
  std::vector<std::pair<const char *, uint16_t>> dtors;
  // CUDA has no way to iterate over all the symbols so we need to inspect the
  // ELF directly using the LLVM libraries.
  for (const auto &symbol : elf_or_err->symbols()) {
    auto name_or_err = symbol.getName();
    if (!name_or_err)
      handle_error(toString(name_or_err.takeError()).c_str());

    // Search for all symbols that contain a constructor or destructor.
    if (!name_or_err->starts_with("__init_array_object_") &&
        !name_or_err->starts_with("__fini_array_object_"))
      continue;

    uint16_t priority;
    if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
      handle_error("Invalid priority for constructor or destructor");

    if (name_or_err->starts_with("__init"))
      ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
    else
      dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
  }

  // Lower priority constructors are run before higher ones. The reverse is
  // true for destructors.
  llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
  llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; });
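  // Note: both arrays are stored in ascending priority order here; the
  // device-side startup code is assumed to walk the fini array in reverse so
  // that destructors run opposite to the construction order.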

  // Allocate host pinned memory to make these arrays visible to the GPU.
  CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
      ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
  uint64_t global_size = 0;

  // Get the address of the global and then store the address of the
  // constructor function to call in the constructor array.
  CUdeviceptr *dev_ctors_start = dev_memory;
  CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
  for (uint64_t i = 0; i < ctors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Get the address of the global and then store the address of the
  // destructor function to call in the destructor array.
  CUdeviceptr *dev_dtors_start = dev_ctors_end;
  CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
  for (uint64_t i = 0; i < dtors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Obtain the address of the pointers the startup implementation uses to
  // iterate the constructors and destructors.
  CUdeviceptr init_start;
  if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
                                       "__init_array_start"))
    handle_error(err);
  CUdeviceptr init_end;
  if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
                                       "__init_array_end"))
    handle_error(err);
  CUdeviceptr fini_start;
  if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
                                       "__fini_array_start"))
    handle_error(err);
  CUdeviceptr fini_end;
  if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
                                       "__fini_array_end"))
    handle_error(err);

  // Copy the pointers to the newly written array to the symbols so the startup
  // implementation can iterate them.
  if (CUresult err =
          cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err =
          cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
    handle_error(err);

  return dev_memory;
}

template <typename args_t>
CUresult launch_kernel(CUmodule binary, CUstream stream,
                       const LaunchParameters &params, const char *kernel_name,
                       args_t kernel_args) {
  // Look up the requested kernel (e.g. '_start') in the loaded module.
  CUfunction function;
  if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
    handle_error(err);

  // Set up the arguments to the kernel on the GPU.
  uint64_t args_size = sizeof(args_t);
  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
                         CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                         CU_LAUNCH_PARAM_END};
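
  // Passing CU_LAUNCH_PARAM_BUFFER_POINTER / CU_LAUNCH_PARAM_BUFFER_SIZE in
  // the 'extra' argument of cuLaunchKernel hands the driver one packed struct
  // of arguments, so a single buffer works for any kernel signature.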

  // Initialize a non-blocking CUDA stream to allocate memory if needed. This
  // needs to be done on a separate stream or else it will deadlock with the
  // executing kernel.
  CUstream memory_stream;
  if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  // Register RPC callbacks for the malloc and free functions on the device.
  uint32_t device_id = 0;
  register_rpc_callbacks<32>(device_id);
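  // Note: the '32' template argument is assumed to be the lane width of an RPC
  // port, matching the 32-thread NVPTX warp (see 'warp_size' in load() below).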

  rpc_register_callback(
      device_id, RPC_MALLOC,
      [](rpc_port_t port, void *data) {
        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
          CUstream memory_stream = *static_cast<CUstream *>(data);
          uint64_t size = buffer->data[0];
          CUdeviceptr dev_ptr;
          if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
            handle_error(err);

          // Wait until the memory allocation is complete.
          while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
            ;
          buffer->data[0] = static_cast<uintptr_t>(dev_ptr);
        };
        rpc_recv_and_send(port, malloc_handler, data);
      },
      &memory_stream);
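
  // The allocation must go through 'memory_stream': the kernel holding the RPC
  // port is still resident on the main stream, so work queued behind it there
  // would never start and the polling loop above would spin forever.
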
  rpc_register_callback(
      device_id, RPC_FREE,
      [](rpc_port_t port, void *data) {
        auto free_handler = [](rpc_buffer_t *buffer, void *data) {
          CUstream memory_stream = *static_cast<CUstream *>(data);
          if (CUresult err = cuMemFreeAsync(
                  static_cast<CUdeviceptr>(buffer->data[0]), memory_stream))
            handle_error(err);
        };
        rpc_recv_and_send(port, free_handler, data);
      },
      &memory_stream);

  // Call the kernel with the given arguments.
  if (CUresult err = cuLaunchKernel(
          function, params.num_blocks_x, params.num_blocks_y,
          params.num_blocks_z, params.num_threads_x, params.num_threads_y,
          params.num_threads_z, 0, stream, nullptr, args_config))
    handle_error(err);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
    if (rpc_status_t err = rpc_handle_server(device_id))
      handle_error(err);

  // Handle the server one more time in case the kernel exited with a pending
  // send still in flight.
  if (rpc_status_t err = rpc_handle_server(device_id))
    handle_error(err);

  return CUDA_SUCCESS;
}

int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params) {
  if (CUresult err = cuInit(0))
    handle_error(err);
  // Obtain the first device found on the system.
  uint32_t num_devices = 1;
  uint32_t device_id = 0;
  CUdevice device;
  if (CUresult err = cuDeviceGet(&device, device_id))
    handle_error(err);

  // Initialize the CUDA context and claim it for this execution.
  CUcontext context;
  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
    handle_error(err);
  if (CUresult err = cuCtxSetCurrent(context))
    handle_error(err);

  // Increase the stack size per thread.
  // TODO: We should allow this to be passed in so only the tests that require
  // a larger stack can specify it to save on memory usage.
  if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024))
    handle_error(err);
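  // CU_LIMIT_STACK_SIZE is measured in bytes per GPU thread, so this reserves
  // 3 KiB of stack for every thread in the grid.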

  // Initialize a non-blocking CUDA stream to execute the kernel.
  CUstream stream;
  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  // Load the image into a CUDA module.
  CUmodule binary;
  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
    handle_error(err);

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied argv and allow the GPU device to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
      handle_error(err);
    return dev_ptr;
  };
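  // cuMemAllocHost returns page-locked host memory that the device can address
  // directly, so anything copied through this allocator is visible to the GPU
  // without a separate device-side copy.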

  auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
  if (!memory_or_err)
    handle_error(toString(memory_or_err.takeError()).c_str());

  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU device to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  CUdeviceptr dev_ret;
  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
    handle_error(err);
  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
    handle_error(err);
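
  // The '_start' kernel is passed this pointer and stores the result of 'main'
  // through it; the value is copied back to the host after the kernel exits.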

  if (rpc_status_t err = rpc_init(num_devices))
    handle_error(err);

  uint32_t warp_size = 32;
  auto rpc_alloc = [](uint64_t size, void *) -> void * {
    void *dev_ptr = nullptr;
    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
      handle_error(err);
    return dev_ptr;
  };
  if (rpc_status_t err = rpc_server_init(device_id, RPC_MAXIMUM_PORT_COUNT,
                                         warp_size, rpc_alloc, nullptr))
    handle_error(err);

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  CUdeviceptr rpc_client_dev = 0;
  uint64_t client_ptr_size = sizeof(void *);
  if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
                                       binary, rpc_client_symbol_name))
    handle_error(err);

  CUdeviceptr rpc_client_host = 0;
  if (CUresult err =
          cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *)))
    handle_error(err);
  if (CUresult err =
          cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(device_id),
                       rpc_get_client_size()))
    handle_error(err);
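
  // This is a double indirection: the module's global holds a pointer to the
  // client buffer, so we first read that pointer off the device and then write
  // the host-side client state through it.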

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
                                   "_begin", init_args))
    handle_error(err);
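
  // '_begin' is presumed to run the device startup code single-threaded,
  // walking the constructor array set up by get_ctor_dtor_array before 'main'
  // is entered.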

  start_args_t args = {argc, dev_argv, dev_envp,
                       reinterpret_cast<void *>(dev_ret)};
  if (CUresult err = launch_kernel(binary, stream, params, "_start", args))
    handle_error(err);

  // Copy the return value back from the kernel and wait.
  int host_ret = 0;
  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
    handle_error(err);

  if (CUresult err = cuStreamSynchronize(stream))
    handle_error(err);

  end_args_t fini_args = {host_ret};
  if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
                                   "_end", fini_args))
    handle_error(err);
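
  // '_end' mirrors '_begin': it is expected to run the destructor array and
  // any cleanup registered at runtime, given the exit code from 'main'.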

  // Free the memory allocated for the device.
  if (CUresult err = cuMemFreeHost(*memory_or_err))
    handle_error(err);
  if (CUresult err = cuMemFree(dev_ret))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(dev_argv))
    handle_error(err);

  if (rpc_status_t err = rpc_server_shutdown(
          device_id, [](void *ptr, void *) { cuMemFreeHost(ptr); }, nullptr))
    handle_error(err);

  // Destroy the context and the loaded binary.
  if (CUresult err = cuModuleUnload(binary))
    handle_error(err);
  if (CUresult err = cuDevicePrimaryCtxRelease(device))
    handle_error(err);

  if (rpc_status_t err = rpc_shutdown())
    handle_error(err);

  return host_ret;
}