//===-- Loader Implementation for NVPTX devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the NVPTX
// architecture. The file launches the '_start' kernel which should be provided
// by the device application start code and will ultimately call the 'main'
// function.
//
//===----------------------------------------------------------------------===//
20 #include "llvm/Object/ELF.h"
21 #include "llvm/Object/ELFObjectFile.h"
32 using namespace object
;
34 static void handle_error_impl(const char *file
, int32_t line
, CUresult err
) {
35 if (err
== CUDA_SUCCESS
)
38 const char *err_str
= nullptr;
39 CUresult result
= cuGetErrorString(err
, &err_str
);
40 if (result
!= CUDA_SUCCESS
)
41 fprintf(stderr
, "%s:%d:0: Unknown Error\n", file
, line
);
43 fprintf(stderr
, "%s:%d:0: Error: %s\n", file
, line
, err_str
);
47 // Gets the names of all the globals that contain functions to initialize or
48 // deinitialize. We need to do this manually because the NVPTX toolchain does
49 // not contain the necessary binary manipulation tools.
50 template <typename Alloc
>
51 Expected
<void *> get_ctor_dtor_array(const void *image
, const size_t size
,
52 Alloc allocator
, CUmodule binary
) {
53 auto mem_buffer
= MemoryBuffer::getMemBuffer(
54 StringRef(reinterpret_cast<const char *>(image
), size
), "image",
55 /*RequiresNullTerminator=*/false);
56 Expected
<ELF64LEObjectFile
> elf_or_err
=
57 ELF64LEObjectFile::create(*mem_buffer
);
59 handle_error(toString(elf_or_err
.takeError()).c_str());
61 std::vector
<std::pair
<const char *, uint16_t>> ctors
;
62 std::vector
<std::pair
<const char *, uint16_t>> dtors
;
63 // CUDA has no way to iterate over all the symbols so we need to inspect the
64 // ELF directly using the LLVM libraries.
65 for (const auto &symbol
: elf_or_err
->symbols()) {
66 auto name_or_err
= symbol
.getName();
68 handle_error(toString(name_or_err
.takeError()).c_str());
70 // Search for all symbols that contain a constructor or destructor.
71 if (!name_or_err
->starts_with("__init_array_object_") &&
72 !name_or_err
->starts_with("__fini_array_object_"))
76 if (name_or_err
->rsplit('_').second
.getAsInteger(10, priority
))
77 handle_error("Invalid priority for constructor or destructor");
79 if (name_or_err
->starts_with("__init"))
80 ctors
.emplace_back(std::make_pair(name_or_err
->data(), priority
));
82 dtors
.emplace_back(std::make_pair(name_or_err
->data(), priority
));
84 // Lower priority constructors are run before higher ones. The reverse is true
86 llvm::sort(ctors
, [](auto x
, auto y
) { return x
.second
< y
.second
; });
87 llvm::sort(dtors
, [](auto x
, auto y
) { return x
.second
< y
.second
; });
89 // Allocate host pinned memory to make these arrays visible to the GPU.
90 CUdeviceptr
*dev_memory
= reinterpret_cast<CUdeviceptr
*>(allocator(
91 ctors
.size() * sizeof(CUdeviceptr
) + dtors
.size() * sizeof(CUdeviceptr
)));
92 uint64_t global_size
= 0;
94 // Get the address of the global and then store the address of the constructor
95 // function to call in the constructor array.
96 CUdeviceptr
*dev_ctors_start
= dev_memory
;
97 CUdeviceptr
*dev_ctors_end
= dev_ctors_start
+ ctors
.size();
98 for (uint64_t i
= 0; i
< ctors
.size(); ++i
) {
101 cuModuleGetGlobal(&dev_ptr
, &global_size
, binary
, ctors
[i
].first
))
104 cuMemcpyDtoH(&dev_ctors_start
[i
], dev_ptr
, sizeof(uintptr_t)))
108 // Get the address of the global and then store the address of the destructor
109 // function to call in the destructor array.
110 CUdeviceptr
*dev_dtors_start
= dev_ctors_end
;
111 CUdeviceptr
*dev_dtors_end
= dev_dtors_start
+ dtors
.size();
112 for (uint64_t i
= 0; i
< dtors
.size(); ++i
) {
115 cuModuleGetGlobal(&dev_ptr
, &global_size
, binary
, dtors
[i
].first
))
118 cuMemcpyDtoH(&dev_dtors_start
[i
], dev_ptr
, sizeof(uintptr_t)))
122 // Obtain the address of the pointers the startup implementation uses to
123 // iterate the constructors and destructors.
124 CUdeviceptr init_start
;
125 if (CUresult err
= cuModuleGetGlobal(&init_start
, &global_size
, binary
,
126 "__init_array_start"))
128 CUdeviceptr init_end
;
129 if (CUresult err
= cuModuleGetGlobal(&init_end
, &global_size
, binary
,
132 CUdeviceptr fini_start
;
133 if (CUresult err
= cuModuleGetGlobal(&fini_start
, &global_size
, binary
,
134 "__fini_array_start"))
136 CUdeviceptr fini_end
;
137 if (CUresult err
= cuModuleGetGlobal(&fini_end
, &global_size
, binary
,
141 // Copy the pointers to the newly written array to the symbols so the startup
142 // implementation can iterate them.
144 cuMemcpyHtoD(init_start
, &dev_ctors_start
, sizeof(uintptr_t)))
146 if (CUresult err
= cuMemcpyHtoD(init_end
, &dev_ctors_end
, sizeof(uintptr_t)))
149 cuMemcpyHtoD(fini_start
, &dev_dtors_start
, sizeof(uintptr_t)))
151 if (CUresult err
= cuMemcpyHtoD(fini_end
, &dev_dtors_end
, sizeof(uintptr_t)))
157 void print_kernel_resources(CUmodule binary
, const char *kernel_name
) {
159 if (CUresult err
= cuModuleGetFunction(&function
, binary
, kernel_name
))
163 cuFuncGetAttribute(&num_regs
, CU_FUNC_ATTRIBUTE_NUM_REGS
, function
))
165 printf("Executing kernel %s:\n", kernel_name
);
166 printf("%6s registers: %d\n", kernel_name
, num_regs
);
169 template <typename args_t
>
170 CUresult
launch_kernel(CUmodule binary
, CUstream stream
, rpc::Server
&server
,
171 const LaunchParameters
¶ms
, const char *kernel_name
,
172 args_t kernel_args
, bool print_resource_usage
) {
173 // look up the '_start' kernel in the loaded module.
175 if (CUresult err
= cuModuleGetFunction(&function
, binary
, kernel_name
))
178 // Set up the arguments to the '_start' kernel on the GPU.
179 uint64_t args_size
= sizeof(args_t
);
180 void *args_config
[] = {CU_LAUNCH_PARAM_BUFFER_POINTER
, &kernel_args
,
181 CU_LAUNCH_PARAM_BUFFER_SIZE
, &args_size
,
182 CU_LAUNCH_PARAM_END
};
183 if (print_resource_usage
)
184 print_kernel_resources(binary
, kernel_name
);
186 // Initialize a non-blocking CUDA stream to allocate memory if needed.
187 // This needs to be done on a separate stream or else it will deadlock
188 // with the executing kernel.
189 CUstream memory_stream
;
190 if (CUresult err
= cuStreamCreate(&memory_stream
, CU_STREAM_NON_BLOCKING
))
193 std::atomic
<bool> finished
= false;
194 std::thread
server_thread(
195 [](std::atomic
<bool> *finished
, rpc::Server
*server
,
196 CUstream memory_stream
) {
197 auto malloc_handler
= [&](size_t size
) -> void * {
199 if (CUresult err
= cuMemAllocAsync(&dev_ptr
, size
, memory_stream
))
202 // Wait until the memory allocation is complete.
203 while (cuStreamQuery(memory_stream
) == CUDA_ERROR_NOT_READY
)
205 return reinterpret_cast<void *>(dev_ptr
);
208 auto free_handler
= [&](void *ptr
) -> void {
209 if (CUresult err
= cuMemFreeAsync(reinterpret_cast<CUdeviceptr
>(ptr
),
217 handle_server
<32>(*server
, index
, malloc_handler
, free_handler
);
220 &finished
, &server
, memory_stream
);
222 // Call the kernel with the given arguments.
223 if (CUresult err
= cuLaunchKernel(
224 function
, params
.num_blocks_x
, params
.num_blocks_y
,
225 params
.num_blocks_z
, params
.num_threads_x
, params
.num_threads_y
,
226 params
.num_threads_z
, 0, stream
, nullptr, args_config
))
229 if (CUresult err
= cuStreamSynchronize(stream
))
233 if (server_thread
.joinable())
234 server_thread
.join();
239 int load(int argc
, const char **argv
, const char **envp
, void *image
,
240 size_t size
, const LaunchParameters
¶ms
,
241 bool print_resource_usage
) {
242 if (CUresult err
= cuInit(0))
244 // Obtain the first device found on the system.
245 uint32_t device_id
= 0;
247 if (CUresult err
= cuDeviceGet(&device
, device_id
))
250 // Initialize the CUDA context and claim it for this execution.
252 if (CUresult err
= cuDevicePrimaryCtxRetain(&context
, device
))
254 if (CUresult err
= cuCtxSetCurrent(context
))
257 // Increase the stack size per thread.
258 // TODO: We should allow this to be passed in so only the tests that require a
259 // larger stack can specify it to save on memory usage.
260 if (CUresult err
= cuCtxSetLimit(CU_LIMIT_STACK_SIZE
, 3 * 1024))
263 // Initialize a non-blocking CUDA stream to execute the kernel.
265 if (CUresult err
= cuStreamCreate(&stream
, CU_STREAM_NON_BLOCKING
))
268 // Load the image into a CUDA module.
270 if (CUresult err
= cuModuleLoadDataEx(&binary
, image
, 0, nullptr, nullptr))
273 // Allocate pinned memory on the host to hold the pointer array for the
274 // copied argv and allow the GPU device to access it.
275 auto allocator
= [&](uint64_t size
) -> void * {
277 if (CUresult err
= cuMemAllocHost(&dev_ptr
, size
))
282 auto memory_or_err
= get_ctor_dtor_array(image
, size
, allocator
, binary
);
284 handle_error(toString(memory_or_err
.takeError()).c_str());
286 void *dev_argv
= copy_argument_vector(argc
, argv
, allocator
);
288 handle_error("Failed to allocate device argv");
290 // Allocate pinned memory on the host to hold the pointer array for the
291 // copied environment array and allow the GPU device to access it.
292 void *dev_envp
= copy_environment(envp
, allocator
);
294 handle_error("Failed to allocate device environment");
296 // Allocate space for the return pointer and initialize it to zero.
298 if (CUresult err
= cuMemAlloc(&dev_ret
, sizeof(int)))
300 if (CUresult err
= cuMemsetD32(dev_ret
, 0, 1))
303 uint32_t warp_size
= 32;
304 void *rpc_buffer
= nullptr;
305 if (CUresult err
= cuMemAllocHost(
307 rpc::Server::allocation_size(warp_size
, rpc::MAX_PORT_COUNT
)))
309 rpc::Server
server(rpc::MAX_PORT_COUNT
, rpc_buffer
);
310 rpc::Client
client(rpc::MAX_PORT_COUNT
, rpc_buffer
);
312 // Initialize the RPC client on the device by copying the local data to the
313 // device's internal pointer.
314 CUdeviceptr rpc_client_dev
= 0;
315 uint64_t client_ptr_size
= sizeof(void *);
316 if (CUresult err
= cuModuleGetGlobal(&rpc_client_dev
, &client_ptr_size
,
317 binary
, "__llvm_rpc_client"))
320 if (CUresult err
= cuMemcpyHtoD(rpc_client_dev
, &client
, sizeof(rpc::Client
)))
323 LaunchParameters single_threaded_params
= {1, 1, 1, 1, 1, 1};
324 begin_args_t init_args
= {argc
, dev_argv
, dev_envp
};
326 launch_kernel(binary
, stream
, server
, single_threaded_params
,
327 "_begin", init_args
, print_resource_usage
))
330 start_args_t args
= {argc
, dev_argv
, dev_envp
,
331 reinterpret_cast<void *>(dev_ret
)};
332 if (CUresult err
= launch_kernel(binary
, stream
, server
, params
, "_start",
333 args
, print_resource_usage
))
336 // Copy the return value back from the kernel and wait.
338 if (CUresult err
= cuMemcpyDtoH(&host_ret
, dev_ret
, sizeof(int)))
341 if (CUresult err
= cuStreamSynchronize(stream
))
344 end_args_t fini_args
= {host_ret
};
346 launch_kernel(binary
, stream
, server
, single_threaded_params
, "_end",
347 fini_args
, print_resource_usage
))
350 // Free the memory allocated for the device.
351 if (CUresult err
= cuMemFreeHost(*memory_or_err
))
353 if (CUresult err
= cuMemFree(dev_ret
))
355 if (CUresult err
= cuMemFreeHost(dev_argv
))
357 if (CUresult err
= cuMemFreeHost(rpc_buffer
))
360 // Destroy the context and the loaded binary.
361 if (CUresult err
= cuModuleUnload(binary
))
363 if (CUresult err
= cuDevicePrimaryCtxRelease(device
))