//===-- Loader Implementation for NVPTX devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the NVPTX
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application's start code and will ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//
16 #include "Loader.h"
18 #include "cuda.h"
20 #include "llvm/Object/ELF.h"
21 #include "llvm/Object/ELFObjectFile.h"
23 #include <cstddef>
24 #include <cstdio>
25 #include <cstdlib>
26 #include <cstring>
27 #include <vector>
29 using namespace llvm;
30 using namespace object;

static void handle_error(CUresult err) {
  if (err == CUDA_SUCCESS)
    return;

  const char *err_str = nullptr;
  CUresult result = cuGetErrorString(err, &err_str);
  if (result != CUDA_SUCCESS)
    fprintf(stderr, "Unknown Error\n");
  else
    fprintf(stderr, "%s\n", err_str);
  exit(1);
}

// Gets the names of all the globals that contain functions to initialize or
// deinitialize. We need to do this manually because the NVPTX toolchain does
// not contain the necessary binary manipulation tools.
template <typename Alloc>
Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
                                     Alloc allocator, CUmodule binary) {
  auto mem_buffer = MemoryBuffer::getMemBuffer(
      StringRef(reinterpret_cast<const char *>(image), size), "image",
      /*RequiresNullTerminator=*/false);
  Expected<ELF64LEObjectFile> elf_or_err =
      ELF64LEObjectFile::create(*mem_buffer);
  if (!elf_or_err)
    handle_error(toString(elf_or_err.takeError()).c_str());

  std::vector<std::pair<const char *, uint16_t>> ctors;
  std::vector<std::pair<const char *, uint16_t>> dtors;
  // CUDA has no way to iterate over all the symbols so we need to inspect the
  // ELF directly using the LLVM libraries.
  for (const auto &symbol : elf_or_err->symbols()) {
    auto name_or_err = symbol.getName();
    if (!name_or_err)
      handle_error(toString(name_or_err.takeError()).c_str());

    // Search for all symbols that contain a constructor or destructor.
    if (!name_or_err->starts_with("__init_array_object_") &&
        !name_or_err->starts_with("__fini_array_object_"))
      continue;
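
    // The constructor or destructor priority is encoded as the last
    // '_'-separated field of the symbol name.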
    uint16_t priority;
    if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
      handle_error("Invalid priority for constructor or destructor");

    if (name_or_err->starts_with("__init"))
      ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
    else
      dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
  }

  // Lower priority constructors are run before higher ones. The reverse is
  // true for destructors.
  llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
  llvm::sort(dtors, [](auto x, auto y) { return x.second > y.second; });

  // Allocate host pinned memory to make these arrays visible to the GPU.
  CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
      ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
  uint64_t global_size = 0;

  // Get the address of the global and then store the address of the
  // constructor function to call in the constructor array.
  CUdeviceptr *dev_ctors_start = dev_memory;
  CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
  for (uint64_t i = 0; i < ctors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Get the address of the global and then store the address of the
  // destructor function to call in the destructor array.
  CUdeviceptr *dev_dtors_start = dev_ctors_end;
  CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
  for (uint64_t i = 0; i < dtors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Obtain the address of the pointers the startup implementation uses to
  // iterate the constructors and destructors.
  CUdeviceptr init_start;
  if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
                                       "__init_array_start"))
    handle_error(err);
  CUdeviceptr init_end;
  if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
                                       "__init_array_end"))
    handle_error(err);
  CUdeviceptr fini_start;
  if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
                                       "__fini_array_start"))
    handle_error(err);
  CUdeviceptr fini_end;
  if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
                                       "__fini_array_end"))
    handle_error(err);

  // Copy the pointers to the newly written array to the symbols so the startup
  // implementation can iterate them.
  if (CUresult err =
          cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err =
          cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
    handle_error(err);

  return dev_memory;
}

template <typename args_t>
CUresult launch_kernel(CUmodule binary, CUstream stream,
                       const LaunchParameters &params, const char *kernel_name,
                       args_t kernel_args) {
  // Look up the requested kernel in the loaded module.
  CUfunction function;
  if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
    handle_error(err);

  // Set up the arguments to the kernel on the GPU.
  uint64_t args_size = sizeof(args_t);
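  // The argument struct is passed as a single packed buffer through the
  // 'extra' parameter of cuLaunchKernel rather than as individual kernel
  // parameters.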
  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
                         CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                         CU_LAUNCH_PARAM_END};

  // Initialize a non-blocking CUDA stream to allocate memory if needed. This
  // needs to be done on a separate stream or else it will deadlock with the
  // executing kernel.
  CUstream memory_stream;
  if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  // Register RPC callbacks for the malloc and free functions used by the
  // device.
  uint32_t device_id = 0;
  register_rpc_callbacks<32>(device_id);

  rpc_register_callback(
      device_id, RPC_MALLOC,
      [](rpc_port_t port, void *data) {
        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
          CUstream memory_stream = *static_cast<CUstream *>(data);
          uint64_t size = buffer->data[0];
          CUdeviceptr dev_ptr;
          if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
            handle_error(err);

          // Wait until the memory allocation is complete.
          while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
            ;
          buffer->data[0] = static_cast<uintptr_t>(dev_ptr);
        };
        rpc_recv_and_send(port, malloc_handler, data);
      },
      &memory_stream);
  rpc_register_callback(
      device_id, RPC_FREE,
      [](rpc_port_t port, void *data) {
        auto free_handler = [](rpc_buffer_t *buffer, void *data) {
          CUstream memory_stream = *static_cast<CUstream *>(data);
          if (CUresult err = cuMemFreeAsync(
                  static_cast<CUdeviceptr>(buffer->data[0]), memory_stream))
            handle_error(err);
        };
        rpc_recv_and_send(port, free_handler, data);
      },
      &memory_stream);

  // Call the kernel with the given arguments.
  if (CUresult err = cuLaunchKernel(
          function, params.num_blocks_x, params.num_blocks_y,
          params.num_blocks_z, params.num_threads_x, params.num_threads_y,
          params.num_threads_z, 0, stream, nullptr, args_config))
    handle_error(err);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
    if (rpc_status_t err = rpc_handle_server(device_id))
      handle_error(err);

  // Handle the server one more time in case the kernel exited with a pending
  // send still in flight.
  if (rpc_status_t err = rpc_handle_server(device_id))
    handle_error(err);

  return CUDA_SUCCESS;
}

int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params) {
  if (CUresult err = cuInit(0))
    handle_error(err);
  // Obtain the first device found on the system.
  uint32_t num_devices = 1;
  uint32_t device_id = 0;
  CUdevice device;
  if (CUresult err = cuDeviceGet(&device, device_id))
    handle_error(err);

  // Initialize the CUDA context and claim it for this execution.
  CUcontext context;
  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
    handle_error(err);
  if (CUresult err = cuCtxSetCurrent(context))
    handle_error(err);

  // Increase the stack size per thread.
  // TODO: We should allow this to be passed in so only the tests that require
  // a larger stack can specify it to save on memory usage.
  if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024))
    handle_error(err);

  // Initialize a non-blocking CUDA stream to execute the kernel.
  CUstream stream;
  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
    handle_error(err);

  // Load the image into a CUDA module.
  CUmodule binary;
  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
    handle_error(err);

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied argv and allow the GPU device to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr;
    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
      handle_error(err);
    return dev_ptr;
  };

  auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
  if (!memory_or_err)
    handle_error(toString(memory_or_err.takeError()).c_str());

  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate pinned memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU device to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  CUdeviceptr dev_ret;
  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
    handle_error(err);
  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
    handle_error(err);

  if (rpc_status_t err = rpc_init(num_devices))
    handle_error(err);
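
  // All NVIDIA architectures use a fixed warp size of 32 lanes.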
  uint32_t warp_size = 32;
  auto rpc_alloc = [](uint64_t size, void *) -> void * {
    void *dev_ptr;
    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
      handle_error(err);
    return dev_ptr;
  };
  if (rpc_status_t err = rpc_server_init(device_id, RPC_MAXIMUM_PORT_COUNT,
                                         warp_size, rpc_alloc, nullptr))
    handle_error(err);

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  CUdeviceptr rpc_client_dev = 0;
  uint64_t client_ptr_size = sizeof(void *);
  if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
                                       binary, rpc_client_symbol_name))
    handle_error(err);
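
  // The module global only holds a pointer to the client's buffer, so first
  // read that pointer back to the host, then write the host-side client state
  // through it.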
  CUdeviceptr rpc_client_host = 0;
  if (CUresult err =
          cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *)))
    handle_error(err);
  if (CUresult err =
          cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(device_id),
                       rpc_get_client_size()))
    handle_error(err);
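
  // The '_begin' and '_end' bookkeeping kernels only need a single thread;
  // only '_start' is launched with the user-requested dimensions.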
  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
                                   "_begin", init_args))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp,
                       reinterpret_cast<void *>(dev_ret)};
  if (CUresult err = launch_kernel(binary, stream, params, "_start", args))
    handle_error(err);

  // Copy the return value back from the kernel and wait.
  int host_ret = 0;
  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
    handle_error(err);

  if (CUresult err = cuStreamSynchronize(stream))
    handle_error(err);
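
  // Launch the '_end' kernel with the program's return value for device-side
  // cleanup.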
  end_args_t fini_args = {host_ret};
  if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
                                   "_end", fini_args))
    handle_error(err);

  // Free the memory allocated for the device.
  if (CUresult err = cuMemFreeHost(*memory_or_err))
    handle_error(err);
  if (CUresult err = cuMemFree(dev_ret))
    handle_error(err);
  if (CUresult err = cuMemFreeHost(dev_argv))
    handle_error(err);
  if (rpc_status_t err = rpc_server_shutdown(
          device_id, [](void *ptr, void *) { cuMemFreeHost(ptr); }, nullptr))
    handle_error(err);

  // Destroy the context and the loaded binary.
  if (CUresult err = cuModuleUnload(binary))
    handle_error(err);
  if (CUresult err = cuDevicePrimaryCtxRelease(device))
    handle_error(err);
  if (rpc_status_t err = rpc_shutdown())
    handle_error(err);
  return host_ret;
}