libc/utils/gpu/loader/nvptx/nvptx-loader.cpp

   1 //===-- Loader Implementation for NVPTX devices --------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file implements a simple loader to run images supporting the NVPTX
  10 // architecture. The file launches the '_start' kernel which should be provided
  11 // by the device application start code and ultimately call the 'main'
  12 // function.
  13 //
  14 //===----------------------------------------------------------------------===//
  15
  16 #include "Loader.h"
  17
  18 #include "cuda.h"
  19
  20 #include "llvm/Object/ELF.h"
  21 #include "llvm/Object/ELFObjectFile.h"
  22
  23 #include <atomic>
  24 #include <cstddef>
  25 #include <cstdio>
  26 #include <cstdlib>
  27 #include <cstring>
  28 #include <thread>
  29 #include <vector>
  30
  31 using namespace llvm;
  32 using namespace object;
  33
  34 static void handle_error_impl(const char *file, int32_t line, CUresult err) {
  35   if (err == CUDA_SUCCESS)
  36     return;
  37
  38   const char *err_str = nullptr;
  39   CUresult result = cuGetErrorString(err, &err_str);
  40   if (result != CUDA_SUCCESS)
  41     fprintf(stderr, "%s:%d:0: Unknown Error\n", file, line);
  42   else
  43     fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, err_str);
  44   exit(1);
  45 }
  46
  47 // Gets the names of all the globals that contain functions to initialize or
  48 // deinitialize. We need to do this manually because the NVPTX toolchain does
  49 // not contain the necessary binary manipulation tools.
  50 template <typename Alloc>
  51 Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
  52                                      Alloc allocator, CUmodule binary) {
  53   auto mem_buffer = MemoryBuffer::getMemBuffer(
  54       StringRef(reinterpret_cast<const char *>(image), size), "image",
  55       /*RequiresNullTerminator=*/false);
  56   Expected<ELF64LEObjectFile> elf_or_err =
  57       ELF64LEObjectFile::create(*mem_buffer);
  58   if (!elf_or_err)
  59     handle_error(toString(elf_or_err.takeError()).c_str());
  60
  61   std::vector<std::pair<const char *, uint16_t>> ctors;
  62   std::vector<std::pair<const char *, uint16_t>> dtors;
  63   // CUDA has no way to iterate over all the symbols so we need to inspect the
  64   // ELF directly using the LLVM libraries.
  65   for (const auto &symbol : elf_or_err->symbols()) {
  66     auto name_or_err = symbol.getName();
  67     if (!name_or_err)
  68       handle_error(toString(name_or_err.takeError()).c_str());
  69
  70     // Search for all symbols that contain a constructor or destructor.
  71     if (!name_or_err->starts_with("__init_array_object_") &&
  72         !name_or_err->starts_with("__fini_array_object_"))
  73       continue;
  74
  75     uint16_t priority;
  76     if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
  77       handle_error("Invalid priority for constructor or destructor");
  78
  79     if (name_or_err->starts_with("__init"))
  80       ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
  81     else
  82       dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
  83   }
  84   // Lower priority constructors are run before higher ones. The reverse is true
  85   // for destructors.
  86   llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
  87   llvm::sort(dtors, [](auto x, auto y) { return x.second < y.second; });
  88
  89   // Allocate host pinned memory to make these arrays visible to the GPU.
  90   CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
  91       ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
  92   uint64_t global_size = 0;
  93
  94   // Get the address of the global and then store the address of the constructor
  95   // function to call in the constructor array.
  96   CUdeviceptr *dev_ctors_start = dev_memory;
  97   CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
  98   for (uint64_t i = 0; i < ctors.size(); ++i) {
  99     CUdeviceptr dev_ptr;
 100     if (CUresult err =
 101             cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
 102       handle_error(err);
 103     if (CUresult err =
 104             cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
 105       handle_error(err);
 106   }
 107
 108   // Get the address of the global and then store the address of the destructor
 109   // function to call in the destructor array.
 110   CUdeviceptr *dev_dtors_start = dev_ctors_end;
 111   CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
 112   for (uint64_t i = 0; i < dtors.size(); ++i) {
 113     CUdeviceptr dev_ptr;
 114     if (CUresult err =
 115             cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
 116       handle_error(err);
 117     if (CUresult err =
 118             cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
 119       handle_error(err);
 120   }
 121
 122   // Obtain the address of the pointers the startup implementation uses to
 123   // iterate the constructors and destructors.
 124   CUdeviceptr init_start;
 125   if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
 126                                        "__init_array_start"))
 127     handle_error(err);
 128   CUdeviceptr init_end;
 129   if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
 130                                        "__init_array_end"))
 131     handle_error(err);
 132   CUdeviceptr fini_start;
 133   if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
 134                                        "__fini_array_start"))
 135     handle_error(err);
 136   CUdeviceptr fini_end;
 137   if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
 138                                        "__fini_array_end"))
 139     handle_error(err);
 140
 141   // Copy the pointers to the newly written array to the symbols so the startup
 142   // implementation can iterate them.
 143   if (CUresult err =
 144           cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
 145     handle_error(err);
 146   if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
 147     handle_error(err);
 148   if (CUresult err =
 149           cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
 150     handle_error(err);
 151   if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
 152     handle_error(err);
 153
 154   return dev_memory;
 155 }
 156
 157 void print_kernel_resources(CUmodule binary, const char *kernel_name) {
 158   CUfunction function;
 159   if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
 160     handle_error(err);
 161   int num_regs;
 162   if (CUresult err =
 163           cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function))
 164     handle_error(err);
 165   printf("Executing kernel %s:\n", kernel_name);
 166   printf("%6s registers: %d\n", kernel_name, num_regs);
 167 }
 168
 169 template <typename args_t>
 170 CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server,
 171                        const LaunchParameters &params, const char *kernel_name,
 172                        args_t kernel_args, bool print_resource_usage) {
 173   // look up the '_start' kernel in the loaded module.
 174   CUfunction function;
 175   if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
 176     handle_error(err);
 177
 178   // Set up the arguments to the '_start' kernel on the GPU.
 179   uint64_t args_size = sizeof(args_t);
 180   void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
 181                          CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
 182                          CU_LAUNCH_PARAM_END};
 183   if (print_resource_usage)
 184     print_kernel_resources(binary, kernel_name);
 185
 186   // Initialize a non-blocking CUDA stream to allocate memory if needed.
 187   // This needs to be done on a separate stream or else it will deadlock
 188   // with the executing kernel.
 189   CUstream memory_stream;
 190   if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
 191     handle_error(err);
 192
 193   std::atomic<bool> finished = false;
 194   std::thread server_thread(
 195       [](std::atomic<bool> *finished, rpc::Server *server,
 196          CUstream memory_stream) {
 197         auto malloc_handler = [&](size_t size) -> void * {
 198           CUdeviceptr dev_ptr;
 199           if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
 200             dev_ptr = 0UL;
 201
 202           // Wait until the memory allocation is complete.
 203           while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
 204             ;
 205           return reinterpret_cast<void *>(dev_ptr);
 206         };
 207
 208         auto free_handler = [&](void *ptr) -> void {
 209           if (CUresult err = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr),
 210                                             memory_stream))
 211             handle_error(err);
 212         };
 213
 214         uint32_t index = 0;
 215         while (!*finished) {
 216           index =
 217               handle_server<32>(*server, index, malloc_handler, free_handler);
 218         }
 219       },
 220       &finished, &server, memory_stream);
 221
 222   // Call the kernel with the given arguments.
 223   if (CUresult err = cuLaunchKernel(
 224           function, params.num_blocks_x, params.num_blocks_y,
 225           params.num_blocks_z, params.num_threads_x, params.num_threads_y,
 226           params.num_threads_z, 0, stream, nullptr, args_config))
 227     handle_error(err);
 228
 229   if (CUresult err = cuStreamSynchronize(stream))
 230     handle_error(err);
 231
 232   finished = true;
 233   if (server_thread.joinable())
 234     server_thread.join();
 235
 236   return CUDA_SUCCESS;
 237 }
 238
 239 int load(int argc, const char **argv, const char **envp, void *image,
 240          size_t size, const LaunchParameters &params,
 241          bool print_resource_usage) {
 242   if (CUresult err = cuInit(0))
 243     handle_error(err);
 244   // Obtain the first device found on the system.
 245   uint32_t device_id = 0;
 246   CUdevice device;
 247   if (CUresult err = cuDeviceGet(&device, device_id))
 248     handle_error(err);
 249
 250   // Initialize the CUDA context and claim it for this execution.
 251   CUcontext context;
 252   if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
 253     handle_error(err);
 254   if (CUresult err = cuCtxSetCurrent(context))
 255     handle_error(err);
 256
 257   // Increase the stack size per thread.
 258   // TODO: We should allow this to be passed in so only the tests that require a
 259   // larger stack can specify it to save on memory usage.
 260   if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024))
 261     handle_error(err);
 262
 263   // Initialize a non-blocking CUDA stream to execute the kernel.
 264   CUstream stream;
 265   if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
 266     handle_error(err);
 267
 268   // Load the image into a CUDA module.
 269   CUmodule binary;
 270   if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
 271     handle_error(err);
 272
 273   // Allocate pinned memory on the host to hold the pointer array for the
 274   // copied argv and allow the GPU device to access it.
 275   auto allocator = [&](uint64_t size) -> void * {
 276     void *dev_ptr;
 277     if (CUresult err = cuMemAllocHost(&dev_ptr, size))
 278       handle_error(err);
 279     return dev_ptr;
 280   };
 281
 282   auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
 283   if (!memory_or_err)
 284     handle_error(toString(memory_or_err.takeError()).c_str());
 285
 286   void *dev_argv = copy_argument_vector(argc, argv, allocator);
 287   if (!dev_argv)
 288     handle_error("Failed to allocate device argv");
 289
 290   // Allocate pinned memory on the host to hold the pointer array for the
 291   // copied environment array and allow the GPU device to access it.
 292   void *dev_envp = copy_environment(envp, allocator);
 293   if (!dev_envp)
 294     handle_error("Failed to allocate device environment");
 295
 296   // Allocate space for the return pointer and initialize it to zero.
 297   CUdeviceptr dev_ret;
 298   if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
 299     handle_error(err);
 300   if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
 301     handle_error(err);
 302
 303   uint32_t warp_size = 32;
 304   void *rpc_buffer = nullptr;
 305   if (CUresult err = cuMemAllocHost(
 306           &rpc_buffer,
 307           rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
 308     handle_error(err);
 309   rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
 310   rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
 311
 312   // Initialize the RPC client on the device by copying the local data to the
 313   // device's internal pointer.
 314   CUdeviceptr rpc_client_dev = 0;
 315   uint64_t client_ptr_size = sizeof(void *);
 316   if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
 317                                        binary, "__llvm_rpc_client"))
 318     handle_error(err);
 319
 320   if (CUresult err = cuMemcpyHtoD(rpc_client_dev, &client, sizeof(rpc::Client)))
 321     handle_error(err);
 322
 323   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
 324   begin_args_t init_args = {argc, dev_argv, dev_envp};
 325   if (CUresult err =
 326           launch_kernel(binary, stream, server, single_threaded_params,
 327                         "_begin", init_args, print_resource_usage))
 328     handle_error(err);
 329
 330   start_args_t args = {argc, dev_argv, dev_envp,
 331                        reinterpret_cast<void *>(dev_ret)};
 332   if (CUresult err = launch_kernel(binary, stream, server, params, "_start",
 333                                    args, print_resource_usage))
 334     handle_error(err);
 335
 336   // Copy the return value back from the kernel and wait.
 337   int host_ret = 0;
 338   if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
 339     handle_error(err);
 340
 341   if (CUresult err = cuStreamSynchronize(stream))
 342     handle_error(err);
 343
 344   end_args_t fini_args = {host_ret};
 345   if (CUresult err =
 346           launch_kernel(binary, stream, server, single_threaded_params, "_end",
 347                         fini_args, print_resource_usage))
 348     handle_error(err);
 349
 350   // Free the memory allocated for the device.
 351   if (CUresult err = cuMemFreeHost(*memory_or_err))
 352     handle_error(err);
 353   if (CUresult err = cuMemFree(dev_ret))
 354     handle_error(err);
 355   if (CUresult err = cuMemFreeHost(dev_argv))
 356     handle_error(err);
 357   if (CUresult err = cuMemFreeHost(rpc_buffer))
 358     handle_error(err);
 359
 360   // Destroy the context and the loaded binary.
 361   if (CUresult err = cuModuleUnload(binary))
 362     handle_error(err);
 363   if (CUresult err = cuDevicePrimaryCtxRelease(device))
 364     handle_error(err);
 365   return host_ret;
 366 }