//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel which should be provided
// by the device application start code and will ultimately call the 'main'
// function.
//
//===----------------------------------------------------------------------===//
#include "Loader.h"

#if defined(__has_include)
#if __has_include("hsa/hsa.h")
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#elif __has_include("hsa.h")
#include "hsa.h"
#include "hsa_ext_amd.h"
#endif
#else
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <tuple>
#include <utility>

/// Print the error code and exit if \p code indicates an error.
static void handle_error(hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s\n", desc);
  exit(EXIT_FAILURE);
}
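
// HSA_STATUS_INFO_BREAK is deliberately treated as success here: the agent
// and memory-pool iterators below return it to stop iteration early once a
// match has been found.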

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}
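
// A captureless lambda decays to the plain C function pointer that the HSA
// iterator expects, while the stateful user callback travels through the
// 'void *data' argument.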

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}

/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}

/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}

/// Retrieve the first agent that matches the device type \p flag.
template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}

/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}
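
// Note that iteration continues after a match, so if several global pools
// carry the requested flag the last one reported by the runtime wins.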

template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Register RPC callbacks for the malloc and free functions on HSA.
  uint32_t device_id = 0;
  auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
  rpc_register_callback(
      device_id, RPC_MALLOC,
      [](rpc_port_t port, void *data) {
        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
          auto &[dev_agent, pool] = *static_cast<decltype(tuple) *>(data);
          uint64_t size = buffer->data[0];
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(pool, size,
                                               /*flags=*/0, &dev_ptr))
            handle_error(err);
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr);
        };
        rpc_recv_and_send(port, malloc_handler, data);
      },
      &tuple);
  rpc_register_callback(
      device_id, RPC_FREE,
      [](rpc_port_t port, void *data) {
        auto free_handler = [](rpc_buffer_t *buffer, void *) {
          if (hsa_status_t err = hsa_amd_memory_pool_free(
                  reinterpret_cast<void *>(buffer->data[0])))
            handle_error(err);
        };
        rpc_recv_and_send(port, free_handler, data);
      },
      nullptr);
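
  // These handlers run on the host inside rpc_handle_server whenever a device
  // thread opens the RPC_MALLOC or RPC_FREE port, so device-side heap calls
  // are backed by coarse-grained HSA allocations made here.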

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));
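
  // The kernarg segment size queried above includes any implicit arguments
  // appended after the explicit ones, which is why the whole buffer is zeroed
  // before only sizeof(args_t) bytes are overwritten.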

  // Obtain a packet from the queue.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;
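
  // HSA queue sizes are powers of two, so masking the monotonically
  // increasing packet id yields this packet's slot in the ring buffer.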
  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device. We currently only launch
  // with one thread on the device, forcing the rest of the wavefront to be
  // masked off.
  uint16_t setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
                    (params.num_blocks_z * params.num_threads_z != 1))
                   << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size = private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;

  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
  uint16_t header =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
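
  // The release store publishes the fully initialized packet before the
  // doorbell makes it visible to the packet processor; writing the header
  // last is what transfers ownership of the slot to the device.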

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (hsa_signal_wait_scacquire(
             packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
             /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
    if (rpc_status_t err = rpc_handle_server(device_id))
      handle_error(err);

  // Handle the server one more time in case the kernel exited with a pending
  // send still in flight.
  if (rpc_status_t err = rpc_handle_server(device_id))
    handle_error(err);

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}

/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}
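
// The asynchronous copy decrements the completion signal from one to zero
// once the DMA transfer finishes, so the spin above simply waits for the
// signal value to reach zero.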

int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory model.
  uint32_t num_devices = 1;
  uint32_t device_id = 0;
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_t object;
  if (hsa_status_t err = hsa_code_object_deserialize(image, size, "", &object))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  if (hsa_status_t err =
          hsa_executable_load_code_object(executable, dev_agent, object, ""))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);
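
  // Note that hsa_amd_memory_fill counts in 32-bit words, so a count of one
  // zeroes the entire sizeof(int) allocation.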

  // Allocate fine-grained memory for the RPC server and client to share.
  uint32_t wavefront_size = 0;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
    handle_error(err);

  // Set up the RPC server.
  if (rpc_status_t err = rpc_init(num_devices))
    handle_error(err);
  auto tuple = std::make_tuple(dev_agent, finegrained_pool);
  auto rpc_alloc = [](uint64_t size, void *data) {
    auto &[dev_agent, finegrained_pool] = *static_cast<decltype(tuple) *>(data);
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  if (rpc_status_t err = rpc_server_init(device_id, RPC_MAXIMUM_PORT_COUNT,
                                         wavefront_size, rpc_alloc, &tuple))
    handle_error(err);

  // Register callbacks for the RPC unit tests.
  if (wavefront_size == 32)
    register_rpc_callbacks<32>(device_id);
  else if (wavefront_size == 64)
    register_rpc_callbacks<64>(device_id);
  else
    handle_error("Invalid wavefront size");
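
  // The wavefront size is fixed by the GPU architecture (32 lanes on
  // RDNA-class devices, 64 on GCN and CDNA), and the RPC implementation is
  // templated on it, so the matching instantiation is selected at runtime.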

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_host;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(void *),
                                       /*flags=*/0, &rpc_client_host))
    handle_error(err);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  // Copy the address of the client buffer from the device to the host.
  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
                                    dev_agent, sizeof(void *)))
    handle_error(err);

  void *rpc_client_buffer;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
          coarsegrained_pool, rpc_get_client_size(),
          /*flags=*/0, &rpc_client_buffer))
    handle_error(err);
  std::memcpy(rpc_client_buffer, rpc_get_client_buffer(device_id),
              rpc_get_client_size());

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
                     rpc_client_buffer, host_agent, rpc_get_client_size()))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_buffer))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
    handle_error(err);
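
  // The double indirection above is deliberate: the symbol only holds a
  // pointer to the client, so the loader first reads that device pointer and
  // then writes the initialized client state to the memory it points at.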

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
  // If the clock_freq symbol is missing, no work to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {
    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (hsa_status_t err =
            hsa_agent_get_info(dev_agent,
                               static_cast<hsa_agent_info_t>(
                                   HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                               host_clock_freq))
      handle_error(err);

    void *freq_addr;
    if (hsa_status_t err = hsa_executable_symbol_get_info(
            freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &freq_addr))
      handle_error(err);

    if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                      host_agent, sizeof(uint64_t)))
      handle_error(err);
  }

  // Obtain a queue with the minimum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint32_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          single_threaded_params, "_begin.kd", init_args))
    handle_error(err);
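
  // '_begin.kd' is launched with a single thread, presumably so one-time
  // startup work such as running global constructors happens exactly once
  // before 'main' is entered.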

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err =
          launch_kernel(dev_agent, executable, kernargs_pool,
                        coarsegrained_pool, queue, params, "_start.kd", args))
    handle_error(err);

  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          single_threaded_params, "_end.kd", fini_args))
    handle_error(err);

  if (rpc_status_t err = rpc_server_shutdown(
          device_id, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); },
          nullptr))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_code_object_destroy(object))
    handle_error(err);

  if (rpc_status_t err = rpc_shutdown())
    handle_error(err);
  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}