//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application's start code and will ultimately call
// the 'main' function.
//
//===----------------------------------------------------------------------===//
16 #include "Loader.h"
18 #include "hsa/hsa.h"
19 #include "hsa/hsa_ext_amd.h"
21 #include "llvm/Frontend/Offloading/Utility.h"
23 #include <atomic>
24 #include <cstdio>
25 #include <cstdlib>
26 #include <cstring>
27 #include <thread>
28 #include <tuple>
29 #include <utility>
// The implicit arguments of COV5 AMDGPU kernels.
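// The padding below places 'grid_dims' at byte offset 64 and makes the struct
// 256 bytes in total, mirroring the offsets of the corresponding hidden
// arguments in the code object v5 kernel ABI.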
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};
/// Print the error code and exit if \p code indicates an error.
static void handle_error_impl(const char *file, int32_t line,
                              hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
  exit(EXIT_FAILURE);
}
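// Call sites below use the handle_error(X) convenience macro (presumably from
// the shared "Loader.h" header), which forwards __FILE__ and __LINE__ so the
// report points at the failing call.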
/// Generic interface for iterating using the HSA callbacks.
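///
/// The user's capturing callback is smuggled through the C-style 'void *data'
/// parameter; a captureless lambda (convertible to a plain function pointer)
/// unwraps and invokes it for each element.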
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}
/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}
/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}
/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}
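/// Returns the first agent whose device type matches \p flag. Returning
/// HSA_STATUS_INFO_BREAK from the callback stops the iteration early without
/// signaling an error, which is why handle_error_impl treats it as success.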
template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}
void print_kernel_resources(const char *kernel_name) {
  fprintf(stderr, "Printing kernel resources on AMDGPU is not supported yet.\n");
}
/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}
template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc::Server &server,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args,
                           uint32_t wavefront_size, bool print_resource_usage) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;
  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));
  // Initialize the necessary implicit arguments to the proper values.
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;
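  // Note: this assumes the implicit arguments live in the kernarg segment
  // directly after the explicit ones and that sizeof(args_t) already satisfies
  // the alignment the ABI expects for them.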
  // Obtain a packet from the queue, spinning until a slot is free.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;
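  // The queue size is a power of two, so masking the monotonically increasing
  // packet id with 'size - 1' wraps it into the queue's ring buffer.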
  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);
  // Set up the packet for execution on the device using the provided launch
  // parameters.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);
  if (print_resource_usage)
    print_kernel_resources(kernel_name);
  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
  uint16_t header =
      1u << HSA_PACKET_HEADER_BARRIER |
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
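  // The 16-bit header and setup fields are adjacent, so one 32-bit release
  // store publishes the fully initialized packet atomically; the packet
  // processor can never observe a valid header with stale payload fields.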
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
  std::atomic<bool> finished = false;
  std::thread server_thread(
      [](std::atomic<bool> *finished, rpc::Server *server,
         uint32_t wavefront_size, hsa_agent_t dev_agent,
         hsa_amd_memory_pool_t coarsegrained_pool) {
        // Register RPC callbacks for the malloc and free functions on HSA.
        auto malloc_handler = [&](size_t size) -> void * {
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
                                               /*flags=*/0, &dev_ptr))
            dev_ptr = nullptr;
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          return dev_ptr;
        };

        auto free_handler = [](void *ptr) -> void {
          if (hsa_status_t err =
                  hsa_amd_memory_pool_free(reinterpret_cast<void *>(ptr)))
            handle_error(err);
        };

        uint32_t index = 0;
        while (!*finished) {
          if (wavefront_size == 32)
            index =
                handle_server<32>(*server, index, malloc_handler, free_handler);
          else
            index =
                handle_server<64>(*server, index, malloc_handler, free_handler);
        }
      },
      &finished, &server, wavefront_size, dev_agent, coarsegrained_pool);
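  // Note: handle_server is instantiated for both wavefront sizes because the
  // RPC protocol's per-port layout depends on the number of lanes in the
  // device's wavefront, which is only known at runtime.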
  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (hsa_signal_wait_scacquire(packet->completion_signal,
                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                   HSA_WAIT_STATE_BLOCKED) != 0)
    ;

  finished = true;
  if (server_thread.joinable())
    server_thread.join();
  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}
/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
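/// The asynchronous copy decrements its completion signal when the transfer
/// finishes, so waiting for the signal to reach zero makes the copy
/// synchronous.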
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}
int load(int argc, const char **argv, const char **envp, void *image,
         size_t size, const LaunchParameters &params,
         bool print_resource_usage) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);
  // Obtain a single agent for the device and host to use the HSA memory model.
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);
  // Load the code object's ISA information and executable data segments.
  hsa_code_object_reader_t reader;
  if (hsa_status_t err =
          hsa_code_object_reader_create_from_memory(image, size, &reader))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  hsa_loaded_code_object_t object;
  if (hsa_status_t err = hsa_executable_load_agent_code_object(
          executable, dev_agent, reader, "", &object))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);
  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object, it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  if (hsa_status_t err = hsa_code_object_reader_destroy(reader))
    handle_error(err);
  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);
  // The AMDGPU target can change its wavefront size. There currently isn't a
  // good way to look this up through the HSA API, so we use the LLVM interface.
  uint16_t abi_version;
  llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
  llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
  if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
          llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
    handle_error(llvm::toString(std::move(err)).c_str());
  }
  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");
  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");
  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
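  // hsa_amd_memory_fill writes 'count' 32-bit words, so a count of one zeroes
  // the whole int.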
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);
  // Use the largest wavefront size of any kernel in the image when sizing the
  // shared RPC state, since the server must be able to service every kernel.
  uint32_t wavefront_size =
      llvm::max_element(info_map, [](auto &&x, auto &&y) {
        return x.second.WavefrontSize < y.second.WavefrontSize;
      })->second.WavefrontSize;
  // Set up the RPC server.
  void *rpc_buffer;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
          finegrained_pool,
          rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT),
          /*flags=*/0, &rpc_buffer))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer);

  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
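  // The server (host side) and client (device side) views are constructed
  // over the same shared buffer; the client object is only staged here so it
  // can be copied into the device's '__llvm_rpc_client' symbol below.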
  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, "__llvm_rpc_client", &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);
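  // Locking pins the host-side client object in memory so the DMA engine can
  // read it during the copy to the device.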
  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(&client, sizeof(rpc::Client),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);
  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(rpc_client_dev, dev_agent, rpc_client_buffer, host_agent,
                     sizeof(rpc::Client)))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(&client))
    handle_error(err);
  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. If the
  // clock_freq symbol is missing, there is no work to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {
    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {
      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }
  // Obtain a queue with the maximum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);
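  // Kernels are looked up by the name of their kernel descriptor, which the
  // AMDHSA code object exposes with a '.kd' suffix.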
  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_begin.kd", init_args,
          info_map["_begin"].WavefrontSize, print_resource_usage))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
          print_resource_usage))
    handle_error(err);
  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);
  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_end.kd", fini_args,
          info_map["_end"].WavefrontSize, print_resource_usage))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}