//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application's start code and should ultimately call
// the 'main' function.
//
//===----------------------------------------------------------------------===//
16 #include "Loader.h"
18 #if defined(__has_include)
19 #if __has_include("hsa/hsa.h")
20 #include "hsa/hsa.h"
21 #include "hsa/hsa_ext_amd.h"
22 #elif __has_include("hsa.h")
23 #include "hsa.h"
24 #include "hsa_ext_amd.h"
25 #endif
26 #else
27 #include "hsa/hsa.h"
28 #include "hsa/hsa_ext_amd.h"
29 #endif

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <tuple>
#include <utility>

/// Print the error code and exit if \p code indicates an error.
static void handle_error(hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s\n", desc);
  exit(EXIT_FAILURE);
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}

/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}

/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}
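
/// Returns the first agent that matches the requested device type. A GPU
/// agent is only accepted if it supports kernel dispatch packets.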
template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}

/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}
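
/// Launches the kernel \p kernel_name from \p executable on the device's HSA
/// queue, passing \p kernel_args through the kernarg segment, and services
/// RPC requests from the device until the kernel signals completion.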
template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args) {
  // Look up the requested kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Register RPC callbacks for the malloc and free functions on HSA.
  uint32_t device_id = 0;
  auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
  rpc_register_callback(
      device_id, RPC_MALLOC,
      [](rpc_port_t port, void *data) {
        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
          auto &[dev_agent, pool] = *static_cast<decltype(tuple) *>(data);
          uint64_t size = buffer->data[0];
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(pool, size,
                                               /*flags=*/0, &dev_ptr))
            handle_error(err);
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr);
        };
        rpc_recv_and_send(port, malloc_handler, data);
      },
      &tuple);
  rpc_register_callback(
      device_id, RPC_FREE,
      [](rpc_port_t port, void *data) {
        auto free_handler = [](rpc_buffer_t *buffer, void *) {
          if (hsa_status_t err = hsa_amd_memory_pool_free(
                  reinterpret_cast<void *>(buffer->data[0])))
            handle_error(err);
        };
        rpc_recv_and_send(port, free_handler, data);
      },
      nullptr);

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));

  // Obtain a packet from the queue.
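  // The queue is a power-of-two ring buffer of AQL packets: reserve a slot by
  // bumping the write index, then spin until the packet processor's read
  // index indicates the slot is free.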
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device. We currently only launch
  // with one thread on the device, forcing the rest of the wavefront to be
  // masked off.
  uint16_t setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
                    (params.num_blocks_z * params.num_threads_z != 1))
                   << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
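  // 'setup' encodes the dispatch dimensionality: the launch is 1D unless the
  // Y or Z extents of the grid are greater than one.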
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size = private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
  uint16_t header =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
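  // The 16-bit header and setup words are adjacent in the packet, so they are
  // published together with a single 32-bit atomic release store before
  // ringing the doorbell.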
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (hsa_signal_wait_scacquire(
             packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
             /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
    if (rpc_status_t err = rpc_handle_server(device_id))
      handle_error(err);

  // Handle the server one more time in case the kernel exited with a pending
  // send still in flight.
  if (rpc_status_t err = rpc_handle_server(device_id))
    handle_error(err);

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}

/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}
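
/// Loads the provided AMDHSA image, initializes the HSA runtime and the RPC
/// server, launches the '_begin', '_start', and '_end' kernels in order, and
/// returns the exit code reported by the device.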
int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory model.
  uint32_t num_devices = 1;
  uint32_t device_id = 0;
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_t object;
  if (hsa_status_t err = hsa_code_object_deserialize(image, size, "", &object))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  if (hsa_status_t err =
          hsa_executable_load_code_object(executable, dev_agent, object, ""))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);
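
  // 'dev_ret' is passed to the '_start' kernel and read back after it
  // completes to recover the program's return value.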

  // Allocate fine-grained memory for the RPC server and client to share.
  uint32_t wavefront_size = 0;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
    handle_error(err);
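  // The wavefront size is passed to the RPC server below so its shared state
  // matches the device's execution width.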

  // Set up the RPC server.
  if (rpc_status_t err = rpc_init(num_devices))
    handle_error(err);
  auto tuple = std::make_tuple(dev_agent, finegrained_pool);
  auto rpc_alloc = [](uint64_t size, void *data) {
    auto &[dev_agent, finegrained_pool] = *static_cast<decltype(tuple) *>(data);
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  if (rpc_status_t err = rpc_server_init(device_id, RPC_MAXIMUM_PORT_COUNT,
                                         wavefront_size, rpc_alloc, &tuple))
    handle_error(err);

  // Register callbacks for the RPC unit tests.
  if (wavefront_size == 32)
    register_rpc_callbacks<32>(device_id);
  else if (wavefront_size == 64)
    register_rpc_callbacks<64>(device_id);
  else
    handle_error("Invalid wavefront size");

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_host;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(void *),
                                       /*flags=*/0, &rpc_client_host))
    handle_error(err);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  // Copy the address of the client buffer from the device to the host.
  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
                                    dev_agent, sizeof(void *)))
    handle_error(err);
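
  // 'rpc_client_host' now holds the device-side pointer read from the symbol,
  // i.e. the address of the RPC client object in device memory.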
  void *rpc_client_buffer;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
          coarsegrained_pool, rpc_get_client_size(),
          /*flags=*/0, &rpc_client_buffer))
    handle_error(err);
  std::memcpy(rpc_client_buffer, rpc_get_client_buffer(device_id),
              rpc_get_client_size());

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
                     rpc_client_buffer, host_agent, rpc_get_client_size()))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_buffer))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. If the
  // clock_freq symbol is missing, there is no work to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {

    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (hsa_status_t err =
            hsa_agent_get_info(dev_agent,
                               static_cast<hsa_agent_info_t>(
                                   HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                               host_clock_freq))
      handle_error(err);

    void *freq_addr;
    if (hsa_status_t err = hsa_executable_symbol_get_info(
            freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &freq_addr))
      handle_error(err);

    if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                      host_agent, sizeof(uint64_t)))
      handle_error(err);
  }

  // Obtain a queue with the minimum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);
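
  // Launch '_begin' single-threaded to initialize the device environment,
  // then '_start' with the requested launch parameters, and finally '_end'
  // with the value returned from the device.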
  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          single_threaded_params, "_begin.kd", init_args))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err =
          launch_kernel(dev_agent, executable, kernargs_pool,
                        coarsegrained_pool, queue, params, "_start.kd", args))
    handle_error(err);

  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          single_threaded_params, "_end.kd", fini_args))
    handle_error(err);

  if (rpc_status_t err = rpc_server_shutdown(
          device_id, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); },
          nullptr))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_code_object_destroy(object))
    handle_error(err);

  if (rpc_status_t err = rpc_shutdown())
    handle_error(err);
  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}