//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application start code and will ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//
19 #include "hsa/hsa_ext_amd.h"
21 #include "llvm/Frontend/Offloading/Utility.h"
// The implicit arguments of COV5 AMDGPU kernels.
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};
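
// Sanity check on the layout above: the COV5 implicit-argument block is 256
// bytes with 'grid_dims' at byte offset 64. The padding sizes are this
// loader's assumption about the metadata layout, not something queried from
// the HSA runtime, so verify them at compile time.
static_assert(sizeof(implicit_args_t) == 256,
              "implicit arguments block should be 256 bytes");
static_assert(offsetof(implicit_args_t, grid_dims) == 64,
              "'grid_dims' should sit at byte offset 64");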
/// Print the error code and exit if \p code indicates an error.
static void handle_error_impl(const char *file, int32_t line,
                              hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
  exit(EXIT_FAILURE);
}
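
// Call sites in this file pass just the status to 'handle_error', relying on
// a forwarding macro to capture the source location. A minimal sketch of that
// macro, assuming the shared loader header does not already provide it:
#ifndef handle_error
#define handle_error(X) handle_error_impl(__FILE__, __LINE__, X)
#endif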
/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}
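
// The wrapper lets a stateful C++ lambda pass through the C-style HSA
// iterator interface: the capture-less lambda 'l' decays to a function
// pointer, and the real callback travels through the 'void *data' slot.
// An illustrative use:
//
//   size_t num_agents = 0;
//   iterate<hsa_agent_t>(hsa_iterate_agents, [&](hsa_agent_t) {
//     ++num_agents;
//     return HSA_STATUS_SUCCESS;
//   });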
/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}
/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}
/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}
/// Retrieve the first agent whose device type matches \p flag.
template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}
void print_kernel_resources(const char *kernel_name) {
  fprintf(stderr, "Printing kernel resources is not supported on AMDGPU yet.\n");
}
/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}
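
// Illustrative use, matching the lookups performed in 'load' below: find a
// fine-grained pool on the host agent for pinned staging buffers.
//
//   hsa_amd_memory_pool_t pool;
//   if (hsa_status_t err =
//           get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
//               host_agent, &pool))
//     handle_error(err);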
template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc::Server &server,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args,
                           uint32_t wavefront_size, bool print_resource_usage) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;
  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));
  // Initialize the necessary implicit arguments to the proper values.
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;
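
  // The implicit block lives directly after the explicit arguments in the
  // kernarg allocation, which is why the cast above offsets by sizeof(args_t).
  // A dimension only counts toward 'grid_dims' when its total extent
  // (blocks * threads) exceeds one, so e.g. an 8x1x1 launch still reports a
  // single dimension.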
  // Obtain a packet from the queue.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);
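
  // The queue is a ring buffer whose size is a power of two, so the packet
  // slot is the write index masked by (size - 1). The spin above waits until
  // the packet processor has consumed enough packets for the reserved slot to
  // be free.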
  // Set up the packet for execution on the device. The workgroup and grid
  // sizes are taken from the requested launch parameters.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  if (print_resource_usage)
    print_kernel_resources(kernel_name);
  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
  uint16_t header =
      1u << HSA_PACKET_HEADER_BARRIER |
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
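
  // The 32-bit word written above packs the 16-bit header with the 16-bit
  // setup field (header | setup << 16). Publishing both with a single release
  // store guarantees the packet contents are visible before the header type
  // changes from INVALID, at which point the packet processor may consume it;
  // the doorbell then tells it how far the write index has advanced.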
  // Start a thread servicing the RPC server while the kernel is running.
  std::atomic<bool> finished = false;
  std::thread server_thread(
      [](std::atomic<bool> *finished, rpc::Server *server,
         uint32_t wavefront_size, hsa_agent_t dev_agent,
         hsa_amd_memory_pool_t coarsegrained_pool) {
        // Register RPC callbacks for the malloc and free functions on HSA.
        auto malloc_handler = [&](size_t size) -> void * {
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
                                               /*flags=*/0, &dev_ptr))
            dev_ptr = nullptr;
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          return dev_ptr;
        };

        auto free_handler = [](void *ptr) -> void {
          if (hsa_status_t err =
                  hsa_amd_memory_pool_free(reinterpret_cast<void *>(ptr)))
            handle_error(err);
        };

        uint32_t index = 0;
        while (!*finished) {
          if (wavefront_size == 32)
            index =
                handle_server<32>(*server, index, malloc_handler, free_handler);
          else
            index =
                handle_server<64>(*server, index, malloc_handler, free_handler);
        }
      },
      &finished, &server, wavefront_size, dev_agent, coarsegrained_pool);
  // Wait until the kernel has completed execution on the device. The helper
  // thread above services RPC requests from the device in the meantime.
  while (hsa_signal_wait_scacquire(packet->completion_signal,
                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                   HSA_WAIT_STATE_BLOCKED) != 0)
    ;

  // Shut down the server thread now that the kernel has finished.
  finished = true;
  if (server_thread.joinable())
    server_thread.join();
  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}
/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}
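
// The copy follows the standard HSA completion-signal pattern: the signal is
// created with value 1, hsa_amd_memory_async_copy decrements it when the DMA
// transfer finishes, and the wait loop spins until it reads 0.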
int load(int argc, const char **argv, const char **envp, void *image,
         size_t size, const LaunchParameters &params,
         bool print_resource_usage) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          /*data=*/nullptr))
    handle_error(err);
  // Obtain a single agent for the device and host to use the HSA memory model.
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);
  // Load the code object's ISA information and executable data segments.
  hsa_code_object_reader_t reader;
  if (hsa_status_t err =
          hsa_code_object_reader_create_from_memory(image, size, &reader))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  hsa_loaded_code_object_t object;
  if (hsa_status_t err = hsa_executable_load_agent_code_object(
          executable, dev_agent, reader, "", &object))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  if (hsa_status_t err = hsa_code_object_reader_destroy(reader))
    handle_error(err);
  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);
  // The AMDGPU target can change its wavefront size. There currently isn't a
  // good way to look this up through the HSA API so we use the LLVM interface.
  uint16_t abi_version;
  llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
  llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
  if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
          llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
    handle_error(llvm::toString(std::move(err)).c_str());
  }
  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");
  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");
  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);
  // Allocate fine-grained memory for the RPC server and client to share.
  uint32_t wavefront_size =
      llvm::max_element(info_map, [](auto &&x, auto &&y) {
        return x.second.WavefrontSize < y.second.WavefrontSize;
      })->second.WavefrontSize;
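
  // The shared buffer must be large enough for whichever kernel uses the
  // widest wavefront, so the RPC state is sized for the maximum wavefront
  // found in the image's kernel metadata.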
  // Set up the RPC server.
  void *rpc_buffer;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
          finegrained_pool,
          rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT),
          /*flags=*/0, &rpc_buffer))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer);

  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
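
  // The server and client are two views over the same fine-grained buffer:
  // the host-side server polls it for requests while the device-side client,
  // copied onto the GPU below, writes requests into it.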
  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, "__llvm_rpc_client", &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(&client, sizeof(rpc::Client),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(rpc_client_dev, dev_agent, rpc_client_buffer, host_agent,
                     sizeof(rpc::Client)))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(&client))
    handle_error(err);
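
  // The lock/copy/unlock dance is needed because hsa_memcpy (above) performs
  // an asynchronous DMA copy whose source must be pinned; locking the
  // host-side client object yields an address the copy engine can read from.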
  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
  // If the clock_freq symbol is missing, there is no work to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {
    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {
      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }
  // Obtain a queue with the maximum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint32_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);
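
  // Run the three device-side entry points in order: '_begin' is expected to
  // set up the device runtime (e.g. global constructors), '_start' calls into
  // the application's 'main', and '_end' handles finalization. The begin and
  // end kernels launch with a 1x1x1 grid since they are single-threaded.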
  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_begin.kd", init_args,
          info_map["_begin"].WavefrontSize, print_resource_usage))
    handle_error(err);
  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
          print_resource_usage))
    handle_error(err);

  // Copy the return value back from the device to the host.
  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);
  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_end.kd", fini_args,
          info_map["_end"].WavefrontSize, print_resource_usage))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}