//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application's start code and will ultimately call
// the 'main' function.
//
//===----------------------------------------------------------------------===//
16 #include "Loader.h"
18 #include "hsa/hsa.h"
19 #include "hsa/hsa_ext_amd.h"
21 #include "llvm/Frontend/Offloading/Utility.h"
23 #include <atomic>
24 #include <cstdio>
25 #include <cstdlib>
26 #include <cstring>
27 #include <thread>
28 #include <tuple>
29 #include <utility>
// The implicit arguments of COV5 AMDGPU kernels.
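// The padding below places 'grid_dims' at byte offset 64 and makes the struct
// 256 bytes in total, mirroring the offsets of the corresponding hidden
// arguments in the code object v5 kernel ABI.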
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};
/// Print the error code and exit if \p code indicates an error.
static void handle_error_impl(const char *file, int32_t line,
                              hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc);
  exit(EXIT_FAILURE);
}
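// Call sites below use the handle_error(X) convenience macro (presumably from
// the shared "Loader.h" header), which forwards __FILE__ and __LINE__ so the
// report points at the failing call.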
/// Generic interface for iterating using the HSA callbacks.
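///
/// The user's capturing callback is smuggled through the C-style 'void *data'
/// parameter; a captureless lambda (convertible to a plain function pointer)
/// unwraps and invokes it for each element.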
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}
/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}
/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}
/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}
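/// Returns the first agent whose device type matches \p flag. Returning
/// HSA_STATUS_INFO_BREAK from the callback stops the iteration early without
/// signaling an error, which is why handle_error_impl treats it as success.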
template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}
void print_kernel_resources(const char *kernel_name) {
  fprintf(stderr, "Printing kernel resources on AMDGPU is not supported yet.\n");
}
/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}
template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, rpc::Server &server,
                           const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args,
                           uint32_t wavefront_size, bool print_resource_usage) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;
  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));
  // Initialize the necessary implicit arguments to the proper values.
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x;
  implicit_args->grid_size_y = params.num_blocks_y;
  implicit_args->grid_size_z = params.num_blocks_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;
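  // Note: this assumes the implicit arguments live in the kernarg segment
  // directly after the explicit ones and that sizeof(args_t) already satisfies
  // the alignment the ABI expects for them.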
  // Obtain a packet from the queue, spinning until a slot is free.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;
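  // The queue size is a power of two, so masking the monotonically increasing
  // packet id with 'size - 1' wraps it into the queue's ring buffer.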
  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);
  // Set up the packet for execution on the device using the provided launch
  // parameters.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);
  if (print_resource_usage)
    print_kernel_resources(kernel_name);
  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
  uint16_t header =
      1u << HSA_PACKET_HEADER_BARRIER |
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
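  // The 16-bit header and setup fields are adjacent, so one 32-bit release
  // store publishes the fully initialized packet atomically; the packet
  // processor can never observe a valid header with stale payload fields.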
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
  std::atomic<bool> finished = false;
  std::thread server_thread(
      [](std::atomic<bool> *finished, rpc::Server *server,
         uint32_t wavefront_size, hsa_agent_t dev_agent,
         hsa_amd_memory_pool_t coarsegrained_pool) {
        // Register RPC callbacks for the malloc and free functions on HSA.
        auto malloc_handler = [&](size_t size) -> void * {
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
                                               /*flags=*/0, &dev_ptr))
            dev_ptr = nullptr;
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          return dev_ptr;
        };

        auto free_handler = [](void *ptr) -> void {
          if (hsa_status_t err =
                  hsa_amd_memory_pool_free(reinterpret_cast<void *>(ptr)))
            handle_error(err);
        };

        uint32_t index = 0;
        while (!*finished) {
          if (wavefront_size == 32)
            index =
                handle_server<32>(*server, index, malloc_handler, free_handler);
          else
            index =
                handle_server<64>(*server, index, malloc_handler, free_handler);
        }
      },
      &finished, &server, wavefront_size, dev_agent, coarsegrained_pool);
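  // Note: handle_server is instantiated for both wavefront sizes because the
  // RPC protocol's per-port layout depends on the number of lanes in the
  // device's wavefront, which is only known at runtime.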
  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (hsa_signal_wait_scacquire(packet->completion_signal,
                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                   HSA_WAIT_STATE_BLOCKED) != 0)
    ;

  finished = true;
  if (server_thread.joinable())
    server_thread.join();
  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}
/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
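/// The asynchronous copy decrements its completion signal when the transfer
/// finishes, so waiting for the signal to reach zero makes the copy
/// synchronous.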
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}
int load(int argc, const char **argv, const char **envp, void *image,
         size_t size, const LaunchParameters &params,
         bool print_resource_usage) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);
  // Obtain a single agent for the device and host to use the HSA memory model.
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);
  // Load the code object's ISA information and executable data segments.
  hsa_code_object_reader_t reader;
  if (hsa_status_t err =
          hsa_code_object_reader_create_from_memory(image, size, &reader))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  hsa_loaded_code_object_t object;
  if (hsa_status_t err = hsa_executable_load_agent_code_object(
          executable, dev_agent, reader, "", &object))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);
  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object, it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  if (hsa_status_t err = hsa_code_object_reader_destroy(reader))
    handle_error(err);
  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);
  // The AMDGPU target can change its wavefront size. There currently isn't a
  // good way to look this up through the HSA API, so we use the LLVM interface.
  uint16_t abi_version;
  llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
  llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
  if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
          llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
    handle_error(llvm::toString(std::move(err)).c_str());
  }
  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");
  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");
  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
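  // hsa_amd_memory_fill writes 'count' 32-bit words, so a count of one zeroes
  // the whole int.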
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);
  // Use the largest wavefront size of any kernel in the image when sizing the
  // shared RPC state, since the server must be able to service every kernel.
  uint32_t wavefront_size =
      llvm::max_element(info_map, [](auto &&x, auto &&y) {
        return x.second.WavefrontSize < y.second.WavefrontSize;
      })->second.WavefrontSize;
  // Set up the RPC server.
  void *rpc_buffer;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
          finegrained_pool,
          rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT),
          /*flags=*/0, &rpc_buffer))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer);

  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
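  // The server (host side) and client (device side) views are constructed
  // over the same shared buffer; the client object is only staged here so it
  // can be copied into the device's '__llvm_rpc_client' symbol below.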
  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, "__llvm_rpc_client", &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);
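  // Locking pins the host-side client object in memory so the DMA engine can
  // read it during the copy to the device.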
  void *rpc_client_buffer;
  if (hsa_status_t err =
          hsa_amd_memory_lock(&client, sizeof(rpc::Client),
                              /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);
  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(rpc_client_dev, dev_agent, rpc_client_buffer, host_agent,
                     sizeof(rpc::Client)))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(&client))
    handle_error(err);
  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. If the
  // clock_freq symbol is missing, there is no work to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {
    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {
      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }
  // Obtain a queue with the maximum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);
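  // Kernels are looked up by the name of their kernel descriptor, which the
  // AMDHSA code object exposes with a '.kd' suffix.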
  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_begin.kd", init_args,
          info_map["_begin"].WavefrontSize, print_resource_usage))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
          print_resource_usage))
    handle_error(err);
  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);
  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          server, single_threaded_params, "_end.kd", fini_args,
          info_map["_end"].WavefrontSize, print_resource_usage))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}