//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel which should be provided
// by the device application start code and will ultimately call the 'main'
// function.
//
//===----------------------------------------------------------------------===//
#include "Loader.h"

#if defined(__has_include)
#if __has_include("hsa/hsa.h")
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#elif __has_include("hsa.h")
#include "hsa.h"
#include "hsa_ext_amd.h"
#endif
#else
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#endif

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <tuple>
#include <utility>

/// Print the error code and exit if \p code indicates an error.
static void handle_error(hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s\n", desc);
  exit(EXIT_FAILURE);
}
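
// HSA_STATUS_INFO_BREAK is deliberately treated as success here: the agent
// and memory-pool iterators below return it to stop iteration early once a
// match has been found.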

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}
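
// A captureless lambda decays to the plain C function pointer that the HSA
// iterator expects, while the stateful user callback travels through the
// 'void *data' argument.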

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}

/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}

/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}

/// Retrieve the first agent that matches the device type \p flag.
template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}

/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}
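
// Note that iteration continues after a match, so if several global pools
// carry the requested flag the last one reported by the runtime wins.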

template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Register RPC callbacks for the malloc and free functions on HSA.
  uint32_t device_id = 0;
  auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
  rpc_register_callback(
      device_id, RPC_MALLOC,
      [](rpc_port_t port, void *data) {
        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
          auto &[dev_agent, pool] = *static_cast<decltype(tuple) *>(data);
          uint64_t size = buffer->data[0];
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(pool, size,
                                               /*flags=*/0, &dev_ptr))
            handle_error(err);
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr);
        };
        rpc_recv_and_send(port, malloc_handler, data);
      },
      &tuple);
  rpc_register_callback(
      device_id, RPC_FREE,
      [](rpc_port_t port, void *data) {
        auto free_handler = [](rpc_buffer_t *buffer, void *) {
          if (hsa_status_t err = hsa_amd_memory_pool_free(
                  reinterpret_cast<void *>(buffer->data[0])))
            handle_error(err);
        };
        rpc_recv_and_send(port, free_handler, data);
      },
      nullptr);
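
  // These handlers run on the host inside rpc_handle_server whenever a device
  // thread opens the RPC_MALLOC or RPC_FREE port, so device-side heap calls
  // are backed by coarse-grained HSA allocations made here.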

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));
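
  // The kernarg segment size queried above includes any implicit arguments
  // appended after the explicit ones, which is why the whole buffer is zeroed
  // before only sizeof(args_t) bytes are overwritten.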

  // Obtain a packet from the queue.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;
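
  // HSA queue sizes are powers of two, so masking the monotonically
  // increasing packet id yields this packet's slot in the ring buffer.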
  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device. We currently only launch
  // with one thread on the device, forcing the rest of the wavefront to be
  // masked off.
  uint16_t setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
                    (params.num_blocks_z * params.num_threads_z != 1))
                   << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size = private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;

  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
  uint16_t header =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
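
  // The release store publishes the fully initialized packet before the
  // doorbell makes it visible to the packet processor; writing the header
  // last is what transfers ownership of the slot to the device.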

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (hsa_signal_wait_scacquire(
             packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
             /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
    if (rpc_status_t err = rpc_handle_server(device_id))
      handle_error(err);

  // Handle the server one more time in case the kernel exited with a pending
  // send still in flight.
  if (rpc_status_t err = rpc_handle_server(device_id))
    handle_error(err);

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}

/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}
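
// The asynchronous copy decrements the completion signal from one to zero
// once the DMA transfer finishes, so the spin above simply waits for the
// signal value to reach zero.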

int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory model.
  uint32_t num_devices = 1;
  uint32_t device_id = 0;
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_t object;
  if (hsa_status_t err = hsa_code_object_deserialize(image, size, "", &object))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  if (hsa_status_t err =
          hsa_executable_load_code_object(executable, dev_agent, object, ""))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);
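
  // Note that hsa_amd_memory_fill counts in 32-bit words, so a count of one
  // zeroes the entire sizeof(int) allocation.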

  // Allocate fine-grained memory for the RPC server and client to share.
  uint32_t wavefront_size = 0;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
    handle_error(err);

  // Set up the RPC server.
  if (rpc_status_t err = rpc_init(num_devices))
    handle_error(err);
  auto tuple = std::make_tuple(dev_agent, finegrained_pool);
  auto rpc_alloc = [](uint64_t size, void *data) {
    auto &[dev_agent, finegrained_pool] = *static_cast<decltype(tuple) *>(data);
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  if (rpc_status_t err = rpc_server_init(device_id, RPC_MAXIMUM_PORT_COUNT,
                                         wavefront_size, rpc_alloc, &tuple))
    handle_error(err);

  // Register callbacks for the RPC unit tests.
  if (wavefront_size == 32)
    register_rpc_callbacks<32>(device_id);
  else if (wavefront_size == 64)
    register_rpc_callbacks<64>(device_id);
  else
    handle_error("Invalid wavefront size");
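
  // The wavefront size is fixed by the GPU architecture (32 lanes on
  // RDNA-class devices, 64 on GCN and CDNA), and the RPC implementation is
  // templated on it, so the matching instantiation is selected at runtime.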

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_host;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(void *),
                                       /*flags=*/0, &rpc_client_host))
    handle_error(err);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  // Copy the address of the client buffer from the device to the host.
  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
                                    dev_agent, sizeof(void *)))
    handle_error(err);

  void *rpc_client_buffer;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
          coarsegrained_pool, rpc_get_client_size(),
          /*flags=*/0, &rpc_client_buffer))
    handle_error(err);
  std::memcpy(rpc_client_buffer, rpc_get_client_buffer(device_id),
              rpc_get_client_size());

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
                     rpc_client_buffer, host_agent, rpc_get_client_size()))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_buffer))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
    handle_error(err);
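
  // The double indirection above is deliberate: the symbol only holds a
  // pointer to the client, so the loader first reads that device pointer and
  // then writes the initialized client state to the memory it points at.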

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
  // If the clock_freq symbol is missing, no work to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {
    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (hsa_status_t err =
            hsa_agent_get_info(dev_agent,
                               static_cast<hsa_agent_info_t>(
                                   HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                               host_clock_freq))
      handle_error(err);

    void *freq_addr;
    if (hsa_status_t err = hsa_executable_symbol_get_info(
            freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &freq_addr))
      handle_error(err);

    if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                      host_agent, sizeof(uint64_t)))
      handle_error(err);
  }

  // Obtain a queue with the minimum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint32_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          single_threaded_params, "_begin.kd", init_args))
    handle_error(err);
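
  // '_begin.kd' is launched with a single thread, presumably so one-time
  // startup work such as running global constructors happens exactly once
  // before 'main' is entered.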

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err =
          launch_kernel(dev_agent, executable, kernargs_pool,
                        coarsegrained_pool, queue, params, "_start.kd", args))
    handle_error(err);

  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          single_threaded_params, "_end.kd", fini_args))
    handle_error(err);

  if (rpc_status_t err = rpc_server_shutdown(
          device_id, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); },
          nullptr))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_code_object_destroy(object))
    handle_error(err);

  if (rpc_status_t err = rpc_shutdown())
    handle_error(err);
  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}