/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2024 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* {{{ Includes and defines  */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <dlfcn.h>
#include <elf.h>
#include "libgomp-plugin.h"
#include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
#include "gomp-constants.h"
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>
/* These probably won't be in elf.h for a while.  */

#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
#endif
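/* The LO/HI relocation pairs express one 64-bit result split across two
   32-bit words: for a symbol address S and addend A, R_AMDGPU_ABS32_LO
   patches (S + A) & 0xFFFFFFFF and R_AMDGPU_ABS32_HI patches (S + A) >> 32,
   per the formulas above.  */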
/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif
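/* For example, the HSA_RUNTIME_LIB lookup in init_environment_variables
   below deliberately goes through secure_getenv, so that a SUID/SGID
   process cannot be pointed at an arbitrary runtime library through the
   environment.  */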
/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};
/* As an HSA runtime is dlopened, the following structure defines function
   pointers utilized by the HSA plug-in.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
				      hsa_isa_info_t attribute,
				      uint32_t index,
				      void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
					   const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback)(hsa_executable_t executable,
			      hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
  hsa_status_t (*hsa_amd_memory_lock_fn)
    (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
     void **agent_ptr);
  hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
  hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
    (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
     const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
     const hsa_dim3_t *range, hsa_agent_t copy_agent,
     hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
     const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
};
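/* Every HSA call in this file is made through these pointers, e.g.
   hsa_fns.hsa_init_fn () instead of hsa_init (), so that the plugin itself
   has no link-time dependency on the HSA runtime; the pointers are filled
   in by init_hsa_runtime_functions below via dlopen/dlsym.  */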
/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
/* Collection of information needed for a dispatch of a kernel.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};
/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs
{
  struct kernargs_abi abi;

  /* Output data.  */
  struct output output_data;
};
/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};
/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  struct goacc_asyncqueue *aq;
};
/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};
/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};
/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};
/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
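/* Descriptive sketch of the queue discipline: entries are pushed at one end
   of the circular QUEUE array and drained from the other by the
   THREAD_DRAIN_QUEUE thread, with QUEUE_COND_IN and QUEUE_COND_OUT used to
   block producers on a full queue and the drain thread on an empty one.
   The drain logic itself appears later in this file.  */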
/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};
/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};
/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned ind_func_count;
  const unsigned global_variable_count;
};
/* Enum values corresponding to the ELF architecture codes.
   Only 'special' values are actually referenced in this file, but having them
   all may aid debugging.  */

typedef enum {
  EF_AMDGPU_MACH_UNSUPPORTED = -1,
#define GCN_DEVICE(name, NAME, ELF, ...) \
  EF_AMDGPU_MACH_AMDGCN_ ## NAME = ELF,
#include "../../gcc/config/gcn/gcn-devices.def"
#undef GCN_DEVICE
} EF_AMDGPU_MACH;

const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;
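/* For instance, a gcn-devices.def entry of the form
     GCN_DEVICE (gfx900, GFX900, 0x2c, ...)
   expands here to "EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x2c," (illustrative
   values; the .def file is authoritative).  */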
/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated ephemeral memories (team arena and stack space).  */
  struct ephemeral_memories_list *ephemeral_memories_list;
  pthread_mutex_t ephemeral_memories_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this kernel.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;

  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};
/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module where the kernel takes place.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks an execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};
/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};
/* A linked list of memory arenas allocated on the device.
   These are used by OpenMP, as a means to optimize per-team malloc,
   and for host-accessible stack space.  */

struct ephemeral_memories_list
{
  struct ephemeral_memories_list *next;

  /* The size is determined by the number of teams and threads.  */
  size_t size;
  /* The device address of the allocated memory.  */
  void *address;
  /* A flag to prevent two asynchronous kernels trying to use the same memory.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};
/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};
/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Ephemeral memory sizes for each kernel launch.  */

static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
static int stack_size = DEFAULT_GCN_STACK_SIZE;
static int lowlat_size = -1;

/* Flag to decide whether to print to stderr information about what is going
   on.  Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Flag to locate HSA runtime shared library that is dlopened
   by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should support also CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;
/* }}}  */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_PRINT(...) \
  do \
    { \
      if (debug) \
	fprintf (stderr, __VA_ARGS__); \
    } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH() \
  do \
    { \
      if (debug) \
	fflush (stderr); \
    } \
  while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
    } \
  while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except only report error message, and return FALSE
   for propagating error processing to outside of plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}
/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}
/* Dump information about the available hardware.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}
/* Dump information about the available hardware.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}
/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  if (segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}
/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}
/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}
/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}
/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}
/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*)kernargs->abi.heap_ptr);
  fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
	   "", (void*)kernargs->abi.arena_ptr,
	   kernargs->abi.arena_size_per_team);
  fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
	   "", (void*)kernargs->abi.stack_ptr,
	   kernargs->abi.stack_size_per_thread);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
	   "", dispatch->group_segment_size);
  fprintf (stderr, "\n");
}
/* }}}  */
/* {{{ Utility functions  */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}
/* Initialize debug and suppress_host_fallback according to the environment.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = "libhsa-runtime64.so.1";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }

  const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
  if (arena)
    {
      int tmp = atoi (arena);
      if (tmp)
	team_arena_size = tmp;
    }

  const char *stack = secure_getenv ("GCN_STACK_SIZE");
  if (stack)
    {
      int tmp = atoi (stack);
      if (tmp)
	stack_size = tmp;
    }

  const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
  if (lowlat)
    lowlat_size = atoi (lowlat);
}
/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}
/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}
/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}
/* This sets the maximum number of teams to twice the number of GPU Compute
   Units to avoid memory waste and corresponding memory access faults.  */

static int
limit_teams (int teams, struct agent_info *agent)
{
  int max_teams = 2 * get_cu_count (agent);
  if (teams > max_teams)
    teams = max_teams;
  return teams;
}
/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If INPUT is NULL, fill DEF with default
   values, then store INPUT or DEF into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = limit_teams (val, agent);
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      bool gfx900_workaround_p = false;

      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gfx900_workaround_p = true;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to at most 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      /* Ideally, when a dimension isn't explicitly specified, we should
	 tune it to run 40 (or 32?) threads per CU with no threads getting
	 queued.  In practice, we tune for peak performance on BabelStream,
	 which for OpenACC is currently 32 threads per CU.  */
      if (gcn_teams <= 0 && gcn_threads <= 0)
	{
	  /* Set up a reasonable number of teams and threads.  */
	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams <= 0 && gcn_threads > 0)
	{
	  /* Auto-scale the number of teams with the number of threads.  */
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams > 0 && gcn_threads <= 0)
	{
	  int max_threads = gfx900_workaround_p ? 4 : 16;

	  /* Auto-scale the number of threads with the number of teams.  */
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
	  if (def->gdims[2] == 0)
	    def->gdims[2] = 1;
	  else if (def->gdims[2] > max_threads)
	    def->gdims[2] = max_threads;
	}
      else
	{
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = gcn_threads;
	}
      def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
      def->wdims[0] = 1;  /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;

  /* Cannot run with zero-sized dimensions.  */
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }

  *result = kla;
  return true;
}
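/* For illustration: an OpenMP launch with "num_teams(8) thread_limit(4)"
   arrives as GOMP_TARGET_ARG_* descriptors.  A small value is packed into
   the identifier word itself and recovered with
   "val = id >> GOMP_TARGET_ARG_VALUE_SHIFT"; a large value instead sets
   GOMP_TARGET_ARG_SUBSEQUENT_PARAM and is read from the following word.
   Both forms are handled by the decode loop above.  */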
/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else
	group = 4;
    }
  if (group > grid)
    group = grid;
  return group;
}
/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
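/* The release ordering matters: the launch code writes all other fields of
   an AQL packet first, then publishes the packet to the GPU by storing the
   header and setup words with this single atomic release store, so the
   hardware packet processor can never observe a half-written packet.  */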
/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}
/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    GOMP_PLUGIN_fatal ("'%s' is missing '%s'", hsa_runtime_lib, #function);
#define DLSYM_OPT_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function);

  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  DLSYM_OPT_FN (hsa_amd_memory_lock)
  DLSYM_OPT_FN (hsa_amd_memory_unlock)
  DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
  return true;
#undef DLSYM_OPT_FN
#undef DLSYM_FN
}
static gcn_isa isa_code (const char *isa);
/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      {
	char name[64];
	hsa_status_t status
	  = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, name);
	if (status != HSA_STATUS_SUCCESS
	    || isa_code (name) == EF_AMDGPU_MACH_UNSUPPORTED)
	  {
	    GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
		       status == HSA_STATUS_SUCCESS ? name : "invalid");
	    return false;
	  }
      }
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}
/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}
/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  The index of the
   structure is pointed to by DATA, increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}
/* Initialize hsa_context if it has not already been done.
   If !PROBE: returns TRUE on success.
   If PROBE: returns TRUE on success or if the plugin/device shall be silently
   ignored, and otherwise emits an error and returns FALSE.  */

static bool
init_hsa_context (bool probe)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      const char *msg = "Run-time could not be dynamically opened";
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("%s\n", msg);
      else
	GCN_WARNING ("%s\n", msg);
      return probe ? true : false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Scanning compute agents failed", status);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  uint16_t minor, major;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
					   &minor);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
					   &major);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");

  size_t len = sizeof hsa_context.driver_version_s;
  int printed = snprintf (hsa_context.driver_version_s, len,
			  "HSA Runtime %hu.%hu", (unsigned short int)major,
			  (unsigned short int)minor);
  if (printed >= len)
    GCN_WARNING ("HSA runtime version string was truncated. "
		 "Version %hu.%hu is too long.", (unsigned short int)major,
		 (unsigned short int)minor);

  hsa_context.initialized = true;
  return true;
}
/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}
/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}
/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   arenas.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}
/* Returns the GCN ISA field of an ELF image's e_flags.  */

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}
/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   support the ISA.  */

static const char*
isa_name (int isa) {
  switch (isa)
    {
#define GCN_DEVICE(name, NAME, ELF, ...) \
    case ELF: return #name;
#include "../../gcc/config/gcn/gcn-devices.def"
#undef GCN_DEVICE
    }
  return NULL;
}
/* Returns the code which is used in the GCN object code to identify the ISA
   with the given name (as used by the HSA runtime).  */

static gcn_isa
isa_code (const char *isa) {
#define GCN_DEVICE(name, NAME, ELF, ...) \
  if (!strcmp (isa, #name)) return ELF;
#include "../../gcc/config/gcn/gcn-devices.def"
#undef GCN_DEVICE

  return EF_AMDGPU_MACH_UNSUPPORTED;
}
/* CDNA2 devices have twice as many VGPRs compared to older devices.  */

static int
max_isa_vgprs (int isa)
{
  switch (isa)
    {
#define GCN_DEVICE(name, NAME, ELF, ISA, XNACK, SRAM, WAVE64, CU, \
		   MAX_ISA_VGPRS, ...) \
    case ELF: return MAX_ISA_VGPRS;
#include "../../gcc/config/gcn/gcn-devices.def"
#undef GCN_DEVICE
    }
  GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
}
/* }}}  */
/* {{{ Run  */

/* Create or reuse a team arena and stack space.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   The stack space is used by all kernels.  We must allocate it in such a
   way that the reverse offload implementation can access the data.

   Allocating this memory costs performance, so this function will reuse an
   existing allocation if a large enough one is idle.
   The memory lock is released, but not deallocated, when the kernel exits.  */

static void
configure_ephemeral_memories (struct kernel_info *kernel,
			      struct kernargs_abi *kernargs, int num_teams,
			      int num_threads)
{
  struct agent_info *agent = kernel->agent;
  struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
  struct ephemeral_memories_list *item;

  int actual_arena_size = (kernel->kind == KIND_OPENMP
			   ? team_arena_size : 0);
  int actual_arena_total_size = actual_arena_size * num_teams;
  size_t size = (actual_arena_total_size
		 + num_teams * num_threads * stack_size);

  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->size < size)
	continue;

      if (pthread_mutex_trylock (&item->in_use) == 0)
	break;
    }

  if (!item)
    {
      GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
		 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
		 num_teams, num_threads, size);

      if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
	{
	  GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
	  return;
	}
      item = malloc (sizeof (*item));
      item->size = size;
      item->next = NULL;
      *next_ptr = item;

      if (pthread_mutex_init (&item->in_use, NULL))
	{
	  GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
	  return;
	}
      if (pthread_mutex_lock (&item->in_use))
	{
	  GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
	  return;
	}
      if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
	{
	  GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
	  return;
	}

      hsa_status_t status;
      status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
					       &item->address);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
      status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
						   HSA_ACCESS_PERMISSION_RW);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not assign arena & stack memory to device", status);
    }

  kernargs->arena_ptr = (actual_arena_total_size
			 ? (uint64_t)item->address
			 : 0);
  kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
  kernargs->arena_size_per_team = actual_arena_size;
  kernargs->stack_size_per_thread = stack_size;
}
/* Mark an ephemeral memory space available for reuse.  */

static void
release_ephemeral_memories (struct agent_info* agent, void *address)
{
  struct ephemeral_memories_list *item;

  for (item = agent->ephemeral_memories_list; item; item = item->next)
    {
      if (item->address == address)
	{
	  if (pthread_mutex_unlock (&item->in_use))
	    GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
	  return;
	}
    }
  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
}
/* Clean up all the allocated team arenas.  */

static bool
destroy_ephemeral_memories (struct agent_info *agent)
{
  struct ephemeral_memories_list *item, *next;

  for (item = agent->ephemeral_memories_list; item; item = next)
    {
      next = item->next;
      hsa_fns.hsa_memory_free_fn (item->address);
      if (pthread_mutex_destroy (&item->in_use))
	{
	  GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
	  return false;
	}
      free (item);
    }
  agent->ephemeral_memories_list = NULL;

  return true;
}
/* Allocate memory on a specified device.  */

static void *
alloc_by_agent (struct agent_info *agent, size_t size)
{
  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);

  void *ptr;
  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
							size, &ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not allocate device memory", status);
      return NULL;
    }

  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not assign data memory to device", status);
      return NULL;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_alloc;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = size;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return ptr;
}
/* Create kernel dispatch data structure for given KERNEL, along with
   the necessary device signals and memory allocations.  */

static struct kernel_dispatch *
create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
			int num_threads)
{
  struct agent_info *agent = kernel->agent;
  struct kernel_dispatch *shadow
    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));

  shadow->agent = kernel->agent;
  shadow->object = kernel->object;

  hsa_signal_t sync_signal;
  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating the GCN sync signal", status);

  shadow->signal = sync_signal.handle;
  shadow->private_segment_size = kernel->private_segment_size;

  if (lowlat_size < 0)
    {
      /* Divide the LDS between the number of running teams.
	 Allocate not less than is defined in the kernel metadata.  */
      int teams_per_cu = num_teams / get_cu_count (agent);
      int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
      shadow->group_segment_size
	= (kernel->group_segment_size > LDS_per_team
	   ? kernel->group_segment_size
	   : LDS_per_team);
    }
  else if (lowlat_size < GCN_LOWLAT_HEAP+8)
    /* Ensure that there's space for the OpenMP libgomp data.  */
    shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
  else
    shadow->group_segment_size = (lowlat_size > 65536
				  ? 65536
				  : lowlat_size);

  /* We expect kernels to request a single pointer, explicitly, and the
     rest of struct kernargs, implicitly.  If they request anything else
     then something is wrong.  */
  if (kernel->kernarg_segment_size > 8)
    {
      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
      return NULL;
    }

  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
					   sizeof (struct kernargs),
					   &shadow->kernarg_address);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
  struct kernargs *kernargs = shadow->kernarg_address;

  /* Zero-initialize the output_data (minimum needed).  */
  kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
  kernargs->output_data.next_output = 0;
  for (unsigned i = 0;
       i < (sizeof (kernargs->output_data.queue)
	    / sizeof (kernargs->output_data.queue[0]));
       i++)
    kernargs->output_data.queue[i].written = 0;
  kernargs->output_data.consumed = 0;

  /* Pass in the heap location.  */
  kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;

  /* Create the ephemeral memory spaces.  */
  configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);

  /* Ensure we can recognize unset return values.  */
  kernargs->output_data.return_value = 0xcafe0000;

  return shadow;
}
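/* Worked example for the LDS split above, assuming GOMP_GCN_LOWLAT_POOL is
   unset (lowlat_size == -1): launching 8 teams on a 4-CU agent gives
   teams_per_cu = 2, so each team may use 65536 / 2 = 32768 bytes of LDS,
   unless the kernel metadata already demands more.  */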
/* Handle a reverse-offload request from the device: run host function FN
   via libgomp.  */

static void
process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
			 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
{
  int dev_num = dev_num64;
  GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
			  NULL);
}
/* Output any data written to console output from the kernel.  It is expected
   that this function is polled during kernel execution.

   We print all entries from the last item printed to the next entry without
   a "written" flag.  If the "final" flag is set then it'll continue right to
   the end.

   The print buffer is circular, but the from and to locations don't wrap when
   the buffer does, so the output limit is UINT_MAX.  The target blocks on
   output when the buffer is full.  */

static void
console_output (struct kernel_info *kernel, struct kernargs *kernargs,
		bool final)
{
  unsigned int limit = (sizeof (kernargs->output_data.queue)
			/ sizeof (kernargs->output_data.queue[0]));

  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
				       __ATOMIC_ACQUIRE);
  unsigned int to = kernargs->output_data.next_output;

  if (from > to)
    {
      /* Overflow.  */
      if (final)
	printf ("GCN print buffer overflowed.\n");
      return;
    }

  unsigned int i;
  for (i = from; i < to; i++)
    {
      struct printf_data *data = &kernargs->output_data.queue[i%limit];

      if (!data->written && !final)
	break;

      switch (data->type)
	{
	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
	case 4:
	  process_reverse_offload (data->value_u64[0], data->value_u64[1],
				   data->value_u64[2], data->value_u64[3],
				   data->value_u64[4], data->value_u64[5]);
	  break;
	default: printf ("GCN print buffer error!\n"); break;
	}
      data->written = 0;
      __atomic_store_n (&kernargs->output_data.consumed, i+1,
			__ATOMIC_RELEASE);
    }
  fflush (stdout);
}
/* Release data structure created for a kernel dispatch in SHADOW argument,
   and clean up the signal and memory allocations.  */

static void
release_kernel_dispatch (struct kernel_dispatch *shadow)
{
  GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);

  struct kernargs *kernargs = shadow->kernarg_address;
  void *addr = (void *)kernargs->abi.arena_ptr;
  if (!addr)
    addr = (void *)kernargs->abi.stack_ptr;
  release_ephemeral_memories (shadow->agent, addr);

  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);

  hsa_signal_t s;
  s.handle = shadow->signal;
  hsa_fns.hsa_signal_destroy_fn (s);

  free (shadow);
}
/* Extract the properties from a kernel binary.  */

static void
init_kernel_properties (struct kernel_info *kernel)
{
  hsa_status_t status;
  struct agent_info *agent = kernel->agent;
  hsa_executable_symbol_t kernel_symbol;
  char *buf = alloca (strlen (kernel->name) + 4);
  sprintf (buf, "%s.kd", kernel->name);
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                 buf, agent->id, 0,
                                                 &kernel_symbol);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_warn ("Could not find symbol for kernel in the code object", status);
      fprintf (stderr, "not found name: '%s'\n", buf);
      dump_executable_symbols (agent->executable);
      goto failure;
    }
  GCN_DEBUG ("Located kernel %s\n", kernel->name);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not extract a kernel object from its symbol", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
     &kernel->kernarg_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel argument size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
     &kernel->group_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel group segment size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
     &kernel->private_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel private segment size",
               status);

  /* The kernel type is not known until something tries to launch it.  */
  kernel->kind = KIND_UNKNOWN;

  GCN_DEBUG ("Kernel structure for %s fully initialized with "
             "following segment sizes: \n", kernel->name);
  GCN_DEBUG ("  group_segment_size: %u\n",
             (unsigned) kernel->group_segment_size);
  GCN_DEBUG ("  private_segment_size: %u\n",
             (unsigned) kernel->private_segment_size);
  GCN_DEBUG ("  kernarg_segment_size: %u\n",
             (unsigned) kernel->kernarg_segment_size);
  return;

failure:
  kernel->initialization_failed = true;
}
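/* Note for reference: with the HSA code object ABI, each kernel "foo" is
   accompanied by a kernel descriptor symbol "foo.kd"; that is why the lookup
   above appends ".kd" to the kernel name before querying the executable for
   the dispatch object and segment sizes.  */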
/* Do all the work that is necessary before running KERNEL for the first time.
   The function assumes the program has been created, finalized and frozen by
   create_and_finalize_hsa_program.  */

static void
init_kernel (struct kernel_info *kernel)
{
  if (pthread_mutex_lock (&kernel->init_mutex))
    GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
  if (kernel->initialized)
    {
      if (pthread_mutex_unlock (&kernel->init_mutex))
        GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
                           "mutex");

      return;
    }

  init_kernel_properties (kernel);

  if (!kernel->initialization_failed)
    kernel->initialized = true;

  if (pthread_mutex_unlock (&kernel->init_mutex))
    GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
                       "mutex");
}
/* Run KERNEL on its agent, pass VARS to it as arguments and take
   launch attributes from KLA.

   MODULE_LOCKED indicates that the caller already holds the lock and
   run_kernel need not lock it again.
   If AQ is NULL then agent->sync_queue will be used.  */

static void
run_kernel (struct kernel_info *kernel, void *vars,
            struct GOMP_kernel_launch_attributes *kla,
            struct goacc_asyncqueue *aq, bool module_locked)
{
  struct agent_info *agent = kernel->agent;
  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
             kernel->description->vpgr_count);

  /* Reduce the number of threads/workers if there are insufficient
     VGPRs available to run the kernels together.  */
  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
    {
      int max_vgprs = max_isa_vgprs (agent->device_isa);
      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
      int max_threads = (max_vgprs / granulated_vgprs) * 4;
      if (kla->gdims[2] > max_threads)
        {
          GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
                       " per team/gang - reducing to %d threads/workers.\n",
                       kla->gdims[2], max_threads);
          kla->gdims[2] = max_threads;
        }
    }

  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
             (aq ? aq->id : 0));
  GCN_DEBUG ("GCN launch attribs: gdims:[");
  int i;
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
        DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i]);
    }
  DEBUG_PRINT ("], normalized gdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
        DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
    }
  DEBUG_PRINT ("], wdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
        DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->wdims[i]);
    }
  DEBUG_PRINT ("]\n");
  DEBUG_FLUSH ();

  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");

  if (!agent->initialized)
    GOMP_PLUGIN_fatal ("Agent must be initialized");

  if (!kernel->initialized)
    GOMP_PLUGIN_fatal ("Called kernel must be initialized");

  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);

  uint64_t index
    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);

  /* Wait until the queue is not full before writing the packet.  */
  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
         >= command_q->size)
    ;

  /* Do not allow the dimensions to be overridden when running
     constructors or destructors.  */
  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;

  hsa_kernel_dispatch_packet_t *packet;
  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
           + index % command_q->size;

  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
  packet->grid_size_x = override_x ? : kla->gdims[0];
  packet->workgroup_size_x = get_group_size (kla->ndim,
                                             packet->grid_size_x,
                                             kla->wdims[0]);

  if (kla->ndim >= 2)
    {
      packet->grid_size_y = kla->gdims[1];
      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
                                                 kla->wdims[1]);
    }
  else
    {
      packet->grid_size_y = 1;
      packet->workgroup_size_y = 1;
    }

  if (kla->ndim == 3)
    {
      packet->grid_size_z = limit_worker_threads (override_z
                                                  ? : kla->gdims[2]);
      packet->workgroup_size_z = get_group_size (kla->ndim,
                                                 packet->grid_size_z,
                                                 kla->wdims[2]);
    }
  else
    {
      packet->grid_size_z = 1;
      packet->workgroup_size_z = 1;
    }

  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
             " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
             packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
             packet->grid_size_x / packet->workgroup_size_x,
             packet->grid_size_y / packet->workgroup_size_y,
             packet->grid_size_z / packet->workgroup_size_z,
             packet->workgroup_size_x, packet->workgroup_size_y,
             packet->workgroup_size_z);

  struct kernel_dispatch *shadow
    = create_kernel_dispatch (kernel, packet->grid_size_x,
                              packet->grid_size_z);
  shadow->queue = command_q;

  if (debug)
    {
      fprintf (stderr, "\nKernel has following dependencies:\n");
      print_kernel_dispatch (shadow, 2);
    }

  packet->private_segment_size = shadow->private_segment_size;
  packet->group_segment_size = shadow->group_segment_size;
  packet->kernel_object = shadow->object;
  packet->kernarg_address = shadow->kernarg_address;
  hsa_signal_t s;
  s.handle = shadow->signal;
  packet->completion_signal = s;
  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
  memcpy (shadow->kernarg_address, &vars, sizeof (vars));

  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");

  uint32_t header;
  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
             agent->device_id);

  packet_store_release ((uint32_t *) packet, header,
                        (uint16_t) kla->ndim
                        << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
                                       index);

  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");

  /* Root signal waits with 1ms timeout.  */
  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
                                             1000 * 1000,
                                             HSA_WAIT_STATE_BLOCKED) != 0)
    {
      console_output (kernel, shadow->kernarg_address, false);
    }
  console_output (kernel, shadow->kernarg_address, true);

  struct kernargs *kernargs = shadow->kernarg_address;
  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;

  release_kernel_dispatch (shadow);

  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");

  unsigned int upper = (return_value & ~0xffff) >> 16;
  if (upper == 0xcafe)
    ; // exit not called, normal termination.
  else if (upper == 0xffff)
    ; // exit called.
  else
    {
      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
                         " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
                         return_value);
      abort ();
    }

  if (upper == 0xffff)
    {
      unsigned int signal = (return_value >> 8) & 0xff;

      if (signal == SIGABRT)
        {
          GCN_WARNING ("GCN Kernel aborted\n");
          abort ();
        }
      else if (signal != 0)
        {
          GCN_WARNING ("GCN Kernel received unknown signal\n");
          abort ();
        }

      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
      exit (return_value & 0xff);
    }
}
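/* Sketch of the return-value encoding tested above, as implied by the
   0xcafe0000 initialization in create_kernel_dispatch:

     0xcafeXXXX  - exit () was never called; normal kernel termination
     0xffffSSCC  - exit (CC) was called, or signal SS was raised first

   For example, a kernel calling exit (3) produces 0xffff0003, and an abort
   appears as 0xffff0600 where the host's SIGABRT is 6 (host signal
   numbering is assumed here).  */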
/* }}}  */
/* {{{ Load/Unload  */

/* Initialize KERNEL from D and other parameters.  Return true on success.  */

static bool
init_basic_kernel_info (struct kernel_info *kernel,
                        struct hsa_kernel_description *d,
                        struct agent_info *agent,
                        struct module_info *module)
{
  kernel->agent = agent;
  kernel->module = module;
  kernel->name = d->name;
  kernel->description = d;
  if (pthread_mutex_init (&kernel->init_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
      return false;
    }
  return true;
}
/* Check that the GCN ISA of the given image matches the ISA of the agent.  */

static bool
isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
{
  int isa_field = elf_gcn_isa_field (image);
  const char* isa_s = isa_name (isa_field);
  if (!isa_s)
    {
      hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
      return false;
    }

  if (isa_field != agent->device_isa)
    {
      char msg[204];
      const char *agent_isa_s = isa_name (agent->device_isa);
      assert (agent_isa_s);

      snprintf (msg, sizeof msg,
                "GCN code object ISA '%s' does not match GPU ISA '%s' "
                "(device %d).\n"
                "Try to recompile with '-foffload-options=-march=%s',\n"
                "or use ROCR_VISIBLE_DEVICES to disable incompatible "
                "GPUs.\n",
                isa_s, agent_isa_s, agent->device_id, agent_isa_s);

      hsa_error (msg, HSA_STATUS_ERROR);
      return false;
    }

  return true;
}
/* Create and finalize the program consisting of all loaded modules.  */

static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
  hsa_status_t status;
  bool res = true;
  if (pthread_mutex_lock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return false;
    }
  if (agent->prog_finalized)
    goto final;

  status
    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
                                        HSA_EXECUTABLE_STATE_UNFROZEN,
                                        "", &agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not create GCN executable", status);
      goto fail;
    }

  /* Load any GCN modules.  */
  struct module_info *module = agent->module;
  if (module)
    {
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;

      if (!isa_matches_agent (agent, image))
        goto fail;

      hsa_code_object_t co = { 0 };
      status = hsa_fns.hsa_code_object_deserialize_fn
        (module->image_desc->gcn_image->image,
         module->image_desc->gcn_image->size,
         NULL, &co);
      if (status != HSA_STATUS_SUCCESS)
        {
          hsa_error ("Could not deserialize GCN code object", status);
          goto fail;
        }

      status = hsa_fns.hsa_executable_load_code_object_fn
        (agent->executable, agent->id, co, "");
      if (status != HSA_STATUS_SUCCESS)
        {
          hsa_error ("Could not load GCN code object", status);
          goto fail;
        }

      if (!module->heap)
        {
          status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
                                                   gcn_kernel_heap_size,
                                                   (void**)&module->heap);
          if (status != HSA_STATUS_SUCCESS)
            {
              hsa_error ("Could not allocate memory for GCN heap", status);
              goto fail;
            }

          status = hsa_fns.hsa_memory_assign_agent_fn
            (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
          if (status != HSA_STATUS_SUCCESS)
            {
              hsa_error ("Could not assign GCN heap memory to device", status);
              goto fail;
            }

          hsa_fns.hsa_memory_copy_fn (&module->heap->size,
                                      &gcn_kernel_heap_size,
                                      sizeof (gcn_kernel_heap_size));
        }
    }

  if (debug)
    dump_executable_symbols (agent->executable);

  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not freeze the GCN executable", status);
      goto fail;
    }

final:
  agent->prog_finalized = true;

  if (pthread_mutex_unlock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      res = false;
    }

  return res;

fail:
  res = false;
  goto final;
}
/* Free the HSA program in agent and everything associated with it and set
   agent->prog_finalized and the initialized flags of all kernels to false.
   Return TRUE on success.  */

static bool
destroy_hsa_program (struct agent_info *agent)
{
  if (!agent->prog_finalized)
    return true;

  hsa_status_t status;

  GCN_DEBUG ("Destroying the current GCN program.\n");

  status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Could not destroy GCN executable", status);

  if (agent->module)
    {
      int i;
      for (i = 0; i < agent->module->kernel_count; i++)
        agent->module->kernels[i].initialized = false;

      if (agent->module->heap)
        {
          hsa_fns.hsa_memory_free_fn (agent->module->heap);
          agent->module->heap = NULL;
        }
    }
  agent->prog_finalized = false;
  return true;
}
/* Deinitialize all information associated with MODULE and kernels within
   it.  Return TRUE on success.  */

static bool
destroy_module (struct module_info *module, bool locked)
{
  /* Run destructors before destroying module.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };

  if (module->fini_array_func)
    {
      init_kernel (module->fini_array_func);
      run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
    }
  module->constructors_run_p = false;

  int i;
  for (i = 0; i < module->kernel_count; i++)
    if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
      {
        GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
                           "mutex");
        return false;
      }

  return true;
}
/* Execute a single entry from async queue AQ, at position INDEX, according
   to its type.  */

static void
execute_queue_entry (struct goacc_asyncqueue *aq, int index)
{
  struct queue_entry *entry = &aq->queue[index];

  switch (entry->type)
    {
    case KERNEL_LAUNCH:
      if (DEBUG_QUEUES)
        GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
                   aq->agent->device_id, aq->id, index);
      run_kernel (entry->u.launch.kernel,
                  entry->u.launch.vars,
                  &entry->u.launch.kla, aq, false);
      if (DEBUG_QUEUES)
        GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
                   aq->agent->device_id, aq->id, index);
      break;

    case CALLBACK:
      if (DEBUG_QUEUES)
        GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
                   aq->agent->device_id, aq->id, index);
      entry->u.callback.fn (entry->u.callback.data);
      if (DEBUG_QUEUES)
        GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
                   aq->agent->device_id, aq->id, index);
      break;

    case ASYNC_WAIT:
      {
        /* FIXME: is it safe to access a placeholder that may already have
           been executed?  */
        struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;

        if (DEBUG_QUEUES)
          GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
                     aq->agent->device_id, aq->id, index);

        pthread_mutex_lock (&placeholderp->mutex);

        while (!placeholderp->executed)
          pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);

        pthread_mutex_unlock (&placeholderp->mutex);

        if (pthread_cond_destroy (&placeholderp->cond))
          GOMP_PLUGIN_error ("Failed to destroy serialization cond");

        if (pthread_mutex_destroy (&placeholderp->mutex))
          GOMP_PLUGIN_error ("Failed to destroy serialization mutex");

        if (DEBUG_QUEUES)
          GCN_DEBUG ("Async thread %d:%d: Executing async wait "
                     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
      }
      break;

    case ASYNC_PLACEHOLDER:
      pthread_mutex_lock (&entry->u.placeholder.mutex);
      entry->u.placeholder.executed = 1;
      pthread_cond_signal (&entry->u.placeholder.cond);
      pthread_mutex_unlock (&entry->u.placeholder.mutex);
      break;

    default:
      GOMP_PLUGIN_fatal ("Unknown queue element");
    }
}
/* This function is run as a thread to service an async queue in the
   background.  It runs continuously until the stop flag is set.  */

static void *
drain_queue (void *thread_arg)
{
  struct goacc_asyncqueue *aq = thread_arg;

  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      aq->drain_queue_stop = 2;
      return NULL;
    }

  pthread_mutex_lock (&aq->mutex);

  while (true)
    {
      if (aq->drain_queue_stop)
        break;

      if (aq->queue_n > 0)
        {
          pthread_mutex_unlock (&aq->mutex);
          execute_queue_entry (aq, aq->queue_first);

          pthread_mutex_lock (&aq->mutex);
          aq->queue_first = ((aq->queue_first + 1)
                             % ASYNC_QUEUE_SIZE);
          aq->queue_n--;

          if (DEBUG_THREAD_SIGNAL)
            GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
                       aq->agent->device_id, aq->id);
          pthread_cond_broadcast (&aq->queue_cond_out);
          pthread_mutex_unlock (&aq->mutex);

          if (DEBUG_QUEUES)
            GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
                       aq->id);
          pthread_mutex_lock (&aq->mutex);
        }
      else
        {
          if (DEBUG_THREAD_SLEEP)
            GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
                       aq->agent->device_id, aq->id);
          pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
          if (DEBUG_THREAD_SLEEP)
            GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
                       aq->agent->device_id, aq->id);
        }
    }

  aq->drain_queue_stop = 2;
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
               aq->agent->device_id, aq->id);
  pthread_cond_broadcast (&aq->queue_cond_out);
  pthread_mutex_unlock (&aq->mutex);

  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
  return NULL;
}
/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
   is not usually the case.  This is just a debug tool.  */

static void
drain_queue_synchronous (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);

  while (aq->queue_n > 0)
    {
      execute_queue_entry (aq, aq->queue_first);

      aq->queue_first = ((aq->queue_first + 1)
                         % ASYNC_QUEUE_SIZE);
      aq->queue_n--;
    }

  pthread_mutex_unlock (&aq->mutex);
}
/* Block the current thread until an async queue is writable.  The aq->mutex
   lock should be held on entry, and remains locked on exit.  */

static void
wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
{
  if (aq->queue_n == ASYNC_QUEUE_SIZE)
    {
      /* Queue is full.  Wait for it to not be full.  */
      while (aq->queue_n == ASYNC_QUEUE_SIZE)
        pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
    }
}
/* Request an asynchronous kernel launch on the specified queue.  This
   may block if the queue is full, but returns without waiting for the
   kernel to run.  */

static void
queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
                   void *vars, struct GOMP_kernel_launch_attributes *kla)
{
  assert (aq->agent == kernel->agent);

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n)
                    % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
               aq->id, queue_last);

  aq->queue[queue_last].type = KERNEL_LAUNCH;
  aq->queue[queue_last].u.launch.kernel = kernel;
  aq->queue[queue_last].u.launch.vars = vars;
  aq->queue[queue_last].u.launch.kla = *kla;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}
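/* The queue is a fixed ring of ASYNC_QUEUE_SIZE entries: queue_first indexes
   the oldest pending entry and queue_n counts occupancy, so the push above
   always writes slot (queue_first + queue_n) % ASYNC_QUEUE_SIZE.  For
   example, with queue_first == 62 and queue_n == 3 on the 64-entry ring, the
   new launch lands in slot 1.  */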
/* Request an asynchronous callback on the specified queue.  The callback
   function will be called, with the given opaque data, from the appropriate
   async thread, when all previous items on that queue are complete.  */

static void
queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
                     void *data)
{
  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n)
                    % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
               aq->id, queue_last);

  aq->queue[queue_last].type = CALLBACK;
  aq->queue[queue_last].u.callback.fn = fn;
  aq->queue[queue_last].u.callback.data = data;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}
/* Request that a given async thread wait for another thread (unspecified) to
   reach the given placeholder.  The wait will occur when all previous entries
   on the queue are complete.  A placeholder is effectively a kind of signal
   which simply sets a flag when encountered in a queue.  */

static void
queue_push_asyncwait (struct goacc_asyncqueue *aq,
                      struct placeholder *placeholderp)
{
  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
               aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_WAIT;
  aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}
/* Add a placeholder into an async queue.  When the async thread reaches the
   placeholder it will set the "executed" flag to true and continue.
   Another thread may be waiting on this thread reaching the placeholder.  */

static struct placeholder *
queue_push_placeholder (struct goacc_asyncqueue *aq)
{
  struct placeholder *placeholderp;

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
               aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
  placeholderp = &aq->queue[queue_last].u.placeholder;

  if (pthread_mutex_init (&placeholderp->mutex, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
    }

  if (pthread_cond_init (&placeholderp->cond, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization cond");
    }

  placeholderp->executed = 0;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);

  return placeholderp;
}
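/* How a placeholder serializes two queues (sketch; aq1 and aq2 stand for any
   two queues on the same agent):

     struct placeholder *p = queue_push_placeholder (aq1);
     queue_push_asyncwait (aq2, p);

   The drain thread of aq2 blocks in its ASYNC_WAIT entry until the drain
   thread of aq1 reaches the ASYNC_PLACEHOLDER entry and signals p->cond, so
   work queued on aq2 after the wait runs strictly after work queued on aq1
   before the placeholder.  */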
/* Signal an asynchronous thread to terminate, and wait for it to do so.  */

static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  if (aq->drain_queue_stop == 2)
    {
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  aq->drain_queue_stop = 1;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
        GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
                   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
        GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
                   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
             aq->id);
  pthread_mutex_unlock (&aq->mutex);

  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
                       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}
/* Set up an async queue for OpenMP.  There will be only one.  The
   implementation simply uses an OpenACC async queue.
   FIXME: is this thread-safe if two threads call this function?  */

static void
maybe_init_omp_async (struct agent_info *agent)
{
  if (!agent->omp_async_queue)
    agent->omp_async_queue
      = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
}
/* A wrapper that works around an issue in the HSA runtime with host-to-device
   copies from read-only pages.  */

static void
hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
{
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);

  if (status == HSA_STATUS_SUCCESS)
    return;

  /* It appears that the copy fails if the source data is in a read-only page.
     We can't detect that easily, so try copying the data to a temporary buffer
     and doing the copy again if we got an error above.  */

  GCN_WARNING ("Read-only data transfer bug workaround triggered for "
               "[%p:+%d]\n", (void *) src, (int) len);

  void *src_copy = malloc (len);
  memcpy (src_copy, src, len);
  status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
  free (src_copy);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("memory copy failed");
}
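/* Note: GOMP_OFFLOAD_host2dev below routes all host-to-device copies through
   this wrapper, so source data living in read-only pages (string literals
   and other .rodata, for instance) still reaches the device via the bounce
   buffer.  */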
/* Copy data to or from a device.  This is intended for use as an async
   callback event.  */

static void
copy_data (void *data_)
{
  struct copy_data *data = (struct copy_data *)data_;
  GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
             data->aq->agent->device_id, data->aq->id, data->len, data->src,
             data->dst);
  hsa_memory_copy_wrapper (data->dst, data->src, data->len);
  free (data);
}
/* Request an asynchronous data copy, to or from a device, on a given queue.
   The event will be registered as a callback.  */

static void
queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
                 size_t len)
{
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
               aq->agent->device_id, aq->id, len, src, dst);
  struct copy_data *data
    = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
  data->dst = dst;
  data->src = src;
  data->len = len;
  data->aq = aq;
  queue_push_callback (aq, copy_data, data);
}
/* Return true if the given queue is currently empty.  */

static int
queue_empty (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  int res = aq->queue_n == 0 ? 1 : 0;
  pthread_mutex_unlock (&aq->mutex);

  return res;
}
/* Wait for a given queue to become empty.  This implements an OpenACC wait
   directive.  */

static void
wait_queue (struct goacc_asyncqueue *aq)
{
  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      drain_queue_synchronous (aq);
      return;
    }

  pthread_mutex_lock (&aq->mutex);

  while (aq->queue_n > 0)
    {
      if (DEBUG_THREAD_SLEEP)
        GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
                   aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
        GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
                   aq->id);
    }

  pthread_mutex_unlock (&aq->mutex);
  GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
}
/* }}}  */
/* {{{ OpenACC support  */

/* Execute an OpenACC kernel, synchronously or asynchronously.  */

static void
gcn_exec (struct kernel_info *kernel,
          void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
          struct goacc_asyncqueue *aq)
{
  if (!GOMP_OFFLOAD_can_run (kernel))
    GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");

  /* If we get here then this must be an OpenACC kernel.  */
  kernel->kind = KIND_OPENACC;

  struct hsa_kernel_description *hsa_kernel_desc = NULL;
  for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
    {
      struct hsa_kernel_description *d
        = &kernel->module->image_desc->kernel_infos[i];
      if (d->name == kernel->name)
        {
          hsa_kernel_desc = d;
          break;
        }
    }

  /* We may have statically-determined dimensions in
     hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
     invocation at runtime in dims[].  We allow static dimensions to take
     priority over dynamic dimensions when present (non-zero).  */
  if (hsa_kernel_desc->oacc_dims[0] > 0)
    dims[0] = hsa_kernel_desc->oacc_dims[0];
  if (hsa_kernel_desc->oacc_dims[1] > 0)
    dims[1] = hsa_kernel_desc->oacc_dims[1];
  if (hsa_kernel_desc->oacc_dims[2] > 0)
    dims[2] = hsa_kernel_desc->oacc_dims[2];

  /* Ideally, when a dimension isn't explicitly specified, we should
     tune it to run 40 (or 32?) threads per CU with no threads getting queued.
     In practice, we tune for peak performance on BabelStream, which
     for OpenACC is currently 32 threads per CU.  */
  if (dims[0] == 0 && dims[1] == 0)
    {
      /* If any of the OpenACC dimensions remain 0 then we get to pick a
         number.  There isn't really a correct answer for this without a clue
         about the problem size, so let's do a reasonable number of workers
         and gangs.  */

      dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs.  */
      dims[1] = 8; /* Workers.  */
    }
  else if (dims[0] == 0 && dims[1] > 0)
    {
      /* Auto-scale the number of gangs with the requested number of
         workers.  */
      dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
    }
  else if (dims[0] > 0 && dims[1] == 0)
    {
      /* Auto-scale the number of workers with the requested number of
         gangs.  */
      dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
      if (dims[1] == 0)
        dims[1] = 1;
      if (dims[1] > 16)
        dims[1] = 16;
    }

  /* The incoming dimensions are expressed in terms of gangs, workers, and
     vectors.  The HSA dimensions are expressed in terms of "work-items",
     which means multiples of vector lanes.

     The "grid size" specifies the size of the problem space, and the
     "work-group size" specifies how much of that we want a single compute
     unit to chew on at once.

     The three dimensions do not really correspond to hardware, but the
     important thing is that the HSA runtime will launch as many
     work-groups as it takes to process the entire grid, and each
     work-group will contain as many wave-fronts as it takes to process
     the work-items in that group.

     Essentially, as long as we set the Y dimension to 64 (the number of
     vector lanes in hardware), and the Z group size to the maximum (16),
     then we will get the gangs (X) and workers (Z) launched as we expect.

     The reason for the apparent reversal of vector and worker dimension
     order is to do with the way the run-time distributes work-items across
     v1 and v2.  */
  struct GOMP_kernel_launch_attributes kla =
    {3,
     /* Grid size.  */
     {dims[0], 64, dims[1]},
     /* Work-group size.  */
     {1,       64, 16}
    };

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
        = prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
        = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
        = acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name
        = (char *) kernel->name;
      enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
      enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
      enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
                                            &enqueue_launch_event_info,
                                            api_info);
    }

  if (!async)
    run_kernel (kernel, devaddrs, &kla, NULL, false);
  else
    queue_push_launch (aq, kernel, devaddrs, &kla);

  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type
        = prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
                                            &enqueue_launch_event_info,
                                            api_info);
    }
}
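/* A worked example of the mapping above: on a 40-CU device with no explicit
   dimensions, the defaults give dims[0] == 160 gangs and dims[1] == 8
   workers, so kla.gdims becomes {160, 64, 8} with kla.wdims {1, 64, 16}.
   The HSA runtime then launches 160 work-groups, each covering 64 vector
   lanes by (up to) 8 workers, i.e. up to 8 wavefronts per work-group.  */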
/* }}}  */
/* {{{ Generic Plugin API  */

/* Return the name of the accelerator, which is "gcn".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "gcn";
}

/* Return the UID; if not available return NULL.
   Returns freshly allocated memory.  */

const char *
GOMP_OFFLOAD_get_uid (int ord)
{
  char *str;
  hsa_status_t status;
  struct agent_info *agent = get_agent_info (ord);

  /* HSA documentation states: maximally 21 characters including NUL.  */
  str = GOMP_PLUGIN_malloc (21 * sizeof (char));
  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AMD_AGENT_INFO_UUID,
                                          str);
  if (status != HSA_STATUS_SUCCESS)
    {
      free (str);
      return NULL;
    }

  return str;
}
/* Return the specific capabilities the HSA accelerator has.  */

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
  return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
         | GOMP_OFFLOAD_CAP_OPENACC_200;
}

/* Identify as GCN accelerator.  */

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_GCN;
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}
/* Return the number of GCN devices on the system.  */

int
GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
{
  if (!init_hsa_context (true))
    exit (EXIT_FAILURE);
  /* Return -1 if the omp_requires_mask cannot be fulfilled but
     devices were present.  */
  if (hsa_context.agent_count > 0
      && ((omp_requires_mask
           & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
               | GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
               | GOMP_REQUIRES_SELF_MAPS
               | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
    return -1;
  /* Check whether host page access is supported; this is per system level
     (all GPUs supported by HSA).  While intrinsically true for APUs, it
     requires XNACK support for discrete GPUs.  */
  if (hsa_context.agent_count > 0
      && (omp_requires_mask
          & (GOMP_REQUIRES_UNIFIED_SHARED_MEMORY | GOMP_REQUIRES_SELF_MAPS)))
    {
      bool b;
      hsa_system_info_t type = HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT;
      hsa_status_t status = hsa_fns.hsa_system_get_info_fn (type, &b);
      if (status != HSA_STATUS_SUCCESS)
        GOMP_PLUGIN_error ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT "
                           "failed");
      if (!b)
        return -1;
    }

  return hsa_context.agent_count;
}
/* Initialize device (agent) number N so that it can be used for computation.
   Return TRUE on success.  */

bool
GOMP_OFFLOAD_init_device (int n)
{
  if (!init_hsa_context (false))
    return false;
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
      return false;
    }
  struct agent_info *agent = &hsa_context.agents[n];

  if (agent->initialized)
    return true;

  agent->device_id = n;

  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
      return false;
    }
  if (pthread_mutex_init (&agent->prog_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return false;
    }
  agent->async_queues = NULL;
  agent->omp_async_queue = NULL;
  agent->ephemeral_memories_list = NULL;

  uint32_t queue_size;
  hsa_status_t status;
  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
                                          HSA_AGENT_INFO_QUEUE_MAX_SIZE,
                                          &queue_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error requesting maximum queue size of the GCN agent",
                      status);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
                                          &agent->name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the name of the agent", status);

  agent->device_isa = isa_code (agent->name);
  if (agent->device_isa == EF_AMDGPU_MACH_UNSUPPORTED)
    {
      char msg[33 + 64 + 1];
      snprintf (msg, sizeof msg,
                "Unknown GCN agent architecture '%s'", agent->name);
      return hsa_error (msg, HSA_STATUS_ERROR);
    }

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
                                          &agent->vendor_name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the vendor name of the agent", status);

  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
                                        HSA_QUEUE_TYPE_MULTI,
                                        hsa_queue_callback, NULL, UINT32_MAX,
                                        UINT32_MAX, &agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error creating command queue", status);

  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
                                                 get_kernarg_memory_region,
                                                 &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
                         "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
                                                 get_data_memory_region,
                                                 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
                         "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}
/* Load GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
   If there are any constructors then run them.  If not NULL, REV_FN_TABLE will
   contain the on-device addresses of the functions for reverse offload.  To be
   freed by the caller.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
                         struct addr_pair **target_table,
                         uint64_t **rev_fn_table,
                         uint64_t *host_ind_fn_table)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
                         " (expected %u, received %u)",
                         GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return -1;
    }

  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
  struct agent_info *agent;
  struct addr_pair *pair;
  struct module_info *module;
  struct kernel_info *kernel;
  int kernel_count = image_desc->kernel_count;
  unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
                              ? image_desc->ind_func_count : 0;
  unsigned var_count = image_desc->global_variable_count;
  /* Currently, "others" is a struct of ICVS.  */
  int other_count = 1;

  agent = get_agent_info (ord);
  if (!agent)
    return -1;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return -1;
    }
  if (agent->prog_finalized
      && !destroy_hsa_program (agent))
    return -1;

  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
  GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
  GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
                             * sizeof (struct addr_pair));
  *target_table = pair;
  module = (struct module_info *)
    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
                                + kernel_count * sizeof (struct kernel_info));
  module->image_desc = image_desc;
  module->kernel_count = kernel_count;
  module->heap = NULL;
  module->constructors_run_p = false;

  kernel = &module->kernels[0];

  /* Allocate memory for kernel dependencies.  */
  for (unsigned i = 0; i < kernel_count; i++)
    {
      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
      if (!init_basic_kernel_info (kernel, d, agent, module))
        return -1;
      if (strcmp (d->name, "_init_array") == 0)
        module->init_array_func = kernel;
      else if (strcmp (d->name, "_fini_array") == 0)
        module->fini_array_func = kernel;
      else
        {
          pair->start = (uintptr_t) kernel;
          pair->end = (uintptr_t) (kernel + 1);
          pair++;
        }
      kernel++;
    }

  agent->module = module;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return -1;
    }

  if (!create_and_finalize_hsa_program (agent))
    return -1;

  if (var_count > 0)
    {
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_var_table",
                                                     agent->id, 0,
                                                     &var_symbol);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find symbol for variable in the code object",
                   status);

      uint64_t var_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &var_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);

      struct {
        uint64_t addr;
        uint64_t size;
      } var_table[var_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
                             (void*)var_table_addr, sizeof (var_table));

      for (unsigned i = 0; i < var_count; i++)
        {
          pair->start = var_table[i].addr;
          pair->end = var_table[i].addr + var_table[i].size;
          GCN_DEBUG ("Found variable at %p with size %lu\n",
                     (void *)var_table[i].addr, var_table[i].size);
          pair++;
        }
    }

  if (ind_func_count > 0)
    {
      hsa_status_t status;

      /* Read indirect function table from image.  */
      hsa_executable_symbol_t ind_funcs_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_ind_func_table",
                                                     agent->id,
                                                     0, &ind_funcs_symbol);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
                   "code object", status);

      uint64_t ind_funcs_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &ind_funcs_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);

      uint64_t ind_funcs_table[ind_func_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
                             (void*) ind_funcs_table_addr,
                             sizeof (ind_funcs_table));

      /* Build host->target address map for indirect functions.  */
      uint64_t ind_fn_map[ind_func_count * 2 + 1];
      for (unsigned i = 0; i < ind_func_count; i++)
        {
          ind_fn_map[i * 2] = host_ind_fn_table[i];
          ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
          GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
                     i, host_ind_fn_table[i], ind_funcs_table[i]);
        }
      ind_fn_map[ind_func_count * 2] = 0;

      /* Write the map onto the target.  */
      void *map_target_addr
        = GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
      GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);

      GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
                             (void *) ind_fn_map,
                             sizeof (ind_fn_map));

      /* Write address of the map onto the target.  */
      hsa_executable_symbol_t symbol;

      status
        = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                XSTRING (GOMP_INDIRECT_ADDR_MAP),
                                                agent->id, 0, &symbol);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
                   status);

      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &varptr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
         &varsize);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable size from its symbol",
                   status);

      GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
                 varptr, varsize);

      GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
                             &map_target_addr,
                             sizeof (map_target_addr));
    }

  GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));

  hsa_status_t status;
  hsa_executable_symbol_t var_symbol;
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                 XSTRING (GOMP_ADDITIONAL_ICVS),
                                                 agent->id, 0, &var_symbol);
  if (status == HSA_STATUS_SUCCESS)
    {
      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &varptr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
         &varsize);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable size from its symbol",
                   status);

      pair->start = varptr;
      pair->end = varptr + varsize;
    }
  else
    {
      /* The variable was not in this image.  */
      GCN_DEBUG ("Variable not found in image: %s\n",
                 XSTRING (GOMP_ADDITIONAL_ICVS));
      pair->start = pair->end = 0;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  if (rev_fn_table != NULL && kernel_count == 0)
    *rev_fn_table = NULL;
  else if (rev_fn_table != NULL)
    {
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_func_table",
                                                     agent->id, 0, &var_symbol);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find symbol for variable in the code object",
                   status);
      uint64_t fn_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &fn_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
      GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
                             (void*) fn_table_addr,
                             kernel_count * sizeof (uint64_t));
    }

  return kernel_count + var_count + other_count;
}
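/* Shape of the returned TARGET_TABLE, for reference: one addr_pair per
   kernel except _init_array/_fini_array (which are hidden from libgomp),
   followed by one pair per global variable, and a final pair for the
   GOMP_ADDITIONAL_ICVS block -- matching the return value
   kernel_count + var_count + other_count computed above.  */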
/* Unload GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA from agent number N.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
                         " (expected %u, received %u)",
                         GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return false;
    }

  struct agent_info *agent;
  agent = get_agent_info (n);
  if (!agent)
    return false;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return false;
    }

  if (!agent->module || agent->module->image_desc != target_data)
    {
      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
                         "loaded before");
      return false;
    }

  if (!destroy_module (agent->module, true))
    return false;
  free (agent->module);
  agent->module = NULL;
  if (!destroy_hsa_program (agent))
    return false;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return false;
    }
  return true;
}
/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  if (!agent->initialized)
    return true;

  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
        return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_ephemeral_memories (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}
/* Return true if the HSA runtime can run function FN_PTR.  */

bool
GOMP_OFFLOAD_can_run (void *fn_ptr)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  init_kernel (kernel);
  if (kernel->initialization_failed)
    GOMP_PLUGIN_fatal ("kernel initialization failed");

  return true;
}
/* Allocate memory on device N.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  struct agent_info *agent = get_agent_info (n);
  return alloc_by_agent (agent, size);
}
/* Free memory from device N.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
        = _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
        = acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
                                            api_info);
    }

  return true;
}
/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
             src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("memory copy failed");
  return true;
}
/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
             device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}
/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
             device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("memory copy failed");
  return true;
}
3954 /* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
3955 measured in bytes. So we have:
3957 dim1_size: number of bytes to copy on innermost dimension ("row")
3958 dim0_len: number of rows to copy
3959 dst: base pointer for destination of copy
3960 dst_offset1_size: innermost row offset (for dest), in bytes
3961 dst_offset0_len: offset, number of rows (for dest)
3962 dst_dim1_size: whole-array dest row length, in bytes (pitch)
3963 src: base pointer for source of copy
3964 src_offset1_size: innermost row offset (for source), in bytes
3965 src_offset0_len: offset, number of rows (for source)
3966 src_dim1_size: whole-array source row length, in bytes (pitch)
3970 GOMP_OFFLOAD_memcpy2d (int dst_ord
, int src_ord
, size_t dim1_size
,
3971 size_t dim0_len
, void *dst
, size_t dst_offset1_size
,
3972 size_t dst_offset0_len
, size_t dst_dim1_size
,
3973 const void *src
, size_t src_offset1_size
,
3974 size_t src_offset0_len
, size_t src_dim1_size
)
3976 if (!hsa_fns
.hsa_amd_memory_lock_fn
3977 || !hsa_fns
.hsa_amd_memory_unlock_fn
3978 || !hsa_fns
.hsa_amd_memory_async_copy_rect_fn
)
3981 /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail
3982 out quietly if we have anything oddly-aligned rather than letting the
3983 driver raise an error. */
3984 if ((((uintptr_t) dst
) & 3) != 0 || (((uintptr_t) src
) & 3) != 0)
3987 if ((dst_dim1_size
& 3) != 0 || (src_dim1_size
& 3) != 0)
3990 /* Only handle host to device or device to host transfers here. */
3991 if ((dst_ord
== -1 && src_ord
== -1)
3992 || (dst_ord
!= -1 && src_ord
!= -1))
3995 hsa_amd_copy_direction_t dir
3996 = (src_ord
== -1) ? hsaHostToDevice
: hsaDeviceToHost
;
3997 hsa_agent_t copy_agent
;
3999 /* We need to pin (lock) host memory before we start the transfer. Try to
4000 lock the minimum size necessary, i.e. using partial first/last rows of the
4001 whole array. Something like this:
4005 c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
4006 o | ++#######+++++ <- whole row
4007 l | ++#######+++++ <- "
4008 s v ++#######..... <- last row apart from trailing remainder
4011 We could split very large transfers into several rectangular copies, but
4012 that is unimplemented for now. */
4014 size_t bounded_size_host
, first_elem_offset_host
;
4016 if (dir
== hsaHostToDevice
)
4018 bounded_size_host
= src_dim1_size
* (dim0_len
- 1) + dim1_size
;
4019 first_elem_offset_host
= src_offset0_len
* src_dim1_size
4021 host_ptr
= (void *) src
;
4022 struct agent_info
*agent
= get_agent_info (dst_ord
);
4023 copy_agent
= agent
->id
;
4027 bounded_size_host
= dst_dim1_size
* (dim0_len
- 1) + dim1_size
;
4028 first_elem_offset_host
= dst_offset0_len
* dst_dim1_size
4031 struct agent_info
*agent
= get_agent_info (src_ord
);
4032 copy_agent
= agent
->id
;
4038 = hsa_fns
.hsa_amd_memory_lock_fn (host_ptr
+ first_elem_offset_host
,
4039 bounded_size_host
, NULL
, 0, &agent_ptr
);
4040 /* We can't lock the host memory: don't give up though, we might still be
4041 able to use the slow path in our caller. So, don't make this an
4043 if (status
!= HSA_STATUS_SUCCESS
)
4046 hsa_pitched_ptr_t dstpp
, srcpp
;
4047 hsa_dim3_t dst_offsets
, src_offsets
, ranges
;
4051 hsa_signal_t completion_signal
;
4052 status
= hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &completion_signal
);
4053 if (status
!= HSA_STATUS_SUCCESS
)
4059 if (dir
== hsaHostToDevice
)
4061 srcpp
.base
= agent_ptr
- first_elem_offset_host
;
4066 srcpp
.base
= (void *) src
;
4067 dstpp
.base
= agent_ptr
- first_elem_offset_host
;
  /* Pitch is measured in bytes.  */
  srcpp.pitch = src_dim1_size;
  /* Slice is unused for a 2D copy.  */
  srcpp.slice = 0;

  src_offsets.x = src_offset1_size;
  src_offsets.y = src_offset0_len;
  src_offsets.z = 0;

  dstpp.pitch = dst_dim1_size;
  dstpp.slice = 0;

  dst_offsets.x = dst_offset1_size;
  dst_offsets.y = dst_offset0_len;
  dst_offsets.z = 0;

  ranges.x = dim1_size;
  ranges.y = dim0_len;
  ranges.z = 1;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
                                                 &src_offsets, &ranges,
                                                 copy_agent, dir, 0, NULL,
                                                 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
                                        HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                                        HSA_WAIT_STATE_ACTIVE);

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr
                                             + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}

/* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
   measured in bytes.  So we have:

     dim2_size: number of bytes to copy on innermost dimension ("row")
     dim1_len: number of rows per slice to copy
     dim0_len: number of slices to copy
     dst: base pointer for destination of copy
     dst_offset2_size: innermost row offset (for dest), in bytes
     dst_offset1_len: offset, number of rows (for dest)
     dst_offset0_len: offset, number of slices (for dest)
     dst_dim2_size: whole-array dest row length, in bytes (pitch)
     dst_dim1_len: whole-array number of rows in slice (for dest)
     src: base pointer for source of copy
     src_offset2_size: innermost row offset (for source), in bytes
     src_offset1_len: offset, number of rows (for source)
     src_offset0_len: offset, number of slices (for source)
     src_dim2_size: whole-array source row length, in bytes (pitch)
     src_dim1_len: whole-array number of rows in slice (for source)  */
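
/* For example (illustrative values only, extending the 2D example above to a
   6-slice x 8-row x 16-float array): copying a 2-slice x 3-row x 5-float
   sub-block that starts at slice 1, row 2, column 3 would use:

     dim2_size                 = 5 * sizeof (float)  = 20
     dim1_len                  = 3
     dim0_len                  = 2
     {dst,src}_offset2_size    = 3 * sizeof (float)  = 12
     {dst,src}_offset1_len     = 2
     {dst,src}_offset0_len     = 1
     {dst,src}_dim2_size       = 16 * sizeof (float) = 64 (the pitch)
     {dst,src}_dim1_len        = 8  */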

int
GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
                       size_t dim1_len, size_t dim0_len, void *dst,
                       size_t dst_offset2_size, size_t dst_offset1_len,
                       size_t dst_offset0_len, size_t dst_dim2_size,
                       size_t dst_dim1_len, const void *src,
                       size_t src_offset2_size, size_t src_offset1_len,
                       size_t src_offset0_len, size_t src_dim2_size,
                       size_t src_dim1_len)
{
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;
  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last slices of
     the whole 3D array.  Something like this:

          slice 0:      slice 1:      slice 2:
           __________    __________    __________
        ^ /+++++++++/ : /+++++++++/ : /         /
   column /+++##++++/| |/+++##++++/| |/+++##    /    # = subarray
       / /+++##++++/ | |/+++##++++/ | |/+++##++++/   + = area to pin
        /_________/  :  /_________/  :  /_________/

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */
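
  /* Continuing the illustrative numbers from before the function:
     slice_bytes = 64 * 8 = 512, so the code below pins

       bounded_size_host      = 512 * (2 - 1) + 64 * (3 - 1) + 20 = 660 bytes

     starting at

       first_elem_offset_host = 1 * 512 + 2 * 64 + 12 = 652 bytes

     into the array, rather than the full 6 * 512 = 3072 bytes.  */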
  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      size_t slice_bytes = src_dim2_size * src_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
                          + src_dim2_size * (dim1_len - 1)
                          + dim2_size;
      first_elem_offset_host = src_offset0_len * slice_bytes
                               + src_offset1_len * src_dim2_size
                               + src_offset2_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      size_t slice_bytes = dst_dim2_size * dst_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
                          + dst_dim2_size * (dim1_len - 1)
                          + dim2_size;
      first_elem_offset_host = dst_offset0_len * slice_bytes
                               + dst_offset1_len * dst_dim2_size
                               + dst_offset2_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }

  void *agent_ptr;

  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
                                      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller (maybe even with iterated memcpy2d
     calls).  So, don't make this an error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;
  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  int retval = 1;

  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  if (dir == hsaHostToDevice)
    {
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }
  /* Pitch is measured in bytes.  */
  srcpp.pitch = src_dim2_size;
  /* Slice is also measured in bytes (i.e. total per-slice).  */
  srcpp.slice = src_dim2_size * src_dim1_len;

  src_offsets.x = src_offset2_size;
  src_offsets.y = src_offset1_len;
  src_offsets.z = src_offset0_len;

  /* As above.  */
  dstpp.pitch = dst_dim2_size;
  dstpp.slice = dst_dim2_size * dst_dim1_len;

  dst_offsets.x = dst_offset2_size;
  dst_offsets.y = dst_offset1_len;
  dst_offsets.z = dst_offset0_len;

  ranges.x = dim2_size;
  ranges.y = dim1_len;
  ranges.z = dim0_len;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
                                                 &src_offsets, &ranges,
                                                 copy_agent, dir, 0, NULL,
                                                 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    {
      hsa_signal_value_t sv
        = hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
                                              HSA_SIGNAL_CONDITION_LT, 1,
                                              UINT64_MAX,
                                              HSA_WAIT_STATE_ACTIVE);
      if (sv < 0)
        {
          GCN_WARNING ("async copy rect failure");
          retval = -1;
        }
    }

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr
                                             + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}

/* }}}  */
/* {{{ OpenMP Plugin API  */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   to a kernel_info structure, and must have previously been loaded to the
   specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}
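
/* ARGS above is the GOMP_TARGET_ARG_*-encoded sequence defined in
   gomp-constants.h; parse_target_attributes () decodes it (e.g. the requested
   number of teams and the thread limit) into the launch grid dimensions that
   run_kernel () uses -- see parse_target_attributes () for the exact
   decoding.  */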

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
                        void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
                       GOMP_PLUGIN_target_task_completion, async_data);
}

/* }}}  */
/* {{{ OpenACC Plugin API  */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
                           size_t mapnum __attribute__((unused)),
                           void **hostaddrs __attribute__((unused)),
                           void **devaddrs, unsigned *dims,
                           void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
}

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
                                 size_t mapnum __attribute__((unused)),
                                 void **hostaddrs __attribute__((unused)),
                                 void **devaddrs,
                                 unsigned *dims, void *targ_mem_desc,
                                 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
}

/* Create a new asynchronous thread and queue for running future kernels;
   issues a fatal error if the queue cannot be created as all callers expect
   that the queue exists.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = get_agent_info (device);
  aq->prev = NULL;
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;

  if (pthread_mutex_init (&aq->mutex, NULL))
    GOMP_PLUGIN_fatal ("Failed to initialize a GCN agent queue mutex");
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    GOMP_PLUGIN_fatal ("Failed to initialize a GCN agent queue cond");
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    GOMP_PLUGIN_fatal ("Failed to initialize a GCN agent queue cond");

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
                                                     ASYNC_QUEUE_SIZE,
                                                     HSA_QUEUE_TYPE_MULTI,
                                                     hsa_queue_callback, NULL,
                                                     UINT32_MAX, UINT32_MAX,
                                                     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
                       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
             aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                                      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
                                           void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  struct agent_info *agent = get_agent_info (device);

  union goacc_property_value propval = { .val = 0 };

  switch (prop)
    {
    case GOACC_PROPERTY_FREE_MEMORY:
      /* Not supported.  */
      break;
    case GOACC_PROPERTY_MEMORY:
      {
        size_t size;
        hsa_region_t region = agent->data_region;
        hsa_status_t status =
          hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
        if (status == HSA_STATUS_SUCCESS)
          propval.val = size;
        break;
      }
    case GOACC_PROPERTY_NAME:
      propval.ptr = agent->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = agent->vendor_name;
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = hsa_context.driver_version_s;
      break;
    }

  return propval;
}

/* Set up plugin-specific thread-local-data (host-side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local-data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)