Daily bump.
[gcc.git] / libgomp / plugin / plugin-gcn.c
blobd26b93657bf61d0b9060cf5c36315fb68340557e
1 /* Plugin for AMD GCN execution.
3 Copyright (C) 2013-2024 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* {{{ Includes and defines */
31 #include "config.h"
32 #include "symcat.h"
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <pthread.h>
37 #include <inttypes.h>
38 #include <stdbool.h>
39 #include <limits.h>
40 #include <hsa.h>
41 #include <hsa_ext_amd.h>
42 #include <dlfcn.h>
43 #include <signal.h>
44 #include "libgomp-plugin.h"
45 #include "config/gcn/libgomp-gcn.h" /* For struct output. */
46 #include "gomp-constants.h"
47 #include <elf.h>
48 #include "oacc-plugin.h"
49 #include "oacc-int.h"
50 #include <assert.h>
/* These probably won't be in elf.h for a while.  */

#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
#endif

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */

#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
/* Secure getenv() which returns NULL if running as SUID/SGID.  */

#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  /* Only consult the environment when the real and effective IDs match,
     i.e. we are not running SUID/SGID.  */
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif
106 /* }}} */
107 /* {{{ Types */
/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};
117 /* As an HSA runtime is dlopened, following structure defines function
118 pointers utilized by the HSA plug-in. */
120 struct hsa_runtime_fn_info
122 /* HSA runtime. */
123 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
124 const char **status_string);
125 hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
126 void *value);
127 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
128 hsa_agent_info_t attribute,
129 void *value);
130 hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
131 hsa_isa_info_t attribute,
132 uint32_t index,
133 void *value);
134 hsa_status_t (*hsa_init_fn) (void);
135 hsa_status_t (*hsa_iterate_agents_fn)
136 (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
137 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
138 hsa_region_info_t attribute,
139 void *value);
140 hsa_status_t (*hsa_queue_create_fn)
141 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
142 void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
143 void *data, uint32_t private_segment_size,
144 uint32_t group_segment_size, hsa_queue_t **queue);
145 hsa_status_t (*hsa_agent_iterate_regions_fn)
146 (hsa_agent_t agent,
147 hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
148 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
149 hsa_status_t (*hsa_executable_create_fn)
150 (hsa_profile_t profile, hsa_executable_state_t executable_state,
151 const char *options, hsa_executable_t *executable);
152 hsa_status_t (*hsa_executable_global_variable_define_fn)
153 (hsa_executable_t executable, const char *variable_name, void *address);
154 hsa_status_t (*hsa_executable_load_code_object_fn)
155 (hsa_executable_t executable, hsa_agent_t agent,
156 hsa_code_object_t code_object, const char *options);
157 hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
158 const char *options);
159 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
160 uint32_t num_consumers,
161 const hsa_agent_t *consumers,
162 hsa_signal_t *signal);
163 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
164 void **ptr);
165 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
166 hsa_access_permission_t access);
167 hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
168 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
169 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
170 hsa_status_t (*hsa_executable_get_symbol_fn)
171 (hsa_executable_t executable, const char *module_name,
172 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
173 hsa_executable_symbol_t *symbol);
174 hsa_status_t (*hsa_executable_symbol_get_info_fn)
175 (hsa_executable_symbol_t executable_symbol,
176 hsa_executable_symbol_info_t attribute, void *value);
177 hsa_status_t (*hsa_executable_iterate_symbols_fn)
178 (hsa_executable_t executable,
179 hsa_status_t (*callback)(hsa_executable_t executable,
180 hsa_executable_symbol_t symbol, void *data),
181 void *data);
182 uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
183 uint64_t value);
184 uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
185 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
186 hsa_signal_value_t value);
187 void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
188 hsa_signal_value_t value);
189 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
190 (hsa_signal_t signal, hsa_signal_condition_t condition,
191 hsa_signal_value_t compare_value, uint64_t timeout_hint,
192 hsa_wait_state_t wait_state_hint);
193 hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
194 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
196 hsa_status_t (*hsa_code_object_deserialize_fn)
197 (void *serialized_code_object, size_t serialized_code_object_size,
198 const char *options, hsa_code_object_t *code_object);
199 hsa_status_t (*hsa_amd_memory_lock_fn)
200 (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
201 void **agent_ptr);
202 hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
203 hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
204 (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
205 const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
206 const hsa_dim3_t *range, hsa_agent_t copy_agent,
207 hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
208 const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
/* Collection of information needed for a dispatch of a kernel from a
   kernel.  */

struct kernel_dispatch
{
  /* The agent the kernel will be dispatched on.  */
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};
244 /* Structure of the kernargs segment, supporting console output.
246 This needs to match the definitions in Newlib, and the expectations
247 in libgomp target code. */
249 struct kernargs {
250 struct kernargs_abi abi;
252 /* Output data. */
253 struct output output_data;
256 /* A queue entry for a future asynchronous launch. */
258 struct kernel_launch
260 struct kernel_info *kernel;
261 void *vars;
262 struct GOMP_kernel_launch_attributes kla;
/* A queue entry for a future callback.  */

struct callback
{
  /* Function to invoke, and its opaque argument.  */
  void (*fn)(void *);
  void *data;
};
/* A data struct for the copy_data callback.  */

struct copy_data
{
  /* Destination, source and length of the copy.  */
  void *dst;
  const void *src;
  size_t len;
  /* The queue the copy was requested on.  */
  struct goacc_asyncqueue *aq;
};
/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  /* Non-zero once the preceding queue entries have executed; guarded by
     MUTEX and signalled via COND.  */
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};
/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  /* The placeholder the wait is blocked on.  */
  struct placeholder *placeholderp;
};
/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};
309 /* An entry in an async queue. */
311 struct queue_entry
313 enum entry_type type;
314 union {
315 struct kernel_launch launch;
316 struct callback callback;
317 struct asyncwait_info asyncwait;
318 struct placeholder placeholder;
319 } u;
322 /* An async queue header.
324 OpenMP may create one of these.
325 OpenACC may create many. */
327 struct goacc_asyncqueue
329 struct agent_info *agent;
330 hsa_queue_t *hsa_queue;
332 pthread_t thread_drain_queue;
333 pthread_mutex_t mutex;
334 pthread_cond_t queue_cond_in;
335 pthread_cond_t queue_cond_out;
336 struct queue_entry queue[ASYNC_QUEUE_SIZE];
337 int queue_first;
338 int queue_n;
339 int drain_queue_stop;
341 int id;
342 struct goacc_asyncqueue *prev;
343 struct goacc_asyncqueue *next;
/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};
/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};
/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed the libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned ind_func_count;
  const unsigned global_variable_count;
};
382 /* Enum values corresponding to the the ELF architecture codes.
383 Only 'special' values are actually referenced in this file, but having them
384 all may aid debugging. */
386 typedef enum {
387 EF_AMDGPU_MACH_UNSUPPORTED = -1,
388 #define GCN_DEVICE(name, NAME, ELF, ...) \
389 EF_AMDGPU_MACH_AMDGCN_ ## NAME = ELF,
390 #include "../../gcc/config/gcn/gcn-devices.def"
391 } EF_AMDGPU_MACH;
393 const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
394 typedef EF_AMDGPU_MACH gcn_isa;
396 /* Description of an HSA GPU agent (device) and the program associated with
397 it. */
399 struct agent_info
401 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
402 hsa_agent_t id;
403 /* The user-visible device number. */
404 int device_id;
405 /* Whether the agent has been initialized. The fields below are usable only
406 if it has been. */
407 bool initialized;
409 /* The instruction set architecture of the device. */
410 gcn_isa device_isa;
411 /* Name of the agent. */
412 char name[64];
413 /* Name of the vendor of the agent. */
414 char vendor_name[64];
415 /* Command queues of the agent. */
416 hsa_queue_t *sync_queue;
417 struct goacc_asyncqueue *async_queues, *omp_async_queue;
418 pthread_mutex_t async_queues_mutex;
420 /* The HSA memory region from which to allocate kernel arguments. */
421 hsa_region_t kernarg_region;
423 /* The HSA memory region from which to allocate device data. */
424 hsa_region_t data_region;
426 /* Allocated ephemeral memories (team arena and stack space). */
427 struct ephemeral_memories_list *ephemeral_memories_list;
428 pthread_mutex_t ephemeral_memories_write_lock;
430 /* Read-write lock that protects kernels which are running or about to be run
431 from interference with loading and unloading of images. Needs to be
432 locked for reading while a kernel is being run, and for writing if the
433 list of modules is manipulated (and thus the HSA program invalidated). */
434 pthread_rwlock_t module_rwlock;
436 /* The module associated with this kernel. */
437 struct module_info *module;
439 /* Mutex enforcing that only one thread will finalize the HSA program. A
440 thread should have locked agent->module_rwlock for reading before
441 acquiring it. */
442 pthread_mutex_t prog_mutex;
443 /* Flag whether the HSA program that consists of all the modules has been
444 finalized. */
445 bool prog_finalized;
446 /* HSA executable - the finalized program that is used to locate kernels. */
447 hsa_executable_t executable;
/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
454 struct kernel_info
456 /* Name of the kernel, required to locate it within the GCN object-code
457 module. */
458 const char *name;
459 /* The specific agent the kernel has been or will be finalized for and run
460 on. */
461 struct agent_info *agent;
462 /* The specific module where the kernel takes place. */
463 struct module_info *module;
464 /* Information provided by mkoffload associated with the kernel. */
465 struct hsa_kernel_description *description;
466 /* Mutex enforcing that at most once thread ever initializes a kernel for
467 use. A thread should have locked agent->module_rwlock for reading before
468 acquiring it. */
469 pthread_mutex_t init_mutex;
470 /* Flag indicating whether the kernel has been initialized and all fields
471 below it contain valid data. */
472 bool initialized;
473 /* Flag indicating that the kernel has a problem that blocks an execution. */
474 bool initialization_failed;
475 /* The object to be put into the dispatch queue. */
476 uint64_t object;
477 /* Required size of kernel arguments. */
478 uint32_t kernarg_segment_size;
479 /* Required size of group segment. */
480 uint32_t group_segment_size;
481 /* Required size of private segment. */
482 uint32_t private_segment_size;
483 /* Set up for OpenMP or OpenACC? */
484 enum offload_kind kind;
487 /* Information about a particular GCN module, its image and kernels. */
489 struct module_info
491 /* The description with which the program has registered the image. */
492 struct gcn_image_desc *image_desc;
493 /* GCN heap allocation. */
494 struct heap *heap;
495 /* Physical boundaries of the loaded module. */
496 Elf64_Addr phys_address_start;
497 Elf64_Addr phys_address_end;
499 bool constructors_run_p;
500 struct kernel_info *init_array_func, *fini_array_func;
502 /* Number of kernels in this module. */
503 int kernel_count;
504 /* An array of kernel_info structures describing each kernel in this
505 module. */
506 struct kernel_info kernels[];
/* A linked list of memory arenas allocated on the device.
   These are used by OpenMP, as a means to optimize per-team malloc,
   and for host-accessible stack space.  */

struct ephemeral_memories_list
{
  struct ephemeral_memories_list *next;

  /* The size is determined by the number of teams and threads.  */
  size_t size;
  /* The device address allocated memory.  */
  void *address;
  /* A flag to prevent two asynchronous kernels trying to use the same memory.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};
/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};
540 /* }}} */
541 /* {{{ Global variables */
543 /* Information about the whole HSA environment and all of its agents. */
545 static struct hsa_context_info hsa_context;
547 /* HSA runtime functions that are initialized in init_hsa_context. */
549 static struct hsa_runtime_fn_info hsa_fns;
551 /* Heap space, allocated target-side, provided for use of newlib malloc.
552 Each module should have it's own heap allocated.
553 Beware that heap usage increases with OpenMP teams. See also arenas. */
555 static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
557 /* Ephemeral memory sizes for each kernel launch. */
559 static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
560 static int stack_size = DEFAULT_GCN_STACK_SIZE;
561 static int lowlat_size = -1;
563 /* Flag to decide whether print to stderr information about what is going on.
564 Set in init_debug depending on environment variables. */
566 static bool debug;
568 /* Flag to decide if the runtime should suppress a possible fallback to host
569 execution. */
571 static bool suppress_host_fallback;
573 /* Flag to locate HSA runtime shared library that is dlopened
574 by this plug-in. */
576 static const char *hsa_runtime_lib;
578 /* Flag to decide if the runtime should support also CPU devices (can be
579 a simulator). */
581 static bool support_cpu_devices;
583 /* Runtime dimension overrides. Zero indicates default. */
585 static int override_x_dim = 0;
586 static int override_z_dim = 0;
588 /* }}} */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG value is set to true.
   Note: written as a semicolon-free do/while so it behaves as a single
   statement (safe inside unbraced if/else).  */

#define DEBUG_PRINT(...) \
  do \
    { \
      if (debug) \
	{ \
	  fprintf (stderr, __VA_ARGS__); \
	} \
    } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH()				\
  do {						\
    if (debug)					\
      fflush (stderr);				\
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...)			\
  do						\
    {						\
      DEBUG_PRINT (prefix);			\
      DEBUG_PRINT (__VA_ARGS__);		\
      DEBUG_FLUSH ();				\
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
630 /* Print HSA warning STR with an HSA STATUS code. */
632 static void
633 hsa_warn (const char *str, hsa_status_t status)
635 if (!debug)
636 return;
638 const char *hsa_error_msg = "[unknown]";
639 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
641 fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
642 hsa_error_msg);
645 /* Report a fatal error STR together with the HSA error corresponding to STATUS
646 and terminate execution of the current process. */
648 static void
649 hsa_fatal (const char *str, hsa_status_t status)
651 const char *hsa_error_msg = "[unknown]";
652 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
653 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
654 hsa_error_msg);
657 /* Like hsa_fatal, except only report error message, and return FALSE
658 for propagating error processing to outside of plugin. */
660 static bool
661 hsa_error (const char *str, hsa_status_t status)
663 const char *hsa_error_msg = "[unknown]";
664 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
665 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
666 hsa_error_msg);
667 return false;
670 /* Dump information about the available hardware. */
672 static void
673 dump_hsa_system_info (void)
675 hsa_status_t status;
677 hsa_endianness_t endianness;
678 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
679 &endianness);
680 if (status == HSA_STATUS_SUCCESS)
681 switch (endianness)
683 case HSA_ENDIANNESS_LITTLE:
684 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
685 break;
686 case HSA_ENDIANNESS_BIG:
687 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
688 break;
689 default:
690 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
692 else
693 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
695 uint8_t extensions[128];
696 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
697 &extensions);
698 if (status == HSA_STATUS_SUCCESS)
700 if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
701 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
703 else
704 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
707 /* Dump information about the available hardware. */
709 static void
710 dump_machine_model (hsa_machine_model_t machine_model, const char *s)
712 switch (machine_model)
714 case HSA_MACHINE_MODEL_SMALL:
715 GCN_DEBUG ("%s: SMALL\n", s);
716 break;
717 case HSA_MACHINE_MODEL_LARGE:
718 GCN_DEBUG ("%s: LARGE\n", s);
719 break;
720 default:
721 GCN_WARNING ("%s: UNKNOWN\n", s);
722 break;
726 /* Dump information about the available hardware. */
728 static void
729 dump_profile (hsa_profile_t profile, const char *s)
731 switch (profile)
733 case HSA_PROFILE_FULL:
734 GCN_DEBUG ("%s: FULL\n", s);
735 break;
736 case HSA_PROFILE_BASE:
737 GCN_DEBUG ("%s: BASE\n", s);
738 break;
739 default:
740 GCN_WARNING ("%s: UNKNOWN\n", s);
741 break;
745 /* Dump information about a device memory region. */
747 static hsa_status_t
748 dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
750 hsa_status_t status;
752 hsa_region_segment_t segment;
753 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
754 &segment);
755 if (status == HSA_STATUS_SUCCESS)
757 if (segment == HSA_REGION_SEGMENT_GLOBAL)
758 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
759 else if (segment == HSA_REGION_SEGMENT_READONLY)
760 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
761 else if (segment == HSA_REGION_SEGMENT_PRIVATE)
762 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
763 else if (segment == HSA_REGION_SEGMENT_GROUP)
764 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
765 else
766 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
768 else
769 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
771 if (segment == HSA_REGION_SEGMENT_GLOBAL)
773 uint32_t flags;
774 status
775 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
776 &flags);
777 if (status == HSA_STATUS_SUCCESS)
779 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
780 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
781 if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
782 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
783 if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
784 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
786 else
787 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
790 size_t size;
791 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
792 if (status == HSA_STATUS_SUCCESS)
793 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
794 else
795 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
797 status
798 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
799 &size);
800 if (status == HSA_STATUS_SUCCESS)
801 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
802 else
803 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
805 bool alloc_allowed;
806 status
807 = hsa_fns.hsa_region_get_info_fn (region,
808 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
809 &alloc_allowed);
810 if (status == HSA_STATUS_SUCCESS)
811 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
812 else
813 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
815 if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
816 return HSA_STATUS_SUCCESS;
818 status
819 = hsa_fns.hsa_region_get_info_fn (region,
820 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
821 &size);
822 if (status == HSA_STATUS_SUCCESS)
823 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
824 else
825 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
827 size_t align;
828 status
829 = hsa_fns.hsa_region_get_info_fn (region,
830 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
831 &align);
832 if (status == HSA_STATUS_SUCCESS)
833 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
834 else
835 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
837 return HSA_STATUS_SUCCESS;
840 /* Dump information about all the device memory regions. */
842 static void
843 dump_hsa_regions (hsa_agent_t agent)
845 hsa_status_t status;
846 status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
847 dump_hsa_region,
848 NULL);
849 if (status != HSA_STATUS_SUCCESS)
850 hsa_error ("Dumping hsa regions failed", status);
853 /* Dump information about the available devices. */
855 static hsa_status_t
856 dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
858 hsa_status_t status;
860 char buf[64];
861 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
862 &buf);
863 if (status == HSA_STATUS_SUCCESS)
864 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
865 else
866 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
868 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
869 &buf);
870 if (status == HSA_STATUS_SUCCESS)
871 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
872 else
873 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
875 hsa_machine_model_t machine_model;
876 status
877 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
878 &machine_model);
879 if (status == HSA_STATUS_SUCCESS)
880 dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
881 else
882 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
884 hsa_profile_t profile;
885 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
886 &profile);
887 if (status == HSA_STATUS_SUCCESS)
888 dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
889 else
890 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
892 hsa_device_type_t device_type;
893 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
894 &device_type);
895 if (status == HSA_STATUS_SUCCESS)
897 switch (device_type)
899 case HSA_DEVICE_TYPE_CPU:
900 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
901 break;
902 case HSA_DEVICE_TYPE_GPU:
903 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
904 break;
905 case HSA_DEVICE_TYPE_DSP:
906 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
907 break;
908 default:
909 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
910 break;
913 else
914 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
916 uint32_t cu_count;
917 status = hsa_fns.hsa_agent_get_info_fn
918 (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
919 if (status == HSA_STATUS_SUCCESS)
920 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
921 else
922 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
924 uint32_t size;
925 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
926 &size);
927 if (status == HSA_STATUS_SUCCESS)
928 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
929 else
930 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
932 uint32_t max_dim;
933 status = hsa_fns.hsa_agent_get_info_fn (agent,
934 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
935 &max_dim);
936 if (status == HSA_STATUS_SUCCESS)
937 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
938 else
939 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
941 uint32_t max_size;
942 status = hsa_fns.hsa_agent_get_info_fn (agent,
943 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
944 &max_size);
945 if (status == HSA_STATUS_SUCCESS)
946 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
947 else
948 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
950 uint32_t grid_max_dim;
951 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
952 &grid_max_dim);
953 if (status == HSA_STATUS_SUCCESS)
954 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
955 else
956 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
958 uint32_t grid_max_size;
959 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
960 &grid_max_size);
961 if (status == HSA_STATUS_SUCCESS)
962 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
963 else
964 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
966 dump_hsa_regions (agent);
968 return HSA_STATUS_SUCCESS;
971 /* Forward reference. */
973 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
975 /* Helper function for dump_executable_symbols. */
977 static hsa_status_t
978 dump_executable_symbol (hsa_executable_t executable,
979 hsa_executable_symbol_t symbol,
980 void *data __attribute__((unused)))
982 char *name = get_executable_symbol_name (symbol);
984 if (name)
986 GCN_DEBUG ("executable symbol: %s\n", name);
987 free (name);
990 return HSA_STATUS_SUCCESS;
993 /* Dump all global symbol in an executable. */
995 static void
996 dump_executable_symbols (hsa_executable_t executable)
998 hsa_status_t status;
999 status
1000 = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1001 dump_executable_symbol,
1002 NULL);
1003 if (status != HSA_STATUS_SUCCESS)
1004 hsa_fatal ("Could not dump HSA executable symbols", status);
1007 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1009 static void
1010 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1012 struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1014 fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1015 fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1016 fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1017 fprintf (stderr, "%*sheap address: %p\n", indent, "",
1018 (void*)kernargs->abi.heap_ptr);
1019 fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
1020 "", (void*)kernargs->abi.arena_ptr,
1021 kernargs->abi.arena_size_per_team);
1022 fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
1023 "", (void*)kernargs->abi.stack_ptr,
1024 kernargs->abi.stack_size_per_thread);
1025 fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1026 fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1027 dispatch->private_segment_size);
1028 fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
1029 "", dispatch->group_segment_size);
1030 fprintf (stderr, "\n");
1033 /* }}} */
1034 /* {{{ Utility functions */
/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  void *tls = GOMP_PLUGIN_acc_thread ();
  return (struct gcn_thread *) tls;
}
1044 /* Initialize debug and suppress_host_fallback according to the environment. */
1046 static void
1047 init_environment_variables (void)
1049 if (secure_getenv ("GCN_DEBUG"))
1050 debug = true;
1051 else
1052 debug = false;
1054 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1055 suppress_host_fallback = true;
1056 else
1057 suppress_host_fallback = false;
1059 hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1060 if (hsa_runtime_lib == NULL)
1061 hsa_runtime_lib = "libhsa-runtime64.so.1";
1063 support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1065 const char *x = secure_getenv ("GCN_NUM_TEAMS");
1066 if (!x)
1067 x = secure_getenv ("GCN_NUM_GANGS");
1068 if (x)
1069 override_x_dim = atoi (x);
1071 const char *z = secure_getenv ("GCN_NUM_THREADS");
1072 if (!z)
1073 z = secure_getenv ("GCN_NUM_WORKERS");
1074 if (z)
1075 override_z_dim = atoi (z);
1077 const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1078 if (heap)
1080 size_t tmp = atol (heap);
1081 if (tmp)
1082 gcn_kernel_heap_size = tmp;
1085 const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
1086 if (arena)
1088 int tmp = atoi (arena);
1089 if (tmp)
1090 team_arena_size = tmp;;
1093 const char *stack = secure_getenv ("GCN_STACK_SIZE");
1094 if (stack)
1096 int tmp = atoi (stack);
1097 if (tmp)
1098 stack_size = tmp;;
1101 const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
1102 if (lowlat)
1103 lowlat_size = atoi (lowlat);
1106 /* Return malloc'd string with name of SYMBOL. */
1108 static char *
1109 get_executable_symbol_name (hsa_executable_symbol_t symbol)
1111 hsa_status_t status;
1112 char *res;
1113 uint32_t len;
1114 const hsa_executable_symbol_info_t info_name_length
1115 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1117 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1118 &len);
1119 if (status != HSA_STATUS_SUCCESS)
1121 hsa_error ("Could not get length of symbol name", status);
1122 return NULL;
1125 res = GOMP_PLUGIN_malloc (len + 1);
1127 const hsa_executable_symbol_info_t info_name
1128 = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1130 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1132 if (status != HSA_STATUS_SUCCESS)
1134 hsa_error ("Could not get symbol name", status);
1135 free (res);
1136 return NULL;
1139 res[len] = '\0';
1141 return res;
1144 /* Get the number of GPU Compute Units. */
1146 static int
1147 get_cu_count (struct agent_info *agent)
1149 uint32_t cu_count;
1150 hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1151 (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1152 if (status == HSA_STATUS_SUCCESS)
1153 return cu_count;
1154 else
1155 return 64; /* The usual number for older devices. */
/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more inteligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  const int hw_max = 16;
  return threads > hw_max ? hw_max : threads;
}
/* This sets the maximum number of teams to twice the number of GPU Compute
   Units to avoid memory waste and corresponding memory access faults.  */

static int
limit_teams (int teams, struct agent_info *agent)
{
  int cap = 2 * get_cu_count (agent);
  return teams > cap ? cap : teams;
}
1184 /* Parse the target attributes INPUT provided by the compiler and return true
1185 if we should run anything all. If INPUT is NULL, fill DEF with default
1186 values, then store INPUT or DEF into *RESULT.
1188 This is used for OpenMP only. */
1190 static bool
1191 parse_target_attributes (void **input,
1192 struct GOMP_kernel_launch_attributes *def,
1193 struct GOMP_kernel_launch_attributes **result,
1194 struct agent_info *agent)
1196 if (!input)
1197 GOMP_PLUGIN_fatal ("No target arguments provided");
1199 bool grid_attrs_found = false;
1200 bool gcn_dims_found = false;
1201 int gcn_teams = 0;
1202 int gcn_threads = 0;
1203 while (*input)
1205 intptr_t id = (intptr_t) *input++, val;
1207 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1208 val = (intptr_t) *input++;
1209 else
1210 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1212 val = (val > INT_MAX) ? INT_MAX : val;
1214 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
1215 && ((id & GOMP_TARGET_ARG_ID_MASK)
1216 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1218 grid_attrs_found = true;
1219 break;
1221 else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
1222 == GOMP_TARGET_ARG_DEVICE_ALL)
1224 gcn_dims_found = true;
1225 switch (id & GOMP_TARGET_ARG_ID_MASK)
1227 case GOMP_TARGET_ARG_NUM_TEAMS:
1228 gcn_teams = limit_teams (val, agent);
1229 break;
1230 case GOMP_TARGET_ARG_THREAD_LIMIT:
1231 gcn_threads = limit_worker_threads (val);
1232 break;
1233 default:
1239 if (gcn_dims_found)
1241 bool gfx900_workaround_p = false;
1243 if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
1244 && gcn_threads == 0 && override_z_dim == 0)
1246 gfx900_workaround_p = true;
1247 GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1248 "threads to at most 4 per team.\n");
1249 GCN_WARNING (" - If this is not a Vega 10 device, please use "
1250 "GCN_NUM_THREADS=16\n");
1253 /* Ideally, when a dimension isn't explicitly specified, we should
1254 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1255 In practice, we tune for peak performance on BabelStream, which
1256 for OpenACC is currently 32 threads per CU. */
1257 def->ndim = 3;
1258 if (gcn_teams <= 0 && gcn_threads <= 0)
1260 /* Set up a reasonable number of teams and threads. */
1261 gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
1262 def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1263 def->gdims[2] = gcn_threads;
1265 else if (gcn_teams <= 0 && gcn_threads > 0)
1267 /* Auto-scale the number of teams with the number of threads. */
1268 def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1269 def->gdims[2] = gcn_threads;
1271 else if (gcn_teams > 0 && gcn_threads <= 0)
1273 int max_threads = gfx900_workaround_p ? 4 : 16;
1275 /* Auto-scale the number of threads with the number of teams. */
1276 def->gdims[0] = gcn_teams;
1277 def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1278 if (def->gdims[2] == 0)
1279 def->gdims[2] = 1;
1280 else if (def->gdims[2] > max_threads)
1281 def->gdims[2] = max_threads;
1283 else
1285 def->gdims[0] = gcn_teams;
1286 def->gdims[2] = gcn_threads;
1288 def->gdims[1] = 64; /* Each thread is 64 work items wide. */
1289 def->wdims[0] = 1; /* Single team per work-group. */
1290 def->wdims[1] = 64;
1291 def->wdims[2] = 16;
1292 *result = def;
1293 return true;
1295 else if (!grid_attrs_found)
1297 def->ndim = 1;
1298 def->gdims[0] = 1;
1299 def->gdims[1] = 1;
1300 def->gdims[2] = 1;
1301 def->wdims[0] = 1;
1302 def->wdims[1] = 1;
1303 def->wdims[2] = 1;
1304 *result = def;
1305 GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1306 return true;
1309 struct GOMP_kernel_launch_attributes *kla;
1310 kla = (struct GOMP_kernel_launch_attributes *) *input;
1311 *result = kla;
1312 if (kla->ndim == 0 || kla->ndim > 3)
1313 GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1315 GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1316 unsigned i;
1317 for (i = 0; i < kla->ndim; i++)
1319 GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
1320 kla->gdims[i], kla->wdims[i]);
1321 if (kla->gdims[i] == 0)
1322 return false;
1324 return true;
/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  A zero GROUP selects a default that shrinks as
   the dimensionality grows; the result is always clamped to GRID.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      switch (ndim)
	{
	case 1:
	  group = 64;
	  break;
	case 2:
	  group = 8;
	  break;
	default:
	  group = 4;
	  break;
	}
    }
  return group > grid ? grid : group;
}
/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.
   HEADER occupies the low half-word, REST the high half-word; the release
   ordering publishes the rest of the packet before the header becomes
   visible to the packet processor.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  uint32_t word = (uint32_t) header | ((uint32_t) rest << 16);
  __atomic_store_n (packet, word, __ATOMIC_RELEASE);
}
1357 /* A never-called callback for the HSA command queues. These signal events
1358 that we don't use, so we trigger an error.
1360 This "queue" is not to be confused with the async queues, below. */
1362 static void
1363 hsa_queue_callback (hsa_status_t status,
1364 hsa_queue_t *queue __attribute__ ((unused)),
1365 void *data __attribute__ ((unused)))
1367 hsa_fatal ("Asynchronous queue error", status);
1370 /* }}} */
1371 /* {{{ HSA initialization */
1373 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1375 static bool
1376 init_hsa_runtime_functions (void)
1378 #define DLSYM_FN(function) \
1379 hsa_fns.function##_fn = dlsym (handle, #function); \
1380 if (hsa_fns.function##_fn == NULL) \
1381 GOMP_PLUGIN_fatal ("'%s' is missing '%s'", hsa_runtime_lib, #function);
1382 #define DLSYM_OPT_FN(function) \
1383 hsa_fns.function##_fn = dlsym (handle, #function);
1385 void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1386 if (handle == NULL)
1387 return false;
1389 DLSYM_FN (hsa_status_string)
1390 DLSYM_FN (hsa_system_get_info)
1391 DLSYM_FN (hsa_agent_get_info)
1392 DLSYM_FN (hsa_init)
1393 DLSYM_FN (hsa_iterate_agents)
1394 DLSYM_FN (hsa_region_get_info)
1395 DLSYM_FN (hsa_queue_create)
1396 DLSYM_FN (hsa_agent_iterate_regions)
1397 DLSYM_FN (hsa_executable_destroy)
1398 DLSYM_FN (hsa_executable_create)
1399 DLSYM_FN (hsa_executable_global_variable_define)
1400 DLSYM_FN (hsa_executable_load_code_object)
1401 DLSYM_FN (hsa_executable_freeze)
1402 DLSYM_FN (hsa_signal_create)
1403 DLSYM_FN (hsa_memory_allocate)
1404 DLSYM_FN (hsa_memory_assign_agent)
1405 DLSYM_FN (hsa_memory_copy)
1406 DLSYM_FN (hsa_memory_free)
1407 DLSYM_FN (hsa_signal_destroy)
1408 DLSYM_FN (hsa_executable_get_symbol)
1409 DLSYM_FN (hsa_executable_symbol_get_info)
1410 DLSYM_FN (hsa_executable_iterate_symbols)
1411 DLSYM_FN (hsa_queue_add_write_index_release)
1412 DLSYM_FN (hsa_queue_load_read_index_acquire)
1413 DLSYM_FN (hsa_signal_wait_acquire)
1414 DLSYM_FN (hsa_signal_store_relaxed)
1415 DLSYM_FN (hsa_signal_store_release)
1416 DLSYM_FN (hsa_signal_load_acquire)
1417 DLSYM_FN (hsa_queue_destroy)
1418 DLSYM_FN (hsa_code_object_deserialize)
1419 DLSYM_OPT_FN (hsa_amd_memory_lock)
1420 DLSYM_OPT_FN (hsa_amd_memory_unlock)
1421 DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
1422 return true;
1423 #undef DLSYM_OPT_FN
1424 #undef DLSYM_FN
1427 static gcn_isa isa_code (const char *isa);
1429 /* Return true if the agent is a GPU and can accept of concurrent submissions
1430 from different threads. */
1432 static bool
1433 suitable_hsa_agent_p (hsa_agent_t agent)
1435 hsa_device_type_t device_type;
1436 hsa_status_t status
1437 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1438 &device_type);
1439 if (status != HSA_STATUS_SUCCESS)
1440 return false;
1442 switch (device_type)
1444 case HSA_DEVICE_TYPE_GPU:
1446 char name[64];
1447 hsa_status_t status
1448 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, name);
1449 if (status != HSA_STATUS_SUCCESS
1450 || isa_code (name) == EF_AMDGPU_MACH_UNSUPPORTED)
1452 GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
1453 status == HSA_STATUS_SUCCESS ? name : "invalid");
1454 return false;
1457 break;
1458 case HSA_DEVICE_TYPE_CPU:
1459 if (!support_cpu_devices)
1460 return false;
1461 break;
1462 default:
1463 return false;
1466 uint32_t features = 0;
1467 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1468 &features);
1469 if (status != HSA_STATUS_SUCCESS
1470 || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1471 return false;
1472 hsa_queue_type_t queue_type;
1473 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1474 &queue_type);
1475 if (status != HSA_STATUS_SUCCESS
1476 || (queue_type != HSA_QUEUE_TYPE_MULTI))
1477 return false;
1479 return true;
1482 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1483 agent_count in hsa_context. */
1485 static hsa_status_t
1486 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1488 if (suitable_hsa_agent_p (agent))
1489 hsa_context.agent_count++;
1490 return HSA_STATUS_SUCCESS;
1493 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1494 id to the describing structure in the hsa context. The index of the
1495 structure is pointed to by DATA, increment it afterwards. */
1497 static hsa_status_t
1498 assign_agent_ids (hsa_agent_t agent, void *data)
1500 if (suitable_hsa_agent_p (agent))
1502 int *agent_index = (int *) data;
1503 hsa_context.agents[*agent_index].id = agent;
1504 ++*agent_index;
1506 return HSA_STATUS_SUCCESS;
1509 /* Initialize hsa_context if it has not already been done.
1510 If !PROBE: returns TRUE on success.
1511 If PROBE: returns TRUE on success or if the plugin/device shall be silently
1512 ignored, and otherwise emits an error and returns FALSE. */
1514 static bool
1515 init_hsa_context (bool probe)
1517 hsa_status_t status;
1518 int agent_index = 0;
1520 if (hsa_context.initialized)
1521 return true;
1522 init_environment_variables ();
1523 if (!init_hsa_runtime_functions ())
1525 const char *msg = "Run-time could not be dynamically opened";
1526 if (suppress_host_fallback)
1527 GOMP_PLUGIN_fatal ("%s\n", msg);
1528 else
1529 GCN_WARNING ("%s\n", msg);
1530 return probe ? true : false;
1532 status = hsa_fns.hsa_init_fn ();
1533 if (status != HSA_STATUS_SUCCESS)
1534 return hsa_error ("Run-time could not be initialized", status);
1535 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1537 if (debug)
1538 dump_hsa_system_info ();
1540 status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1541 if (status != HSA_STATUS_SUCCESS)
1542 return hsa_error ("GCN GPU devices could not be enumerated", status);
1543 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1545 hsa_context.agents
1546 = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1547 * sizeof (struct agent_info));
1548 status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
1549 if (status != HSA_STATUS_SUCCESS)
1550 return hsa_error ("Scanning compute agents failed", status);
1551 if (agent_index != hsa_context.agent_count)
1553 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1554 return false;
1557 if (debug)
1559 status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1560 if (status != HSA_STATUS_SUCCESS)
1561 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1564 uint16_t minor, major;
1565 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1566 &minor);
1567 if (status != HSA_STATUS_SUCCESS)
1568 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1569 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1570 &major);
1571 if (status != HSA_STATUS_SUCCESS)
1572 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1574 size_t len = sizeof hsa_context.driver_version_s;
1575 int printed = snprintf (hsa_context.driver_version_s, len,
1576 "HSA Runtime %hu.%hu", (unsigned short int)major,
1577 (unsigned short int)minor);
1578 if (printed >= len)
1579 GCN_WARNING ("HSA runtime version string was truncated."
1580 "Version %hu.%hu is too long.", (unsigned short int)major,
1581 (unsigned short int)minor);
1583 hsa_context.initialized = true;
1584 return true;
1587 /* Verify that hsa_context has already been initialized and return the
1588 agent_info structure describing device number N. Return NULL on error. */
1590 static struct agent_info *
1591 get_agent_info (int n)
1593 if (!hsa_context.initialized)
1595 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1596 return NULL;
1598 if (n >= hsa_context.agent_count)
1600 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1601 return NULL;
1603 if (!hsa_context.agents[n].initialized)
1605 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1606 return NULL;
1608 return &hsa_context.agents[n];
1611 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1613 Selects (breaks at) a suitable region of type KIND. */
1615 static hsa_status_t
1616 get_memory_region (hsa_region_t region, hsa_region_t *retval,
1617 hsa_region_global_flag_t kind)
1619 hsa_status_t status;
1620 hsa_region_segment_t segment;
1622 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1623 &segment);
1624 if (status != HSA_STATUS_SUCCESS)
1625 return status;
1626 if (segment != HSA_REGION_SEGMENT_GLOBAL)
1627 return HSA_STATUS_SUCCESS;
1629 uint32_t flags;
1630 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1631 &flags);
1632 if (status != HSA_STATUS_SUCCESS)
1633 return status;
1634 if (flags & kind)
1636 *retval = region;
1637 return HSA_STATUS_INFO_BREAK;
1639 return HSA_STATUS_SUCCESS;
1642 /* Callback of hsa_agent_iterate_regions.
1644 Selects a kernargs memory region. */
1646 static hsa_status_t
1647 get_kernarg_memory_region (hsa_region_t region, void *data)
1649 return get_memory_region (region, (hsa_region_t *)data,
1650 HSA_REGION_GLOBAL_FLAG_KERNARG);
1653 /* Callback of hsa_agent_iterate_regions.
1655 Selects a coarse-grained memory region suitable for the heap and
1656 offload data. */
1658 static hsa_status_t
1659 get_data_memory_region (hsa_region_t region, void *data)
1661 return get_memory_region (region, (hsa_region_t *)data,
1662 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1665 static int
1666 elf_gcn_isa_field (Elf64_Ehdr *image)
1668 return image->e_flags & EF_AMDGPU_MACH_MASK;
1671 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1672 support the ISA. */
1674 static const char*
1675 isa_name (int isa) {
1676 switch(isa)
1678 #define GCN_DEVICE(name, NAME, ELF, ...) \
1679 case ELF: return #name;
1680 #include "../../gcc/config/gcn/gcn-devices.def"
1682 return NULL;
1685 /* Returns the code which is used in the GCN object code to identify the ISA with
1686 the given name (as used by the HSA runtime). */
1688 static gcn_isa
1689 isa_code(const char *isa) {
1690 #define GCN_DEVICE(name, NAME, ELF, ...) \
1691 if (!strcmp (isa, #name)) return ELF;
1692 #include "../../gcc/config/gcn/gcn-devices.def"
1694 return EF_AMDGPU_MACH_UNSUPPORTED;
1697 /* CDNA2 devices have twice as many VGPRs compared to older devices. */
1699 static int
1700 max_isa_vgprs (int isa)
1702 switch (isa)
1704 #define GCN_DEVICE(name, NAME, ELF, ISA, XNACK, SRAM, WAVE64, CU, \
1705 MAX_ISA_VGPRS, ...) \
1706 case ELF: return MAX_ISA_VGPRS;
1707 #include "../../gcc/config/gcn/gcn-devices.def"
1708 default:
1709 GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
1713 /* }}} */
1714 /* {{{ Run */
1716 /* Create or reuse a team arena and stack space.
1718 Team arenas are used by OpenMP to avoid calling malloc multiple times
1719 while setting up each team. This is purely a performance optimization.
1721 The stack space is used by all kernels. We must allocate it in such a
1722 way that the reverse offload implmentation can access the data.
1724 Allocating this memory costs performance, so this function will reuse an
1725 existing allocation if a large enough one is idle.
1726 The memory lock is released, but not deallocated, when the kernel exits. */
1728 static void
1729 configure_ephemeral_memories (struct kernel_info *kernel,
1730 struct kernargs_abi *kernargs, int num_teams,
1731 int num_threads)
1733 struct agent_info *agent = kernel->agent;
1734 struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
1735 struct ephemeral_memories_list *item;
1737 int actual_arena_size = (kernel->kind == KIND_OPENMP
1738 ? team_arena_size : 0);
1739 int actual_arena_total_size = actual_arena_size * num_teams;
1740 size_t size = (actual_arena_total_size
1741 + num_teams * num_threads * stack_size);
1743 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1745 if (item->size < size)
1746 continue;
1748 if (pthread_mutex_trylock (&item->in_use) == 0)
1749 break;
1752 if (!item)
1754 GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1755 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
1756 num_teams, num_threads, size);
1758 if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
1760 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1761 return;
1763 item = malloc (sizeof (*item));
1764 item->size = size;
1765 item->next = NULL;
1766 *next_ptr = item;
1768 if (pthread_mutex_init (&item->in_use, NULL))
1770 GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1771 return;
1773 if (pthread_mutex_lock (&item->in_use))
1775 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1776 return;
1778 if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
1780 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1781 return;
1784 hsa_status_t status;
1785 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
1786 &item->address);
1787 if (status != HSA_STATUS_SUCCESS)
1788 hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1789 status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
1790 HSA_ACCESS_PERMISSION_RW);
1791 if (status != HSA_STATUS_SUCCESS)
1792 hsa_fatal ("Could not assign arena & stack memory to device", status);
1795 kernargs->arena_ptr = (actual_arena_total_size
1796 ? (uint64_t)item->address
1797 : 0);
1798 kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
1799 kernargs->arena_size_per_team = actual_arena_size;
1800 kernargs->stack_size_per_thread = stack_size;
1803 /* Mark an ephemeral memory space available for reuse. */
1805 static void
1806 release_ephemeral_memories (struct agent_info* agent, void *address)
1808 struct ephemeral_memories_list *item;
1810 for (item = agent->ephemeral_memories_list; item; item = item->next)
1812 if (item->address == address)
1814 if (pthread_mutex_unlock (&item->in_use))
1815 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1816 return;
1819 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1822 /* Clean up all the allocated team arenas. */
1824 static bool
1825 destroy_ephemeral_memories (struct agent_info *agent)
1827 struct ephemeral_memories_list *item, *next;
1829 for (item = agent->ephemeral_memories_list; item; item = next)
1831 next = item->next;
1832 hsa_fns.hsa_memory_free_fn (item->address);
1833 if (pthread_mutex_destroy (&item->in_use))
1835 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
1836 return false;
1838 free (item);
1840 agent->ephemeral_memories_list = NULL;
1842 return true;
1845 /* Allocate memory on a specified device. */
1847 static void *
1848 alloc_by_agent (struct agent_info *agent, size_t size)
1850 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1852 void *ptr;
1853 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1854 size, &ptr);
1855 if (status != HSA_STATUS_SUCCESS)
1857 hsa_error ("Could not allocate device memory", status);
1858 return NULL;
1861 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1862 HSA_ACCESS_PERMISSION_RW);
1863 if (status != HSA_STATUS_SUCCESS)
1865 hsa_error ("Could not assign data memory to device", status);
1866 return NULL;
1869 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1870 bool profiling_dispatch_p
1871 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1872 if (profiling_dispatch_p)
1874 acc_prof_info *prof_info = thr->prof_info;
1875 acc_event_info data_event_info;
1876 acc_api_info *api_info = thr->api_info;
1878 prof_info->event_type = acc_ev_alloc;
1880 data_event_info.data_event.event_type = prof_info->event_type;
1881 data_event_info.data_event.valid_bytes
1882 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1883 data_event_info.data_event.parent_construct
1884 = acc_construct_parallel;
1885 data_event_info.data_event.implicit = 1;
1886 data_event_info.data_event.tool_info = NULL;
1887 data_event_info.data_event.var_name = NULL;
1888 data_event_info.data_event.bytes = size;
1889 data_event_info.data_event.host_ptr = NULL;
1890 data_event_info.data_event.device_ptr = (void *) ptr;
1892 api_info->device_api = acc_device_api_other;
1894 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1895 api_info);
1898 return ptr;
1901 /* Create kernel dispatch data structure for given KERNEL, along with
1902 the necessary device signals and memory allocations. */
1904 static struct kernel_dispatch *
1905 create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
1906 int num_threads)
1908 struct agent_info *agent = kernel->agent;
1909 struct kernel_dispatch *shadow
1910 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1912 shadow->agent = kernel->agent;
1913 shadow->object = kernel->object;
1915 hsa_signal_t sync_signal;
1916 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1917 if (status != HSA_STATUS_SUCCESS)
1918 hsa_fatal ("Error creating the GCN sync signal", status);
1920 shadow->signal = sync_signal.handle;
1921 shadow->private_segment_size = kernel->private_segment_size;
1923 if (lowlat_size < 0)
1925 /* Divide the LDS between the number of running teams.
1926 Allocate not less than is defined in the kernel metadata. */
1927 int teams_per_cu = num_teams / get_cu_count (agent);
1928 int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
1929 shadow->group_segment_size
1930 = (kernel->group_segment_size > LDS_per_team
1931 ? kernel->group_segment_size
1932 : LDS_per_team);;
1934 else if (lowlat_size < GCN_LOWLAT_HEAP+8)
1935 /* Ensure that there's space for the OpenMP libgomp data. */
1936 shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
1937 else
1938 shadow->group_segment_size = (lowlat_size > 65536
1939 ? 65536
1940 : lowlat_size);
1942 /* We expect kernels to request a single pointer, explicitly, and the
1943 rest of struct kernargs, implicitly. If they request anything else
1944 then something is wrong. */
1945 if (kernel->kernarg_segment_size > 8)
1947 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1948 return NULL;
1951 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1952 sizeof (struct kernargs),
1953 &shadow->kernarg_address);
1954 if (status != HSA_STATUS_SUCCESS)
1955 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1956 struct kernargs *kernargs = shadow->kernarg_address;
1958 /* Zero-initialize the output_data (minimum needed). */
1959 kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
1960 kernargs->output_data.next_output = 0;
1961 for (unsigned i = 0;
1962 i < (sizeof (kernargs->output_data.queue)
1963 / sizeof (kernargs->output_data.queue[0]));
1964 i++)
1965 kernargs->output_data.queue[i].written = 0;
1966 kernargs->output_data.consumed = 0;
1968 /* Pass in the heap location. */
1969 kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
1971 /* Create the ephemeral memory spaces. */
1972 configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
1974 /* Ensure we can recognize unset return values. */
1975 kernargs->output_data.return_value = 0xcafe0000;
1977 return shadow;
1980 static void
1981 process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
1982 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
1984 int dev_num = dev_num64;
1985 GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
1986 NULL);
1989 /* Output any data written to console output from the kernel. It is expected
1990 that this function is polled during kernel execution.
1992 We print all entries from the last item printed to the next entry without
1993 a "written" flag. If the "final" flag is set then it'll continue right to
1994 the end.
1996 The print buffer is circular, but the from and to locations don't wrap when
1997 the buffer does, so the output limit is UINT_MAX. The target blocks on
1998 output when the buffer is full. */
2000 static void
2001 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
2002 bool final)
2004 unsigned int limit = (sizeof (kernargs->output_data.queue)
2005 / sizeof (kernargs->output_data.queue[0]));
2007 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
2008 __ATOMIC_ACQUIRE);
2009 unsigned int to = kernargs->output_data.next_output;
2011 if (from > to)
2013 /* Overflow. */
2014 if (final)
2015 printf ("GCN print buffer overflowed.\n");
2016 return;
2019 unsigned int i;
2020 for (i = from; i < to; i++)
2022 struct printf_data *data = &kernargs->output_data.queue[i%limit];
2024 if (!data->written && !final)
2025 break;
2027 switch (data->type)
2029 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
2030 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
2031 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
2032 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
2033 case 4:
2034 process_reverse_offload (data->value_u64[0], data->value_u64[1],
2035 data->value_u64[2], data->value_u64[3],
2036 data->value_u64[4], data->value_u64[5]);
2037 break;
2038 default: printf ("GCN print buffer error!\n"); break;
2040 data->written = 0;
2041 __atomic_store_n (&kernargs->output_data.consumed, i+1,
2042 __ATOMIC_RELEASE);
2044 fflush (stdout);
2047 /* Release data structure created for a kernel dispatch in SHADOW argument,
2048 and clean up the signal and memory allocations. */
2050 static void
2051 release_kernel_dispatch (struct kernel_dispatch *shadow)
2053 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2055 struct kernargs *kernargs = shadow->kernarg_address;
2056 void *addr = (void *)kernargs->abi.arena_ptr;
2057 if (!addr)
2058 addr = (void *)kernargs->abi.stack_ptr;
2059 release_ephemeral_memories (shadow->agent, addr);
2061 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2063 hsa_signal_t s;
2064 s.handle = shadow->signal;
2065 hsa_fns.hsa_signal_destroy_fn (s);
2067 free (shadow);
/* Extract the properties from a kernel binary.  */

static void
init_kernel_properties (struct kernel_info *kernel)
{
  hsa_status_t status;
  struct agent_info *agent = kernel->agent;
  hsa_executable_symbol_t kernel_symbol;
  /* The kernel descriptor symbol is named "<kernel>.kd"; "+ 4" covers
     the three characters of ".kd" plus the NUL terminator.  */
  char *buf = alloca (strlen (kernel->name) + 4);
  sprintf (buf, "%s.kd", kernel->name);
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 buf, agent->id,
						 0, &kernel_symbol);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_warn ("Could not find symbol for kernel in the code object", status);
      fprintf (stderr, "not found name: '%s'\n", buf);
      dump_executable_symbols (agent->executable);
      goto failure;
    }
  GCN_DEBUG ("Located kernel %s\n", kernel->name);
  /* Query the handle used in the dispatch packet's kernel_object field.  */
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not extract a kernel object from its symbol", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
     &kernel->kernarg_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel argument size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
     &kernel->group_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel group segment size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
     &kernel->private_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel private segment size",
	       status);

  /* The kernel type is not known until something tries to launch it.  */
  kernel->kind = KIND_UNKNOWN;

  GCN_DEBUG ("Kernel structure for %s fully initialized with "
	     "following segment sizes: \n", kernel->name);
  GCN_DEBUG ("  group_segment_size: %u\n",
	     (unsigned) kernel->group_segment_size);
  GCN_DEBUG ("  private_segment_size: %u\n",
	     (unsigned) kernel->private_segment_size);
  GCN_DEBUG ("  kernarg_segment_size: %u\n",
	     (unsigned) kernel->kernarg_segment_size);
  return;

failure:
  kernel->initialization_failed = true;
}
2129 /* Do all the work that is necessary before running KERNEL for the first time.
2130 The function assumes the program has been created, finalized and frozen by
2131 create_and_finalize_hsa_program. */
2133 static void
2134 init_kernel (struct kernel_info *kernel)
2136 if (pthread_mutex_lock (&kernel->init_mutex))
2137 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2138 if (kernel->initialized)
2140 if (pthread_mutex_unlock (&kernel->init_mutex))
2141 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2142 "mutex");
2144 return;
2147 init_kernel_properties (kernel);
2149 if (!kernel->initialization_failed)
2151 GCN_DEBUG ("\n");
2153 kernel->initialized = true;
2155 if (pthread_mutex_unlock (&kernel->init_mutex))
2156 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2157 "mutex");
/* Run KERNEL on its agent, pass VARS to it as arguments and take
   launch attributes from KLA.

   MODULE_LOCKED indicates that the caller already holds the lock and
   run_kernel need not lock it again.
   If AQ is NULL then agent->sync_queue will be used.  */

static void
run_kernel (struct kernel_info *kernel, void *vars,
	    struct GOMP_kernel_launch_attributes *kla,
	    struct goacc_asyncqueue *aq, bool module_locked)
{
  struct agent_info *agent = kernel->agent;
  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
	     kernel->description->vpgr_count);

  /* Reduce the number of threads/workers if there are insufficient
     VGPRs available to run the kernels together.  */
  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
    {
      int max_vgprs = max_isa_vgprs (agent->device_isa);
      /* Round the VGPR count up to a multiple of 4; assumes the hardware
	 allocates VGPRs in groups of 4 — TODO confirm per ISA.  */
      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
      int max_threads = (max_vgprs / granulated_vgprs) * 4;
      if (kla->gdims[2] > max_threads)
	{
	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
		       " per team/gang - reducing to %d threads/workers.\n",
		       kla->gdims[2], max_threads);
	  kla->gdims[2] = max_threads;
	}
    }

  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
	     (aq ? aq->id : 0));
  GCN_DEBUG ("GCN launch attribs: gdims:[");
  int i;
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i]);
    }
  DEBUG_PRINT ("], normalized gdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
    }
  DEBUG_PRINT ("], wdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->wdims[i]);
    }
  DEBUG_PRINT ("]\n");
  DEBUG_FLUSH ();

  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");

  if (!agent->initialized)
    GOMP_PLUGIN_fatal ("Agent must be initialized");

  if (!kernel->initialized)
    GOMP_PLUGIN_fatal ("Called kernel must be initialized");

  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);

  /* Reserve a packet slot in the AQL queue.  */
  uint64_t index
    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);

  /* Wait until the queue is not full before writing the packet.   */
  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
	 >= command_q->size)
    ;

  /* Do not allow the dimensions to be overridden when running
     constructors or destructors.  */
  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;

  hsa_kernel_dispatch_packet_t *packet;
  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
	   + index % command_q->size;

  /* Skip the first 4 bytes (header and setup): those are written atomically
     by packet_store_release below, which makes the packet live.  */
  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
  packet->grid_size_x = override_x ? : kla->gdims[0];
  packet->workgroup_size_x = get_group_size (kla->ndim,
					     packet->grid_size_x,
					     kla->wdims[0]);

  if (kla->ndim >= 2)
    {
      packet->grid_size_y = kla->gdims[1];
      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
						 kla->wdims[1]);
    }
  else
    {
      packet->grid_size_y = 1;
      packet->workgroup_size_y = 1;
    }

  if (kla->ndim == 3)
    {
      packet->grid_size_z = limit_worker_threads (override_z
						  ? : kla->gdims[2]);
      packet->workgroup_size_z = get_group_size (kla->ndim,
						 packet->grid_size_z,
						 kla->wdims[2]);
    }
  else
    {
      packet->grid_size_z = 1;
      packet->workgroup_size_z = 1;
    }

  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
	     packet->grid_size_x / packet->workgroup_size_x,
	     packet->grid_size_y / packet->workgroup_size_y,
	     packet->grid_size_z / packet->workgroup_size_z,
	     packet->workgroup_size_x, packet->workgroup_size_y,
	     packet->workgroup_size_z);

  struct kernel_dispatch *shadow
    = create_kernel_dispatch (kernel, packet->grid_size_x,
			      packet->grid_size_z);
  shadow->queue = command_q;

  if (debug)
    {
      fprintf (stderr, "\nKernel has following dependencies:\n");
      print_kernel_dispatch (shadow, 2);
    }

  packet->private_segment_size = shadow->private_segment_size;
  packet->group_segment_size = shadow->group_segment_size;
  packet->kernel_object = shadow->object;
  packet->kernarg_address = shadow->kernarg_address;
  hsa_signal_t s;
  s.handle = shadow->signal;
  packet->completion_signal = s;
  /* The completion signal counts down to 0 when the kernel finishes.  */
  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
  /* The first kernarg is the VARS pointer itself.  */
  memcpy (shadow->kernarg_address, &vars, sizeof (vars));

  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");

  uint16_t header;
  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
	     agent->device_id);

  /* Atomically publish the packet header+setup, then ring the doorbell to
     hand the packet to the device.  */
  packet_store_release ((uint32_t *) packet, header,
			(uint16_t) kla->ndim
			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
				       index);

  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");

  /* Root signal waits with 1ms timeout.  */
  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
					     1000 * 1000,
					     HSA_WAIT_STATE_BLOCKED) != 0)
    {
      /* Drain any device-side print output while the kernel runs.  */
      console_output (kernel, shadow->kernarg_address, false);
    }
  /* Final drain after completion.  */
  console_output (kernel, shadow->kernarg_address, true);

  struct kernargs *kernargs = shadow->kernarg_address;
  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;

  release_kernel_dispatch (shadow);

  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");

  /* The top 16 bits of the return value encode how the kernel ended:
     0xcafe for a normal return, 0xffff when the device called exit.  */
  unsigned int upper = (return_value & ~0xffff) >> 16;
  if (upper == 0xcafe)
    ; // exit not called, normal termination.
  else if (upper == 0xffff)
    ; // exit called.
  else
    {
      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
			 return_value);
      abort ();
    }

  if (upper == 0xffff)
    {
      /* Bits 8-15 carry a signal number; bits 0-7 the exit status.  */
      unsigned int signal = (return_value >> 8) & 0xff;

      if (signal == SIGABRT)
	{
	  GCN_WARNING ("GCN Kernel aborted\n");
	  abort ();
	}
      else if (signal != 0)
	{
	  GCN_WARNING ("GCN Kernel received unknown signal\n");
	  abort ();
	}

      /* Propagate the device-side exit () to the host process.  */
      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
      exit (return_value & 0xff);
    }
}
2379 /* }}} */
2380 /* {{{ Load/Unload */
2382 /* Initialize KERNEL from D and other parameters. Return true on success. */
2384 static bool
2385 init_basic_kernel_info (struct kernel_info *kernel,
2386 struct hsa_kernel_description *d,
2387 struct agent_info *agent,
2388 struct module_info *module)
2390 kernel->agent = agent;
2391 kernel->module = module;
2392 kernel->name = d->name;
2393 kernel->description = d;
2394 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2396 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2397 return false;
2399 return true;
2402 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2404 static bool
2405 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2407 int isa_field = elf_gcn_isa_field (image);
2408 const char* isa_s = isa_name (isa_field);
2409 if (!isa_s)
2411 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2412 return false;
2415 if (isa_field != agent->device_isa)
2417 char msg[204];
2418 const char *agent_isa_s = isa_name (agent->device_isa);
2419 assert (agent_isa_s);
2421 snprintf (msg, sizeof msg,
2422 "GCN code object ISA '%s' does not match GPU ISA '%s' "
2423 "(device %d).\n"
2424 "Try to recompile with '-foffload-options=-march=%s',\n"
2425 "or use ROCR_VISIBLE_DEVICES to disable incompatible "
2426 "devices.\n",
2427 isa_s, agent_isa_s, agent->device_id, agent_isa_s);
2429 hsa_error (msg, HSA_STATUS_ERROR);
2430 return false;
2433 return true;
/* Create and finalize the program consisting of all loaded modules.  */

static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
  hsa_status_t status;
  bool res = true;
  if (pthread_mutex_lock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return false;
    }
  /* Another thread may already have finalized the program.  */
  if (agent->prog_finalized)
    goto final;

  status
    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
					HSA_EXECUTABLE_STATE_UNFROZEN,
					"", &agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not create GCN executable", status);
      goto fail;
    }

  /* Load any GCN modules.  */
  struct module_info *module = agent->module;
  if (module)
    {
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;

      if (!isa_matches_agent (agent, image))
	goto fail;

      hsa_code_object_t co = { 0 };
      status = hsa_fns.hsa_code_object_deserialize_fn
	(module->image_desc->gcn_image->image,
	 module->image_desc->gcn_image->size,
	 NULL, &co);
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not deserialize GCN code object", status);
	  goto fail;
	}

      status = hsa_fns.hsa_executable_load_code_object_fn
	(agent->executable, agent->id, co, "");
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not load GCN code object", status);
	  goto fail;
	}

      /* Allocate the device heap once per module and tell the device its
	 size by writing into the heap object itself.  */
      if (!module->heap)
	{
	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
						   gcn_kernel_heap_size,
						   (void**)&module->heap);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not allocate memory for GCN heap", status);
	      goto fail;
	    }

	  status = hsa_fns.hsa_memory_assign_agent_fn
	    (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not assign GCN heap memory to device", status);
	      goto fail;
	    }

	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
				      &gcn_kernel_heap_size,
				      sizeof (gcn_kernel_heap_size));
	}
    }

  if (debug)
    dump_executable_symbols (agent->executable);

  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not freeze the GCN executable", status);
      goto fail;
    }

final:
  /* NOTE(review): prog_finalized is set even on the failure path (fail
     jumps here), so failed finalization is not retried — confirm this is
     the intended behavior.  */
  agent->prog_finalized = true;

  if (pthread_mutex_unlock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      res = false;
    }

  return res;

fail:
  res = false;
  goto final;
}
2541 /* Free the HSA program in agent and everything associated with it and set
2542 agent->prog_finalized and the initialized flags of all kernels to false.
2543 Return TRUE on success. */
2545 static bool
2546 destroy_hsa_program (struct agent_info *agent)
2548 if (!agent->prog_finalized)
2549 return true;
2551 hsa_status_t status;
2553 GCN_DEBUG ("Destroying the current GCN program.\n");
2555 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2556 if (status != HSA_STATUS_SUCCESS)
2557 return hsa_error ("Could not destroy GCN executable", status);
2559 if (agent->module)
2561 int i;
2562 for (i = 0; i < agent->module->kernel_count; i++)
2563 agent->module->kernels[i].initialized = false;
2565 if (agent->module->heap)
2567 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2568 agent->module->heap = NULL;
2571 agent->prog_finalized = false;
2572 return true;
2575 /* Deinitialize all information associated with MODULE and kernels within
2576 it. Return TRUE on success. */
2578 static bool
2579 destroy_module (struct module_info *module, bool locked)
2581 /* Run destructors before destroying module. */
2582 struct GOMP_kernel_launch_attributes kla =
2583 { 3,
2584 /* Grid size. */
2585 { 1, 64, 1 },
2586 /* Work-group size. */
2587 { 1, 64, 1 }
2590 if (module->fini_array_func)
2592 init_kernel (module->fini_array_func);
2593 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2595 module->constructors_run_p = false;
2597 int i;
2598 for (i = 0; i < module->kernel_count; i++)
2599 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2601 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2602 "mutex");
2603 return false;
2606 return true;
2609 /* }}} */
2610 /* {{{ Async */
/* Execute the queue entry at INDEX in async queue AQ on the current thread.
   (The previous comment here, "Callback of dispatch queues to report
   errors", described a different function.)  */

static void
execute_queue_entry (struct goacc_asyncqueue *aq, int index)
{
  struct queue_entry *entry = &aq->queue[index];

  switch (entry->type)
    {
    case KERNEL_LAUNCH:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      run_kernel (entry->u.launch.kernel,
		  entry->u.launch.vars,
		  &entry->u.launch.kla, aq, false);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case CALLBACK:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      entry->u.callback.fn (entry->u.callback.data);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case ASYNC_WAIT:
      {
	/* FIXME: is it safe to access a placeholder that may already have
	   been executed?  */
	struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
		     aq->agent->device_id, aq->id, index);

	/* Block until another queue's thread marks the placeholder
	   executed (see the ASYNC_PLACEHOLDER case below).  */
	pthread_mutex_lock (&placeholderp->mutex);

	while (!placeholderp->executed)
	  pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);

	pthread_mutex_unlock (&placeholderp->mutex);

	if (pthread_cond_destroy (&placeholderp->cond))
	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");

	if (pthread_mutex_destroy (&placeholderp->mutex))
	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait "
		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);

	break;
      }

    case ASYNC_PLACEHOLDER:
      /* Flag the placeholder as reached and wake any waiter.  */
      pthread_mutex_lock (&entry->u.placeholder.mutex);
      entry->u.placeholder.executed = 1;
      pthread_cond_signal (&entry->u.placeholder.cond);
      pthread_mutex_unlock (&entry->u.placeholder.mutex);
      break;

    default:
      GOMP_PLUGIN_fatal ("Unknown queue element");
    }
}
/* This function is run as a thread to service an async queue in the
   background.  It runs continuously until the stop flag is set.  */

static void *
drain_queue (void *thread_arg)
{
  struct goacc_asyncqueue *aq = thread_arg;

  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      /* Debug mode: entries are executed on demand by wait_queue instead,
	 so mark this thread as already stopped and exit.  */
      aq->drain_queue_stop = 2;
      return NULL;
    }

  pthread_mutex_lock (&aq->mutex);

  while (true)
    {
      if (aq->drain_queue_stop)
	break;

      if (aq->queue_n > 0)
	{
	  /* Release the lock while the entry executes so other threads can
	     push new entries; the entry itself may block for a long time.  */
	  pthread_mutex_unlock (&aq->mutex);
	  execute_queue_entry (aq, aq->queue_first);

	  /* Retire the entry and notify anyone waiting for queue space or
	     for the queue to empty.  */
	  pthread_mutex_lock (&aq->mutex);
	  aq->queue_first = ((aq->queue_first + 1)
			     % ASYNC_QUEUE_SIZE);
	  aq->queue_n--;

	  if (DEBUG_THREAD_SIGNAL)
	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_broadcast (&aq->queue_cond_out);
	  pthread_mutex_unlock (&aq->mutex);

	  if (DEBUG_QUEUES)
	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
		       aq->id);
	  pthread_mutex_lock (&aq->mutex);
	}
      else
	{
	  /* Queue is empty: sleep until a producer signals cond_in.  */
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
		       aq->agent->device_id, aq->id);
	}
    }

  /* Tell finalize_async_thread that we have observed the stop request.  */
  aq->drain_queue_stop = 2;
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_broadcast (&aq->queue_cond_out);
  pthread_mutex_unlock (&aq->mutex);

  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
  return NULL;
}
2749 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2750 is not usually the case. This is just a debug tool. */
2752 static void
2753 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2755 pthread_mutex_lock (&aq->mutex);
2757 while (aq->queue_n > 0)
2759 execute_queue_entry (aq, aq->queue_first);
2761 aq->queue_first = ((aq->queue_first + 1)
2762 % ASYNC_QUEUE_SIZE);
2763 aq->queue_n--;
2766 pthread_mutex_unlock (&aq->mutex);
2769 /* Block the current thread until an async queue is writable. The aq->mutex
2770 lock should be held on entry, and remains locked on exit. */
2772 static void
2773 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2775 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2777 /* Queue is full. Wait for it to not be full. */
2778 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2779 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2783 /* Request an asynchronous kernel launch on the specified queue. This
2784 may block if the queue is full, but returns without waiting for the
2785 kernel to run. */
2787 static void
2788 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2789 void *vars, struct GOMP_kernel_launch_attributes *kla)
2791 assert (aq->agent == kernel->agent);
2793 pthread_mutex_lock (&aq->mutex);
2795 wait_for_queue_nonfull (aq);
2797 int queue_last = ((aq->queue_first + aq->queue_n)
2798 % ASYNC_QUEUE_SIZE);
2799 if (DEBUG_QUEUES)
2800 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2801 aq->id, queue_last);
2803 aq->queue[queue_last].type = KERNEL_LAUNCH;
2804 aq->queue[queue_last].u.launch.kernel = kernel;
2805 aq->queue[queue_last].u.launch.vars = vars;
2806 aq->queue[queue_last].u.launch.kla = *kla;
2808 aq->queue_n++;
2810 if (DEBUG_THREAD_SIGNAL)
2811 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2812 aq->agent->device_id, aq->id);
2813 pthread_cond_signal (&aq->queue_cond_in);
2815 pthread_mutex_unlock (&aq->mutex);
2818 /* Request an asynchronous callback on the specified queue. The callback
2819 function will be called, with the given opaque data, from the appropriate
2820 async thread, when all previous items on that queue are complete. */
2822 static void
2823 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2824 void *data)
2826 pthread_mutex_lock (&aq->mutex);
2828 wait_for_queue_nonfull (aq);
2830 int queue_last = ((aq->queue_first + aq->queue_n)
2831 % ASYNC_QUEUE_SIZE);
2832 if (DEBUG_QUEUES)
2833 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2834 aq->id, queue_last);
2836 aq->queue[queue_last].type = CALLBACK;
2837 aq->queue[queue_last].u.callback.fn = fn;
2838 aq->queue[queue_last].u.callback.data = data;
2840 aq->queue_n++;
2842 if (DEBUG_THREAD_SIGNAL)
2843 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2844 aq->agent->device_id, aq->id);
2845 pthread_cond_signal (&aq->queue_cond_in);
2847 pthread_mutex_unlock (&aq->mutex);
/* Request that a given async thread wait for another thread (unspecified) to
   reach the given placeholder.  The wait will occur when all previous entries
   on the queue are complete.  A placeholder is effectively a kind of signal
   which simply sets a flag when encountered in a queue.  */

static void
queue_push_asyncwait (struct goacc_asyncqueue *aq,
		      struct placeholder *placeholderp)
{
  pthread_mutex_lock (&aq->mutex);

  /* May block until the async thread has retired an entry.  */
  wait_for_queue_nonfull (aq);

  /* The new entry goes one past the last occupied slot (ring buffer).  */
  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_WAIT;
  aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;

  aq->queue_n++;

  /* Wake the async thread in case it is sleeping on an empty queue.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}
/* Add a placeholder into an async queue.  When the async thread reaches the
   placeholder it will set the "executed" flag to true and continue.
   Another thread may be waiting on this thread reaching the placeholder.  */

static struct placeholder *
queue_push_placeholder (struct goacc_asyncqueue *aq)
{
  struct placeholder *placeholderp;

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  /* The new entry goes one past the last occupied slot (ring buffer).  */
  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
	       aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
  placeholderp = &aq->queue[queue_last].u.placeholder;

  if (pthread_mutex_init (&placeholderp->mutex, NULL))
    {
      /* NOTE(review): GOMP_PLUGIN_error appears to return (it is used with
	 "return false" elsewhere in this file), so on failure execution
	 falls through here with aq->mutex already released — confirm this
	 is intentional.  */
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
    }

  if (pthread_cond_init (&placeholderp->cond, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization cond");
    }

  placeholderp->executed = 0;

  aq->queue_n++;

  /* Wake the async thread in case it is sleeping on an empty queue.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);

  return placeholderp;
}
/* Signal an asynchronous thread to terminate, and wait for it to do so.  */

static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  /* Already stopped (drain_queue sets 2 immediately in
     DRAIN_QUEUE_SYNCHRONOUS_P mode, or after a previous finalize).  */
  if (aq->drain_queue_stop == 2)
    {
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  /* Request the stop ...  */
  aq->drain_queue_stop = 1;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  /* ... and wait until drain_queue acknowledges it by setting
     drain_queue_stop to 2 and broadcasting queue_cond_out.  */
  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
		   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
		   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
	     aq->id);
  pthread_mutex_unlock (&aq->mutex);

  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
		       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}
2969 /* Set up an async queue for OpenMP. There will be only one. The
2970 implementation simply uses an OpenACC async queue.
2971 FIXME: is this thread-safe if two threads call this function? */
2973 static void
2974 maybe_init_omp_async (struct agent_info *agent)
2976 if (!agent->omp_async_queue)
2977 agent->omp_async_queue
2978 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2981 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2982 copies from read-only pages. */
2984 static void
2985 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2987 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2989 if (status == HSA_STATUS_SUCCESS)
2990 return;
2992 /* It appears that the copy fails if the source data is in a read-only page.
2993 We can't detect that easily, so try copying the data to a temporary buffer
2994 and doing the copy again if we got an error above. */
2996 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2997 "[%p:+%d]\n", (void *) src, (int) len);
2999 void *src_copy = malloc (len);
3000 memcpy (src_copy, src, len);
3001 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3002 free (src_copy);
3003 if (status != HSA_STATUS_SUCCESS)
3004 GOMP_PLUGIN_error ("memory copy failed");
/* Copy data to or from a device.  This is intended for use as an async
   callback event.  */

static void
copy_data (void *data_)
{
  struct copy_data *data = (struct copy_data *)data_;
  GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
	     data->aq->agent->device_id, data->aq->id, data->len, data->src,
	     data->dst);
  hsa_memory_copy_wrapper (data->dst, data->src, data->len);
  /* DATA was heap-allocated by queue_push_copy; ownership ends here.  */
  free (data);
}
3021 /* Request an asynchronous data copy, to or from a device, on a given queue.
3022 The event will be registered as a callback. */
3024 static void
3025 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
3026 size_t len)
3028 if (DEBUG_QUEUES)
3029 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3030 aq->agent->device_id, aq->id, len, src, dst);
3031 struct copy_data *data
3032 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3033 data->dst = dst;
3034 data->src = src;
3035 data->len = len;
3036 data->aq = aq;
3037 queue_push_callback (aq, copy_data, data);
3040 /* Return true if the given queue is currently empty. */
3042 static int
3043 queue_empty (struct goacc_asyncqueue *aq)
3045 pthread_mutex_lock (&aq->mutex);
3046 int res = aq->queue_n == 0 ? 1 : 0;
3047 pthread_mutex_unlock (&aq->mutex);
3049 return res;
/* Wait for a given queue to become empty.  This implements an OpenACC wait
   directive.  */

static void
wait_queue (struct goacc_asyncqueue *aq)
{
  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      /* Debug mode: no async thread services the queue, so run the pending
	 entries on this thread instead.  */
      drain_queue_synchronous (aq);
      return;
    }

  pthread_mutex_lock (&aq->mutex);

  /* The async thread broadcasts queue_cond_out each time it retires an
     entry; sleep until the queue has fully drained.  */
  while (aq->queue_n > 0)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
		   aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
		   aq->id);
    }

  pthread_mutex_unlock (&aq->mutex);
  GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
}
3081 /* }}} */
3082 /* {{{ OpenACC support */
3084 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3086 static void
3087 gcn_exec (struct kernel_info *kernel,
3088 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3089 struct goacc_asyncqueue *aq)
3091 if (!GOMP_OFFLOAD_can_run (kernel))
3092 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3094 /* If we get here then this must be an OpenACC kernel. */
3095 kernel->kind = KIND_OPENACC;
3097 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3098 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3100 struct hsa_kernel_description *d
3101 = &kernel->module->image_desc->kernel_infos[i];
3102 if (d->name == kernel->name)
3104 hsa_kernel_desc = d;
3105 break;
3109 /* We may have statically-determined dimensions in
3110 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3111 invocation at runtime in dims[]. We allow static dimensions to take
3112 priority over dynamic dimensions when present (non-zero). */
3113 if (hsa_kernel_desc->oacc_dims[0] > 0)
3114 dims[0] = hsa_kernel_desc->oacc_dims[0];
3115 if (hsa_kernel_desc->oacc_dims[1] > 0)
3116 dims[1] = hsa_kernel_desc->oacc_dims[1];
3117 if (hsa_kernel_desc->oacc_dims[2] > 0)
3118 dims[2] = hsa_kernel_desc->oacc_dims[2];
3120 /* Ideally, when a dimension isn't explicitly specified, we should
3121 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3122 In practice, we tune for peak performance on BabelStream, which
3123 for OpenACC is currently 32 threads per CU. */
3124 if (dims[0] == 0 && dims[1] == 0)
3126 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3127 number. There isn't really a correct answer for this without a clue
3128 about the problem size, so let's do a reasonable number of workers
3129 and gangs. */
3131 dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */
3132 dims[1] = 8; /* Workers. */
3134 else if (dims[0] == 0 && dims[1] > 0)
3136 /* Auto-scale the number of gangs with the requested number of workers. */
3137 dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3139 else if (dims[0] > 0 && dims[1] == 0)
3141 /* Auto-scale the number of workers with the requested number of gangs. */
3142 dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3143 if (dims[1] == 0)
3144 dims[1] = 1;
3145 if (dims[1] > 16)
3146 dims[1] = 16;
3149 /* The incoming dimensions are expressed in terms of gangs, workers, and
3150 vectors. The HSA dimensions are expressed in terms of "work-items",
3151 which means multiples of vector lanes.
3153 The "grid size" specifies the size of the problem space, and the
3154 "work-group size" specifies how much of that we want a single compute
3155 unit to chew on at once.
3157 The three dimensions do not really correspond to hardware, but the
3158 important thing is that the HSA runtime will launch as many
3159 work-groups as it takes to process the entire grid, and each
3160 work-group will contain as many wave-fronts as it takes to process
3161 the work-items in that group.
3163 Essentially, as long as we set the Y dimension to 64 (the number of
3164 vector lanes in hardware), and the Z group size to the maximum (16),
3165 then we will get the gangs (X) and workers (Z) launched as we expect.
3167 The reason for the apparent reversal of vector and worker dimension
3168 order is to do with the way the run-time distributes work-items across
3169 v1 and v2. */
3170 struct GOMP_kernel_launch_attributes kla =
3172 /* Grid size. */
3173 {dims[0], 64, dims[1]},
3174 /* Work-group size. */
3175 {1, 64, 16}
3178 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3179 acc_prof_info *prof_info = thr->prof_info;
3180 acc_event_info enqueue_launch_event_info;
3181 acc_api_info *api_info = thr->api_info;
3182 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3183 if (profiling_dispatch_p)
3185 prof_info->event_type = acc_ev_enqueue_launch_start;
3187 enqueue_launch_event_info.launch_event.event_type
3188 = prof_info->event_type;
3189 enqueue_launch_event_info.launch_event.valid_bytes
3190 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3191 enqueue_launch_event_info.launch_event.parent_construct
3192 = acc_construct_parallel;
3193 enqueue_launch_event_info.launch_event.implicit = 1;
3194 enqueue_launch_event_info.launch_event.tool_info = NULL;
3195 enqueue_launch_event_info.launch_event.kernel_name
3196 = (char *) kernel->name;
3197 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3198 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3199 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3201 api_info->device_api = acc_device_api_other;
3203 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3204 &enqueue_launch_event_info, api_info);
3207 if (!async)
3208 run_kernel (kernel, devaddrs, &kla, NULL, false);
3209 else
3210 queue_push_launch (aq, kernel, devaddrs, &kla);
3212 if (profiling_dispatch_p)
3214 prof_info->event_type = acc_ev_enqueue_launch_end;
3215 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3216 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3217 &enqueue_launch_event_info,
3218 api_info);
3222 /* }}} */
3223 /* {{{ Generic Plugin API */
3225 /* Return the name of the accelerator, which is "gcn". */
const char *
GOMP_OFFLOAD_get_name (void)
{
  /* Fixed identifier by which libgomp refers to this plugin.  */
  return "gcn";
}
/* Return the UID; if not available return NULL.
   Returns freshly allocated memory.  */
3236 const char *
3237 GOMP_OFFLOAD_get_uid (int ord)
3239 char *str;
3240 hsa_status_t status;
3241 struct agent_info *agent = get_agent_info (ord);
3243 /* HSA documentation states: maximally 21 characters including NUL. */
3244 str = GOMP_PLUGIN_malloc (21 * sizeof (char));
3245 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AMD_AGENT_INFO_UUID,
3246 str);
3247 if (status != HSA_STATUS_SUCCESS)
3249 free (str);
3250 return NULL;
3252 return str;
3255 /* Return the specific capabilities the HSA accelerator have. */
3257 unsigned int
3258 GOMP_OFFLOAD_get_caps (void)
3260 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3261 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3262 | GOMP_OFFLOAD_CAP_OPENACC_200;
3265 /* Identify as GCN accelerator. */
3268 GOMP_OFFLOAD_get_type (void)
3270 return OFFLOAD_TARGET_TYPE_GCN;
3273 /* Return the libgomp version number we're compatible with. There is
3274 no requirement for cross-version compatibility. */
3276 unsigned
3277 GOMP_OFFLOAD_version (void)
3279 return GOMP_VERSION;
3282 /* Return the number of GCN devices on the system. */
3285 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
3287 if (!init_hsa_context (true))
3288 exit (EXIT_FAILURE);
3289 /* Return -1 if no omp_requires_mask cannot be fulfilled but
3290 devices were present. */
3291 if (hsa_context.agent_count > 0
3292 && ((omp_requires_mask
3293 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3294 | GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
3295 | GOMP_REQUIRES_SELF_MAPS
3296 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
3297 return -1;
3298 /* Check whether host page access is supported; this is per system level
3299 (all GPUs supported by HSA). While intrinsically true for APUs, it
3300 requires XNACK support for discrete GPUs. */
3301 if (hsa_context.agent_count > 0
3302 && (omp_requires_mask
3303 & (GOMP_REQUIRES_UNIFIED_SHARED_MEMORY | GOMP_REQUIRES_SELF_MAPS)))
3305 bool b;
3306 hsa_system_info_t type = HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT;
3307 hsa_status_t status = hsa_fns.hsa_system_get_info_fn (type, &b);
3308 if (status != HSA_STATUS_SUCCESS)
3309 GOMP_PLUGIN_error ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT "
3310 "failed");
3311 if (!b)
3312 return -1;
3315 return hsa_context.agent_count;
/* Initialize device (agent) number N so that it can be used for computation.
   Return TRUE on success.  */

bool
GOMP_OFFLOAD_init_device (int n)
{
  if (!init_hsa_context (false))
    return false;
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
      return false;
    }
  struct agent_info *agent = &hsa_context.agents[n];

  /* Initialization is idempotent; a second call is a cheap no-op.  */
  if (agent->initialized)
    return true;

  agent->device_id = n;

  /* Create the locks that guard the agent's module, program, async queues
     and ephemeral memories.  Any failure aborts device initialization.  */
  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
      return false;
    }
  if (pthread_mutex_init (&agent->prog_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return false;
    }
  agent->async_queues = NULL;
  agent->omp_async_queue = NULL;
  agent->ephemeral_memories_list = NULL;

  /* Query the agent's maximum queue size; used when creating the
     synchronous command queue below.  */
  uint32_t queue_size;
  hsa_status_t status;
  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
					  &queue_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error requesting maximum queue size of the GCN agent",
		      status);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
					  &agent->name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the name of the agent", status);

  /* Map the agent name (e.g. "gfx90a") to an ELF ISA code; reject agents
     this plugin was not built to support.  */
  agent->device_isa = isa_code (agent->name);
  if (agent->device_isa == EF_AMDGPU_MACH_UNSUPPORTED)
    {
      /* 33 bytes of message text plus up to 64 bytes of agent name.  */
      char msg[33 + 64 + 1];
      snprintf (msg, sizeof msg,
		"Unknown GCN agent architecture '%s'", agent->name);
      return hsa_error (msg, HSA_STATUS_ERROR);
    }

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
					  &agent->vendor_name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the vendor name of the agent", status);

  /* The synchronous queue is used for kernel launches and copies that are
     not routed through a goacc async queue.  */
  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
					HSA_QUEUE_TYPE_MULTI,
					hsa_queue_callback, NULL, UINT32_MAX,
					UINT32_MAX, &agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error creating command queue", status);

  /* Find a memory region suitable for kernel arguments.  The iterate
     callback sets the handle; (uint64_t) -1 means "not found".  */
  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_kernarg_memory_region,
						 &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
			 "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  /* Likewise for general device data allocations.  */
  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_data_memory_region,
						 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
			 "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}
/* Load GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
   If there are any constructors then run them.  If not NULL, REV_FN_TABLE will
   contain the on-device addresses of the functions for reverse offload.  To be
   freed by the caller.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table,
			 uint64_t **rev_fn_table,
			 uint64_t *host_ind_fn_table)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return -1;
    }

  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
  struct agent_info *agent;
  struct addr_pair *pair;
  struct module_info *module;
  struct kernel_info *kernel;
  int kernel_count = image_desc->kernel_count;
  /* Older image formats carry no indirect-function table.  */
  unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
			      ? image_desc->ind_func_count : 0;
  unsigned var_count = image_desc->global_variable_count;
  /* Currently, "others" is a struct of ICVS.  */
  int other_count = 1;

  agent = get_agent_info (ord);
  if (!agent)
    return -1;

  /* NOTE(review): the early error returns below exit while still holding
     the write lock — confirm whether callers can recover after a failed
     load, or whether an unlock is needed on these paths.  */
  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return -1;
    }
  if (agent->prog_finalized
      && !destroy_hsa_program (agent))
    return -1;

  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
  GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
  GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
  /* "- 2" accounts for the _init_array/_fini_array pseudo-kernels, which
     are never reported to libgomp.  */
  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
			     * sizeof (struct addr_pair));
  *target_table = pair;
  /* The kernel_info array is allocated as a flexible tail of the module.  */
  module = (struct module_info *)
    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
				+ kernel_count * sizeof (struct kernel_info));
  module->image_desc = image_desc;
  module->kernel_count = kernel_count;
  module->heap = NULL;
  module->constructors_run_p = false;

  kernel = &module->kernels[0];

  /* Allocate memory for kernel dependencies.  */
  for (unsigned i = 0; i < kernel_count; i++)
    {
      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
      if (!init_basic_kernel_info (kernel, d, agent, module))
	return -1;
      /* Constructor/destructor pseudo-kernels are remembered on the module
	 but not reported in the address table.  */
      if (strcmp (d->name, "_init_array") == 0)
	module->init_array_func = kernel;
      else if (strcmp (d->name, "_fini_array") == 0)
	module->fini_array_func = kernel;
      else
	{
	  pair->start = (uintptr_t) kernel;
	  pair->end = (uintptr_t) (kernel + 1);
	  pair++;
	}
      kernel++;
    }

  agent->module = module;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return -1;
    }

  if (!create_and_finalize_hsa_program (agent))
    return -1;

  if (var_count > 0)
    {
      /* Read the address/size pairs of all global variables from the
	 .offload_var_table the compiler emitted into the image.  */
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     ".offload_var_table",
						     agent->id,
						     0, &var_symbol);

      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);

      uint64_t var_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &var_table_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);

      /* Device-side table layout: one {address, size} record per variable.  */
      struct {
	uint64_t addr;
	uint64_t size;
      } var_table[var_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
			     (void*)var_table_addr, sizeof (var_table));

      for (unsigned i = 0; i < var_count; i++)
	{
	  pair->start = var_table[i].addr;
	  pair->end = var_table[i].addr + var_table[i].size;
	  GCN_DEBUG ("Found variable at %p with size %lu\n",
		     (void *)var_table[i].addr, var_table[i].size);
	  pair++;
	}
    }

  if (ind_func_count > 0)
    {
      hsa_status_t status;

      /* Read indirect function table from image.  */
      hsa_executable_symbol_t ind_funcs_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     ".offload_ind_func_table",
						     agent->id,
						     0, &ind_funcs_symbol);

      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
		   "code object", status);

      uint64_t ind_funcs_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &ind_funcs_table_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);

      uint64_t ind_funcs_table[ind_func_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
			     (void*) ind_funcs_table_addr,
			     sizeof (ind_funcs_table));

      /* Build host->target address map for indirect functions.  The map is
	 a flat array of {host, device} pairs terminated by a 0 entry.  */
      uint64_t ind_fn_map[ind_func_count * 2 + 1];
      for (unsigned i = 0; i < ind_func_count; i++)
	{
	  ind_fn_map[i * 2] = host_ind_fn_table[i];
	  ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
	  GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
		     i, host_ind_fn_table[i], ind_funcs_table[i]);
	}
      ind_fn_map[ind_func_count * 2] = 0;

      /* Write the map onto the target.  */
      void *map_target_addr
	= GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
      GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);

      GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
			     (void*) ind_fn_map,
			     sizeof (ind_fn_map));

      /* Write address of the map onto the target.  */
      hsa_executable_symbol_t symbol;

      status
	= hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						XSTRING (GOMP_INDIRECT_ADDR_MAP),
						agent->id, 0, &symbol);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
		   status);

      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &varptr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
	 &varsize);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable size from its symbol",
		   status);

      GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
		 varptr, varsize);

      GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
			     &map_target_addr,
			     sizeof (map_target_addr));
    }

  GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));

  /* The final "other" table entry is the device copy of the additional
     ICVs; report a zero range if the image does not contain it.  */
  hsa_status_t status;
  hsa_executable_symbol_t var_symbol;
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 XSTRING (GOMP_ADDITIONAL_ICVS),
						 agent->id, 0, &var_symbol);
  if (status == HSA_STATUS_SUCCESS)
    {
      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &varptr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
	 &varsize);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable size from its symbol",
		   status);

      pair->start = varptr;
      pair->end = varptr + varsize;
    }
  else
    {
      /* The variable was not in this image.  */
      GCN_DEBUG ("Variable not found in image: %s\n",
		 XSTRING (GOMP_ADDITIONAL_ICVS));
      pair->start = pair->end = 0;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  if (rev_fn_table != NULL && kernel_count == 0)
    *rev_fn_table = NULL;
  else if (rev_fn_table != NULL)
    {
      /* Fetch the device addresses of all kernels for reverse offload.  */
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     ".offload_func_table",
						     agent->id, 0, &var_symbol);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);
      uint64_t fn_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &fn_table_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      /* Ownership of this buffer passes to the caller.  */
      *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
      GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
			     (void*) fn_table_addr,
			     kernel_count * sizeof (uint64_t));
    }

  return kernel_count + var_count + other_count;
}
3728 /* Unload GCN object-code module described by struct gcn_image_desc in
3729 TARGET_DATA from agent number N. Return TRUE on success. */
3731 bool
3732 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3734 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3736 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3737 " (expected %u, received %u)",
3738 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3739 return false;
3742 struct agent_info *agent;
3743 agent = get_agent_info (n);
3744 if (!agent)
3745 return false;
3747 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3749 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3750 return false;
3753 if (!agent->module || agent->module->image_desc != target_data)
3755 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3756 "loaded before");
3757 return false;
3760 if (!destroy_module (agent->module, true))
3761 return false;
3762 free (agent->module);
3763 agent->module = NULL;
3764 if (!destroy_hsa_program (agent))
3765 return false;
3766 if (pthread_rwlock_unlock (&agent->module_rwlock))
3768 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3769 return false;
3771 return true;
3774 /* Deinitialize all information and status associated with agent number N. We
3775 do not attempt any synchronization, assuming the user and libgomp will not
3776 attempt deinitialization of a device that is in any way being used at the
3777 same time. Return TRUE on success. */
bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  /* Tearing down an agent that was never initialized is a no-op.  */
  if (!agent->initialized)
    return true;

  /* Drain and destroy the implicit OpenMP async queue first; it may still
     reference the module and program destroyed below.  */
  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
	return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_ephemeral_memories (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  /* Finally release the agent's locks, mirroring init_device.  */
  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}
3838 /* Return true if the HSA runtime can run function FN_PTR. */
3840 bool
3841 GOMP_OFFLOAD_can_run (void *fn_ptr)
3843 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3845 init_kernel (kernel);
3846 if (kernel->initialization_failed)
3847 GOMP_PLUGIN_fatal ("kernel initialization failed");
3849 return true;
3852 /* Allocate memory on device N. */
void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  /* Delegate to the per-agent allocator.  */
  return alloc_by_agent (get_agent_info (n), size);
}
3861 /* Free memory from device N. */
3863 bool
3864 GOMP_OFFLOAD_free (int device, void *ptr)
3866 GCN_DEBUG ("Freeing memory on device %d\n", device);
3868 hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3869 if (status != HSA_STATUS_SUCCESS)
3871 hsa_error ("Could not free device memory", status);
3872 return false;
3875 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3876 bool profiling_dispatch_p
3877 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3878 if (profiling_dispatch_p)
3880 acc_prof_info *prof_info = thr->prof_info;
3881 acc_event_info data_event_info;
3882 acc_api_info *api_info = thr->api_info;
3884 prof_info->event_type = acc_ev_free;
3886 data_event_info.data_event.event_type = prof_info->event_type;
3887 data_event_info.data_event.valid_bytes
3888 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3889 data_event_info.data_event.parent_construct
3890 = acc_construct_parallel;
3891 data_event_info.data_event.implicit = 1;
3892 data_event_info.data_event.tool_info = NULL;
3893 data_event_info.data_event.var_name = NULL;
3894 data_event_info.data_event.bytes = 0;
3895 data_event_info.data_event.host_ptr = NULL;
3896 data_event_info.data_event.device_ptr = (void *) ptr;
3898 api_info->device_api = acc_device_api_other;
3900 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3901 api_info);
3904 return true;
3907 /* Copy data from DEVICE to host. */
3909 bool
3910 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3912 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3913 src, dst);
3914 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3915 if (status != HSA_STATUS_SUCCESS)
3916 GOMP_PLUGIN_error ("memory copy failed");
3917 return true;
3920 /* Copy data from host to DEVICE. */
bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  /* The wrapper takes care of any memory-type quirks; it is used instead of
     calling hsa_memory_copy_fn directly.  */
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}
3931 /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3933 bool
3934 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3936 struct gcn_thread *thread_data = gcn_thread ();
3938 if (thread_data && !async_synchronous_p (thread_data->async))
3940 struct agent_info *agent = get_agent_info (device);
3941 maybe_init_omp_async (agent);
3942 queue_push_copy (agent->omp_async_queue, dst, src, n);
3943 return true;
3946 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3947 device, src, device, dst);
3948 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3949 if (status != HSA_STATUS_SUCCESS)
3950 GOMP_PLUGIN_error ("memory copy failed");
3951 return true;
/* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
   measured in bytes.  So we have:

     dim1_size: number of bytes to copy on innermost dimension ("row")
     dim0_len: number of rows to copy
     dst: base pointer for destination of copy
     dst_offset1_size: innermost row offset (for dest), in bytes
     dst_offset0_len: offset, number of rows (for dest)
     dst_dim1_size: whole-array dest row length, in bytes (pitch)
     src: base pointer for source of copy
     src_offset1_size: innermost row offset (for source), in bytes
     src_offset0_len: offset, number of rows (for source)
     src_dim1_size: whole-array source row length, in bytes (pitch)

   Returns 1 on success, -1 when the fast path cannot be used (the caller
   then falls back to a slower generic implementation).  */

int
GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
		       size_t dim0_len, void *dst, size_t dst_offset1_size,
		       size_t dst_offset0_len, size_t dst_dim1_size,
		       const void *src, size_t src_offset1_size,
		       size_t src_offset0_len, size_t src_dim1_size)
{
  /* All three AMD extension entry points are required for this path.  */
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;

  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last rows of the
     whole array.  Something like this:

	 rows -->
	 ..............
     c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
     o | ++#######+++++ <- whole row
     l | ++#######+++++ <- "
     s v ++#######..... <- last row apart from trailing remainder
	 ..............

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */

  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
      first_elem_offset_host = src_offset0_len * src_dim1_size
			       + src_offset1_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
      first_elem_offset_host = dst_offset0_len * dst_dim1_size
			       + dst_offset1_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }

  void *agent_ptr;

  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
				      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller.  So, don't make this an
     error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;

  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  int retval = 1;

  /* The copy is asynchronous; the signal is decremented on completion and
     waited on below.  */
  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  /* The pinned-side base pointer is rebased so that the offsets passed to
     the rect copy remain valid for both sides.  */
  if (dir == hsaHostToDevice)
    {
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }

  srcpp.pitch = src_dim1_size;
  srcpp.slice = 0;

  src_offsets.x = src_offset1_size;
  src_offsets.y = src_offset0_len;
  src_offsets.z = 0;

  dstpp.pitch = dst_dim1_size;
  dstpp.slice = 0;

  dst_offsets.x = dst_offset1_size;
  dst_offsets.y = dst_offset0_len;
  dst_offsets.z = 0;

  ranges.x = dim1_size;
  ranges.y = dim0_len;
  ranges.z = 1;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
						 &src_offsets, &ranges,
						 copy_agent, dir, 0, NULL,
						 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
					HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
					HSA_WAIT_STATE_ACTIVE);

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}
4113 /* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
4114 measured in bytes. So we have:
4116 dim2_size: number of bytes to copy on innermost dimension ("row")
4117 dim1_len: number of rows per slice to copy
4118 dim0_len: number of slices to copy
4119 dst: base pointer for destination of copy
4120 dst_offset2_size: innermost row offset (for dest), in bytes
4121 dst_offset1_len: offset, number of rows (for dest)
4122 dst_offset0_len: offset, number of slices (for dest)
4123 dst_dim2_size: whole-array dest row length, in bytes (pitch)
4124 dst_dim1_len: whole-array number of rows in slice (for dest)
4125 src: base pointer for source of copy
4126 src_offset2_size: innermost row offset (for source), in bytes
4127 src_offset1_len: offset, number of rows (for source)
4128 src_offset0_len: offset, number of slices (for source)
4129 src_dim2_size: whole-array source row length, in bytes (pitch)
4130 src_dim1_len: whole-array number of rows in slice (for source)
GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
		       size_t dim1_len, size_t dim0_len, void *dst,
		       size_t dst_offset2_size, size_t dst_offset1_len,
		       size_t dst_offset0_len, size_t dst_dim2_size,
		       size_t dst_dim1_len, const void *src,
		       size_t src_offset2_size, size_t src_offset1_len,
		       size_t src_offset0_len, size_t src_dim2_size,
		       size_t src_dim1_len)
{
  /* All three AMD extension entry points are required; if the installed
     runtime does not provide them, return -1 so the caller can fall back
     to a generic copy path.  */
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;

  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.
     (Device ordinal -1 denotes the host side; exactly one end must be
     the host.)  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last slices of
     the whole 3D array.  Something like this:

	  slice 0:	  slice 1:	  slice 2:
	  __________	  __________	  __________
       ^ /+++++++++/  :  /+++++++++/  :  /	   /
  column /+++##++++/| |	/+++##++++/| |	/+++##	  /	# = subarray
       / / ##++++/ | |/+++##++++/ | |/+++##++++/	+ = area to pin
      /_________/  : /_________/  : /_________/
	     row --->

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */

  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      /* Host side is the source: pin from the subarray's first element to
	 its last element, inclusive of the final partial row.  */
      size_t slice_bytes = src_dim2_size * src_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
			  + src_dim2_size * (dim1_len - 1)
			  + dim2_size;
      first_elem_offset_host = src_offset0_len * slice_bytes
			       + src_offset1_len * src_dim2_size
			       + src_offset2_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      /* Host side is the destination: same bounding computation, using the
	 destination's pitch/slice geometry.  */
      size_t slice_bytes = dst_dim2_size * dst_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
			  + dst_dim2_size * (dim1_len - 1)
			  + dim2_size;
      first_elem_offset_host = dst_offset0_len * slice_bytes
			       + dst_offset1_len * dst_dim2_size
			       + dst_offset2_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }

  void *agent_ptr;

  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
				      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller (maybe even with iterated memcpy2d
     calls).  So, don't make this an error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;

  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  /* 1 indicates success; any failure from here on must still fall through
     to the unlock below.  */
  int retval = 1;

  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  if (dir == hsaHostToDevice)
    {
      /* AGENT_PTR aliases the pinned region starting at the subarray's first
	 element; step back so that the offsets below address it the same way
	 as the original host pointer.  */
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }

  /* Pitch is measured in bytes.  */
  srcpp.pitch = src_dim2_size;
  /* Slice is also measured in bytes (i.e. total per-slice).  */
  srcpp.slice = src_dim2_size * src_dim1_len;

  /* x is in bytes; y and z are in elements (rows/slices).  */
  src_offsets.x = src_offset2_size;
  src_offsets.y = src_offset1_len;
  src_offsets.z = src_offset0_len;

  /* As above.  */
  dstpp.pitch = dst_dim2_size;
  dstpp.slice = dst_dim2_size * dst_dim1_len;

  dst_offsets.x = dst_offset2_size;
  dst_offsets.y = dst_offset1_len;
  dst_offsets.z = dst_offset0_len;

  ranges.x = dim2_size;
  ranges.y = dim1_len;
  ranges.z = dim0_len;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
						 &src_offsets, &ranges,
						 copy_agent, dir, 0, NULL,
						 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    {
      /* Wait for the signal to drop below 1, i.e. for the asynchronous copy
	 to complete.  */
      hsa_signal_value_t sv
	= hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
					      HSA_SIGNAL_CONDITION_LT, 1,
					      UINT64_MAX,
					      HSA_WAIT_STATE_ACTIVE);
      if (sv < 0)
	{
	  GCN_WARNING ("async copy rect failure");
	  retval = -1;
	}
    }

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}
4300 /* }}} */
4301 /* {{{ OpenMP Plugin API */
4303 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
4304 in VARS as a parameter. The kernel is identified by FN_PTR which must point
4305 to a kernel_info structure, and must have previously been loaded to the
4306 specified device. */
4308 void
4309 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
4311 struct agent_info *agent = get_agent_info (device);
4312 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4313 struct GOMP_kernel_launch_attributes def;
4314 struct GOMP_kernel_launch_attributes *kla;
4315 assert (agent == kernel->agent);
4317 /* If we get here then the kernel must be OpenMP. */
4318 kernel->kind = KIND_OPENMP;
4320 if (!parse_target_attributes (args, &def, &kla, agent))
4322 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4323 return;
4325 run_kernel (kernel, vars, kla, NULL, false);
4328 /* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
4329 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
4330 GOMP_PLUGIN_target_task_completion when it has finished. */
4332 void
4333 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
4334 void **args, void *async_data)
4336 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
4337 struct agent_info *agent = get_agent_info (device);
4338 struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
4339 struct GOMP_kernel_launch_attributes def;
4340 struct GOMP_kernel_launch_attributes *kla;
4341 assert (agent == kernel->agent);
4343 /* If we get here then the kernel must be OpenMP. */
4344 kernel->kind = KIND_OPENMP;
4346 if (!parse_target_attributes (args, &def, &kla, agent))
4348 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4349 return;
4352 maybe_init_omp_async (agent);
4353 queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
4354 queue_push_callback (agent->omp_async_queue,
4355 GOMP_PLUGIN_target_task_completion, async_data);
4358 /* }}} */
4359 /* {{{ OpenACC Plugin API */
4361 /* Run a synchronous OpenACC kernel. The device number is inferred from the
4362 already-loaded KERNEL. */
4364 void
4365 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
4366 size_t mapnum __attribute__((unused)),
4367 void **hostaddrs __attribute__((unused)),
4368 void **devaddrs, unsigned *dims,
4369 void *targ_mem_desc)
4371 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4373 gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
4376 /* Run an asynchronous OpenACC kernel on the specified queue. */
4378 void
4379 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
4380 size_t mapnum __attribute__((unused)),
4381 void **hostaddrs __attribute__((unused)),
4382 void **devaddrs,
4383 unsigned *dims, void *targ_mem_desc,
4384 struct goacc_asyncqueue *aq)
4386 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4388 gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
4391 /* Create a new asynchronous thread and queue for running future kernels;
4392 issues a fatal error if the queue cannot be created as all callers expect
4393 that the queue exists. */
4395 struct goacc_asyncqueue *
4396 GOMP_OFFLOAD_openacc_async_construct (int device)
4398 struct agent_info *agent = get_agent_info (device);
4400 pthread_mutex_lock (&agent->async_queues_mutex);
4402 struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
4403 aq->agent = get_agent_info (device);
4404 aq->prev = NULL;
4405 aq->next = agent->async_queues;
4406 if (aq->next)
4408 aq->next->prev = aq;
4409 aq->id = aq->next->id + 1;
4411 else
4412 aq->id = 1;
4413 agent->async_queues = aq;
4415 aq->queue_first = 0;
4416 aq->queue_n = 0;
4417 aq->drain_queue_stop = 0;
4419 if (pthread_mutex_init (&aq->mutex, NULL))
4421 GOMP_PLUGIN_fatal ("Failed to initialize a GCN agent queue mutex");
4422 return NULL;
4424 if (pthread_cond_init (&aq->queue_cond_in, NULL))
4426 GOMP_PLUGIN_fatal ("Failed to initialize a GCN agent queue cond");
4427 return NULL;
4429 if (pthread_cond_init (&aq->queue_cond_out, NULL))
4431 GOMP_PLUGIN_fatal ("Failed to initialize a GCN agent queue cond");
4432 return NULL;
4435 hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
4436 ASYNC_QUEUE_SIZE,
4437 HSA_QUEUE_TYPE_MULTI,
4438 hsa_queue_callback, NULL,
4439 UINT32_MAX, UINT32_MAX,
4440 &aq->hsa_queue);
4441 if (status != HSA_STATUS_SUCCESS)
4442 hsa_fatal ("Error creating command queue", status);
4444 int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
4445 if (err != 0)
4446 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4447 strerror (err));
4448 GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
4449 aq->id);
4451 pthread_mutex_unlock (&agent->async_queues_mutex);
4453 return aq;
4456 /* Destroy an existing asynchronous thread and queue. Waits for any
4457 currently-running task to complete, but cancels any queued tasks. */
4459 bool
4460 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
4462 struct agent_info *agent = aq->agent;
4464 finalize_async_thread (aq);
4466 pthread_mutex_lock (&agent->async_queues_mutex);
4468 int err;
4469 if ((err = pthread_mutex_destroy (&aq->mutex)))
4471 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
4472 goto fail;
4474 if (pthread_cond_destroy (&aq->queue_cond_in))
4476 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4477 goto fail;
4479 if (pthread_cond_destroy (&aq->queue_cond_out))
4481 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4482 goto fail;
4484 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
4485 if (status != HSA_STATUS_SUCCESS)
4487 hsa_error ("Error destroying command queue", status);
4488 goto fail;
4491 if (aq->prev)
4492 aq->prev->next = aq->next;
4493 if (aq->next)
4494 aq->next->prev = aq->prev;
4495 if (agent->async_queues == aq)
4496 agent->async_queues = aq->next;
4498 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
4500 free (aq);
4501 pthread_mutex_unlock (&agent->async_queues_mutex);
4502 return true;
4504 fail:
4505 pthread_mutex_unlock (&agent->async_queues_mutex);
4506 return false;
4509 /* Return true if the specified async queue is currently empty. */
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  /* The queue is considered complete when no entries remain in it.  */
  return queue_empty (aq);
}
4517 /* Block until the specified queue has executed all its tasks and the
4518 queue is empty. */
bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  /* Block until the drain thread has emptied AQ.  */
  wait_queue (aq);
  return true;
}
4527 /* Add a serialization point across two async queues. Any new tasks added to
4528 AQ2, after this call, will not run until all tasks on AQ1, at the time
4529 of this call, have completed. */
bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* A queue is trivially serialized with itself.  */
  if (aq1 == aq2)
    return true;

  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point: push a placeholder on AQ1 and
     make AQ2 wait until it is executed.  */
  struct placeholder *placeholderp = queue_push_placeholder (aq1);
  queue_push_asyncwait (aq2, placeholderp);
  return true;
}
4545 /* Add an opaque callback to the given async queue. */
void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  /* FN (DATA) will be invoked when the queue reaches this point.  */
  queue_push_callback (aq, fn, data);
}
4554 /* Queue up an asynchronous data copy from host to DEVICE. */
4556 bool
4557 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4558 size_t n, struct goacc_asyncqueue *aq)
4560 struct agent_info *agent = get_agent_info (device);
4561 assert (agent == aq->agent);
4562 queue_push_copy (aq, dst, src, n);
4563 return true;
4566 /* Queue up an asynchronous data copy from DEVICE to host. */
4568 bool
4569 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4570 size_t n, struct goacc_asyncqueue *aq)
4572 struct agent_info *agent = get_agent_info (device);
4573 assert (agent == aq->agent);
4574 queue_push_copy (aq, dst, src, n);
4575 return true;
4578 union goacc_property_value
4579 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4581 struct agent_info *agent = get_agent_info (device);
4583 union goacc_property_value propval = { .val = 0 };
4585 switch (prop)
4587 case GOACC_PROPERTY_FREE_MEMORY:
4588 /* Not supported. */
4589 break;
4590 case GOACC_PROPERTY_MEMORY:
4592 size_t size;
4593 hsa_region_t region = agent->data_region;
4594 hsa_status_t status =
4595 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4596 if (status == HSA_STATUS_SUCCESS)
4597 propval.val = size;
4598 break;
4600 case GOACC_PROPERTY_NAME:
4601 propval.ptr = agent->name;
4602 break;
4603 case GOACC_PROPERTY_VENDOR:
4604 propval.ptr = agent->vendor_name;
4605 break;
4606 case GOACC_PROPERTY_DRIVER:
4607 propval.ptr = hsa_context.driver_version_s;
4608 break;
4611 return propval;
4614 /* Set up plugin-specific thread-local-data (host-side). */
4616 void *
4617 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4619 struct gcn_thread *thread_data
4620 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4622 thread_data->async = GOMP_ASYNC_SYNC;
4624 return (void *) thread_data;
4627 /* Clean up plugin-specific thread-local-data. */
void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  /* DATA was allocated by GOMP_OFFLOAD_openacc_create_thread_data;
     free (NULL) is a harmless no-op.  */
  free (data);
}
4635 /* }}} */