libgomp/plugin/plugin-gcn.c

   1 /* Plugin for AMD GCN execution.
   2
   3    Copyright (C) 2013-2024 Free Software Foundation, Inc.
   4
   5    Contributed by Mentor Embedded
   6
   7    This file is part of the GNU Offloading and Multi Processing Library
   8    (libgomp).
   9
  10    Libgomp is free software; you can redistribute it and/or modify it
  11    under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 3, or (at your option)
  13    any later version.
  14
  15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  18    more details.
  19
  20    Under Section 7 of GPL version 3, you are granted additional
  21    permissions described in the GCC Runtime Library Exception, version
  22    3.1, as published by the Free Software Foundation.
  23
  24    You should have received a copy of the GNU General Public License and
  25    a copy of the GCC Runtime Library Exception along with this program;
  26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  27    <http://www.gnu.org/licenses/>.  */
  28
  29 /* {{{ Includes and defines  */
  30
  31 #include "config.h"
  32 #include "symcat.h"
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #include <pthread.h>
  37 #include <inttypes.h>
  38 #include <stdbool.h>
  39 #include <limits.h>
  40 #include <hsa.h>
  41 #include <hsa_ext_amd.h>
  42 #include <dlfcn.h>
  43 #include <signal.h>
  44 #include "libgomp-plugin.h"
  45 #include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
  46 #include "gomp-constants.h"
  47 #include <elf.h>
  48 #include "oacc-plugin.h"
  49 #include "oacc-int.h"
  50 #include <assert.h>
  51
  52 /* These probably won't be in elf.h for a while.  */
  53 #ifndef R_AMDGPU_NONE
  54 #define R_AMDGPU_NONE           0
  55 #define R_AMDGPU_ABS32_LO       1       /* (S + A) & 0xFFFFFFFF  */
  56 #define R_AMDGPU_ABS32_HI       2       /* (S + A) >> 32  */
  57 #define R_AMDGPU_ABS64          3       /* S + A  */
  58 #define R_AMDGPU_REL32          4       /* S + A - P  */
  59 #define R_AMDGPU_REL64          5       /* S + A - P  */
  60 #define R_AMDGPU_ABS32          6       /* S + A  */
  61 #define R_AMDGPU_GOTPCREL       7       /* G + GOT + A - P  */
  62 #define R_AMDGPU_GOTPCREL32_LO  8       /* (G + GOT + A - P) & 0xFFFFFFFF  */
  63 #define R_AMDGPU_GOTPCREL32_HI  9       /* (G + GOT + A - P) >> 32  */
  64 #define R_AMDGPU_REL32_LO       10      /* (S + A - P) & 0xFFFFFFFF  */
  65 #define R_AMDGPU_REL32_HI       11      /* (S + A - P) >> 32  */
  66 #define R_AMDGPU_RELATIVE64     13      /* B + A  */
  67 #endif
  68
  69 /* GCN specific definitions for asynchronous queues.  */
  70
  71 #define ASYNC_QUEUE_SIZE 64
  72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
  73 #define DEBUG_QUEUES 0
  74 #define DEBUG_THREAD_SLEEP 0
  75 #define DEBUG_THREAD_SIGNAL 0
  76
  77 /* Defaults.  */
  78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
  79
  80 /* Secure getenv() which returns NULL if running as SUID/SGID.  */
  81 #ifndef HAVE_SECURE_GETENV
  82 #ifdef HAVE___SECURE_GETENV
  83 #define secure_getenv __secure_getenv
  84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  85   && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
  86
  87 #include <unistd.h>
  88
  89 /* Implementation of secure_getenv() for targets where it is not provided but
  90    we have at least means to test real and effective IDs. */
  91
  92 static char *
  93 secure_getenv (const char *name)
  94 {
  95   if ((getuid () == geteuid ()) && (getgid () == getegid ()))
  96     return getenv (name);
  97   else
  98     return NULL;
  99 }
 100
 101 #else
 102 #define secure_getenv getenv
 103 #endif
 104 #endif
 105
 106 /* }}}  */
 107 /* {{{ Types  */
 108
 109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */
 110
 111 struct gcn_thread
 112 {
 113   /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
 114   int async;
 115 };
 116
 117 /* As an HSA runtime is dlopened, following structure defines function
 118    pointers utilized by the HSA plug-in.  */
 119
 120 struct hsa_runtime_fn_info
 121 {
 122   /* HSA runtime.  */
 123   hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
 124                                         const char **status_string);
 125   hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
 126                                           void *value);
 127   hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
 128                                          hsa_agent_info_t attribute,
 129                                          void *value);
 130   hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
 131                                       hsa_isa_info_t attribute,
 132                                       uint32_t index,
 133                                       void *value);
 134   hsa_status_t (*hsa_init_fn) (void);
 135   hsa_status_t (*hsa_iterate_agents_fn)
 136     (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
 137   hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
 138                                           hsa_region_info_t attribute,
 139                                           void *value);
 140   hsa_status_t (*hsa_queue_create_fn)
 141     (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
 142      void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
 143      void *data, uint32_t private_segment_size,
 144      uint32_t group_segment_size, hsa_queue_t **queue);
 145   hsa_status_t (*hsa_agent_iterate_regions_fn)
 146     (hsa_agent_t agent,
 147      hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
 148   hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
 149   hsa_status_t (*hsa_executable_create_fn)
 150     (hsa_profile_t profile, hsa_executable_state_t executable_state,
 151      const char *options, hsa_executable_t *executable);
 152   hsa_status_t (*hsa_executable_global_variable_define_fn)
 153     (hsa_executable_t executable, const char *variable_name, void *address);
 154   hsa_status_t (*hsa_executable_load_code_object_fn)
 155     (hsa_executable_t executable, hsa_agent_t agent,
 156      hsa_code_object_t code_object, const char *options);
 157   hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
 158                                            const char *options);
 159   hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
 160                                         uint32_t num_consumers,
 161                                         const hsa_agent_t *consumers,
 162                                         hsa_signal_t *signal);
 163   hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
 164                                           void **ptr);
 165   hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
 166                                               hsa_access_permission_t access);
 167   hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
 168   hsa_status_t (*hsa_memory_free_fn) (void *ptr);
 169   hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
 170   hsa_status_t (*hsa_executable_get_symbol_fn)
 171     (hsa_executable_t executable, const char *module_name,
 172      const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
 173      hsa_executable_symbol_t *symbol);
 174   hsa_status_t (*hsa_executable_symbol_get_info_fn)
 175     (hsa_executable_symbol_t executable_symbol,
 176      hsa_executable_symbol_info_t attribute, void *value);
 177   hsa_status_t (*hsa_executable_iterate_symbols_fn)
 178     (hsa_executable_t executable,
 179      hsa_status_t (*callback)(hsa_executable_t executable,
 180                               hsa_executable_symbol_t symbol, void *data),
 181      void *data);
 182   uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
 183                                                     uint64_t value);
 184   uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
 185   void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
 186                                        hsa_signal_value_t value);
 187   void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
 188                                        hsa_signal_value_t value);
 189   hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
 190     (hsa_signal_t signal, hsa_signal_condition_t condition,
 191      hsa_signal_value_t compare_value, uint64_t timeout_hint,
 192      hsa_wait_state_t wait_state_hint);
 193   hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
 194   hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
 195
 196   hsa_status_t (*hsa_code_object_deserialize_fn)
 197     (void *serialized_code_object, size_t serialized_code_object_size,
 198      const char *options, hsa_code_object_t *code_object);
 199   hsa_status_t (*hsa_amd_memory_lock_fn)
 200     (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
 201      void **agent_ptr);
 202   hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
 203   hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
 204     (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
 205      const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
 206      const hsa_dim3_t *range, hsa_agent_t copy_agent,
 207      hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
 208      const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
 209 };
 210
 211 /* Structure describing the run-time and grid properties of an HSA kernel
 212    lauch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */
 213
 214 struct GOMP_kernel_launch_attributes
 215 {
 216   /* Number of dimensions the workload has.  Maximum number is 3.  */
 217   uint32_t ndim;
 218   /* Size of the grid in the three respective dimensions.  */
 219   uint32_t gdims[3];
 220   /* Size of work-groups in the respective dimensions.  */
 221   uint32_t wdims[3];
 222 };
 223
 224 /* Collection of information needed for a dispatch of a kernel from a
 225    kernel.  */
 226
 227 struct kernel_dispatch
 228 {
 229   struct agent_info *agent;
 230   /* Pointer to a command queue associated with a kernel dispatch agent.  */
 231   void *queue;
 232   /* Pointer to a memory space used for kernel arguments passing.  */
 233   void *kernarg_address;
 234   /* Kernel object.  */
 235   uint64_t object;
 236   /* Synchronization signal used for dispatch synchronization.  */
 237   uint64_t signal;
 238   /* Private segment size.  */
 239   uint32_t private_segment_size;
 240   /* Group segment size.  */
 241   uint32_t group_segment_size;
 242 };
 243
 244 /* Structure of the kernargs segment, supporting console output.
 245
 246    This needs to match the definitions in Newlib, and the expectations
 247    in libgomp target code.  */
 248
 249 struct kernargs {
 250   struct kernargs_abi abi;
 251
 252   /* Output data.  */
 253   struct output output_data;
 254 };
 255
 256 /* A queue entry for a future asynchronous launch.  */
 257
 258 struct kernel_launch
 259 {
 260   struct kernel_info *kernel;
 261   void *vars;
 262   struct GOMP_kernel_launch_attributes kla;
 263 };
 264
 265 /* A queue entry for a future callback.  */
 266
 267 struct callback
 268 {
 269   void (*fn)(void *);
 270   void *data;
 271 };
 272
 273 /* A data struct for the copy_data callback.  */
 274
 275 struct copy_data
 276 {
 277   void *dst;
 278   const void *src;
 279   size_t len;
 280   struct goacc_asyncqueue *aq;
 281 };
 282
 283 /* A queue entry for a placeholder.  These correspond to a wait event.  */
 284
 285 struct placeholder
 286 {
 287   int executed;
 288   pthread_cond_t cond;
 289   pthread_mutex_t mutex;
 290 };
 291
 292 /* A queue entry for a wait directive.  */
 293
 294 struct asyncwait_info
 295 {
 296   struct placeholder *placeholderp;
 297 };
 298
 299 /* Encode the type of an entry in an async queue.  */
 300
 301 enum entry_type
 302 {
 303   KERNEL_LAUNCH,
 304   CALLBACK,
 305   ASYNC_WAIT,
 306   ASYNC_PLACEHOLDER
 307 };
 308
 309 /* An entry in an async queue.  */
 310
 311 struct queue_entry
 312 {
 313   enum entry_type type;
 314   union {
 315     struct kernel_launch launch;
 316     struct callback callback;
 317     struct asyncwait_info asyncwait;
 318     struct placeholder placeholder;
 319   } u;
 320 };
 321
 322 /* An async queue header.
 323
 324    OpenMP may create one of these.
 325    OpenACC may create many.  */
 326
 327 struct goacc_asyncqueue
 328 {
 329   struct agent_info *agent;
 330   hsa_queue_t *hsa_queue;
 331
 332   pthread_t thread_drain_queue;
 333   pthread_mutex_t mutex;
 334   pthread_cond_t queue_cond_in;
 335   pthread_cond_t queue_cond_out;
 336   struct queue_entry queue[ASYNC_QUEUE_SIZE];
 337   int queue_first;
 338   int queue_n;
 339   int drain_queue_stop;
 340
 341   int id;
 342   struct goacc_asyncqueue *prev;
 343   struct goacc_asyncqueue *next;
 344 };
 345
 346 /* Mkoffload uses this structure to describe a kernel.
 347
 348    OpenMP kernel dimensions are passed at runtime.
 349    OpenACC kernel dimensions are passed at compile time, here.  */
 350
 351 struct hsa_kernel_description
 352 {
 353   const char *name;
 354   int oacc_dims[3];  /* Only present for GCN kernels.  */
 355   int sgpr_count;
 356   int vpgr_count;
 357 };
 358
 359 /* Mkoffload uses this structure to describe an offload variable.  */
 360
 361 struct global_var_info
 362 {
 363   const char *name;
 364   void *address;
 365 };
 366
 367 /* Mkoffload uses this structure to describe all the kernels in a
 368    loadable module.  These are passed the libgomp via static constructors.  */
 369
 370 struct gcn_image_desc
 371 {
 372   struct gcn_image {
 373     size_t size;
 374     void *image;
 375   } *gcn_image;
 376   const unsigned kernel_count;
 377   struct hsa_kernel_description *kernel_infos;
 378   const unsigned ind_func_count;
 379   const unsigned global_variable_count;
 380 };
 381
 382 /* Enum values corresponding to the the ELF architecture codes.
 383    Only 'special' values are actually referenced in this file, but having them
 384    all may aid debugging.  */
 385
 386 typedef enum {
 387   EF_AMDGPU_MACH_UNSUPPORTED = -1,
 388 #define GCN_DEVICE(name, NAME, ELF, ...) \
 389   EF_AMDGPU_MACH_AMDGCN_ ## NAME = ELF,
 390 #include "../../gcc/config/gcn/gcn-devices.def"
 391 } EF_AMDGPU_MACH;
 392
 393 const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
 394 typedef EF_AMDGPU_MACH gcn_isa;
 395
 396 /* Description of an HSA GPU agent (device) and the program associated with
 397    it.  */
 398
 399 struct agent_info
 400 {
 401   /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
 402   hsa_agent_t id;
 403   /* The user-visible device number.  */
 404   int device_id;
 405   /* Whether the agent has been initialized.  The fields below are usable only
 406      if it has been.  */
 407   bool initialized;
 408
 409   /* The instruction set architecture of the device. */
 410   gcn_isa device_isa;
 411   /* Name of the agent. */
 412   char name[64];
 413   /* Name of the vendor of the agent. */
 414   char vendor_name[64];
 415   /* Command queues of the agent.  */
 416   hsa_queue_t *sync_queue;
 417   struct goacc_asyncqueue *async_queues, *omp_async_queue;
 418   pthread_mutex_t async_queues_mutex;
 419
 420   /* The HSA memory region from which to allocate kernel arguments.  */
 421   hsa_region_t kernarg_region;
 422
 423   /* The HSA memory region from which to allocate device data.  */
 424   hsa_region_t data_region;
 425
 426   /* Allocated ephemeral memories (team arena and stack space).  */
 427   struct ephemeral_memories_list *ephemeral_memories_list;
 428   pthread_mutex_t ephemeral_memories_write_lock;
 429
 430   /* Read-write lock that protects kernels which are running or about to be run
 431      from interference with loading and unloading of images.  Needs to be
 432      locked for reading while a kernel is being run, and for writing if the
 433      list of modules is manipulated (and thus the HSA program invalidated).  */
 434   pthread_rwlock_t module_rwlock;
 435
 436   /* The module associated with this kernel.  */
 437   struct module_info *module;
 438
 439   /* Mutex enforcing that only one thread will finalize the HSA program.  A
 440      thread should have locked agent->module_rwlock for reading before
 441      acquiring it.  */
 442   pthread_mutex_t prog_mutex;
 443   /* Flag whether the HSA program that consists of all the modules has been
 444      finalized.  */
 445   bool prog_finalized;
 446   /* HSA executable - the finalized program that is used to locate kernels.  */
 447   hsa_executable_t executable;
 448 };
 449
 450 /* Information required to identify, finalize and run any given kernel.  */
 451
 452 enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
 453
 454 struct kernel_info
 455 {
 456   /* Name of the kernel, required to locate it within the GCN object-code
 457      module.  */
 458   const char *name;
 459   /* The specific agent the kernel has been or will be finalized for and run
 460      on.  */
 461   struct agent_info *agent;
 462   /* The specific module where the kernel takes place.  */
 463   struct module_info *module;
 464   /* Information provided by mkoffload associated with the kernel.  */
 465   struct hsa_kernel_description *description;
 466   /* Mutex enforcing that at most once thread ever initializes a kernel for
 467      use.  A thread should have locked agent->module_rwlock for reading before
 468      acquiring it.  */
 469   pthread_mutex_t init_mutex;
 470   /* Flag indicating whether the kernel has been initialized and all fields
 471      below it contain valid data.  */
 472   bool initialized;
 473   /* Flag indicating that the kernel has a problem that blocks an execution.  */
 474   bool initialization_failed;
 475   /* The object to be put into the dispatch queue.  */
 476   uint64_t object;
 477   /* Required size of kernel arguments.  */
 478   uint32_t kernarg_segment_size;
 479   /* Required size of group segment.  */
 480   uint32_t group_segment_size;
 481   /* Required size of private segment.  */
 482   uint32_t private_segment_size;
 483   /* Set up for OpenMP or OpenACC?  */
 484   enum offload_kind kind;
 485 };
 486
 487 /* Information about a particular GCN module, its image and kernels.  */
 488
 489 struct module_info
 490 {
 491   /* The description with which the program has registered the image.  */
 492   struct gcn_image_desc *image_desc;
 493   /* GCN heap allocation.  */
 494   struct heap *heap;
 495   /* Physical boundaries of the loaded module.  */
 496   Elf64_Addr phys_address_start;
 497   Elf64_Addr phys_address_end;
 498
 499   bool constructors_run_p;
 500   struct kernel_info *init_array_func, *fini_array_func;
 501
 502   /* Number of kernels in this module.  */
 503   int kernel_count;
 504   /* An array of kernel_info structures describing each kernel in this
 505      module.  */
 506   struct kernel_info kernels[];
 507 };
 508
 509 /* A linked list of memory arenas allocated on the device.
 510    These are used by OpenMP, as a means to optimize per-team malloc,
 511    and for host-accessible stack space.  */
 512
 513 struct ephemeral_memories_list
 514 {
 515   struct ephemeral_memories_list *next;
 516
 517   /* The size is determined by the number of teams and threads.  */
 518   size_t size;
 519   /* The device address allocated memory.  */
 520   void *address;
 521   /* A flag to prevent two asynchronous kernels trying to use the same memory.
 522      The mutex is locked until the kernel exits.  */
 523   pthread_mutex_t in_use;
 524 };
 525
 526 /* Information about the whole HSA environment and all of its agents.  */
 527
 528 struct hsa_context_info
 529 {
 530   /* Whether the structure has been initialized.  */
 531   bool initialized;
 532   /* Number of usable GPU HSA agents in the system.  */
 533   int agent_count;
 534   /* Array of agent_info structures describing the individual HSA agents.  */
 535   struct agent_info *agents;
 536   /* Driver version string. */
 537   char driver_version_s[30];
 538 };
 539
 540 /* }}}  */
 541 /* {{{ Global variables  */
 542
 543 /* Information about the whole HSA environment and all of its agents.  */
 544
 545 static struct hsa_context_info hsa_context;
 546
 547 /* HSA runtime functions that are initialized in init_hsa_context.  */
 548
 549 static struct hsa_runtime_fn_info hsa_fns;
 550
 551 /* Heap space, allocated target-side, provided for use of newlib malloc.
 552    Each module should have it's own heap allocated.
 553    Beware that heap usage increases with OpenMP teams.  See also arenas.  */
 554
 555 static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
 556
 557 /* Ephemeral memory sizes for each kernel launch.  */
 558
 559 static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
 560 static int stack_size = DEFAULT_GCN_STACK_SIZE;
 561 static int lowlat_size = -1;
 562
 563 /* Flag to decide whether print to stderr information about what is going on.
 564    Set in init_debug depending on environment variables.  */
 565
 566 static bool debug;
 567
 568 /* Flag to decide if the runtime should suppress a possible fallback to host
 569    execution.  */
 570
 571 static bool suppress_host_fallback;
 572
 573 /* Flag to locate HSA runtime shared library that is dlopened
 574    by this plug-in.  */
 575
 576 static const char *hsa_runtime_lib;
 577
 578 /* Flag to decide if the runtime should support also CPU devices (can be
 579    a simulator).  */
 580
 581 static bool support_cpu_devices;
 582
 583 /* Runtime dimension overrides.  Zero indicates default.  */
 584
 585 static int override_x_dim = 0;
 586 static int override_z_dim = 0;
 587
 588 /* }}}  */
 589 /* {{{ Debug & Diagnostic  */
 590
 591 /* Print a message to stderr if GCN_DEBUG value is set to true.  */
 592
 593 #define DEBUG_PRINT(...) \
 594   do \
 595   { \
 596     if (debug) \
 597       { \
 598         fprintf (stderr, __VA_ARGS__); \
 599       } \
 600   } \
 601   while (false);
 602
 603 /* Flush stderr if GCN_DEBUG value is set to true.  */
 604
 605 #define DEBUG_FLUSH()                           \
 606   do {                                          \
 607     if (debug)                                  \
 608       fflush (stderr);                          \
 609   } while (false)
 610
 611 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
 612    is set to true.  */
 613
 614 #define DEBUG_LOG(prefix, ...)                  \
 615   do                                            \
 616     {                                           \
 617       DEBUG_PRINT (prefix);                     \
 618       DEBUG_PRINT (__VA_ARGS__);                        \
 619       DEBUG_FLUSH ();                           \
 620     } while (false)
 621
 622 /* Print a debugging message to stderr.  */
 623
 624 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
 625
 626 /* Print a warning message to stderr.  */
 627
 628 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
 629
 630 /* Print HSA warning STR with an HSA STATUS code.  */
 631
 632 static void
 633 hsa_warn (const char *str, hsa_status_t status)
 634 {
 635   if (!debug)
 636     return;
 637
 638   const char *hsa_error_msg = "[unknown]";
 639   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
 640
 641   fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
 642            hsa_error_msg);
 643 }
 644
 645 /* Report a fatal error STR together with the HSA error corresponding to STATUS
 646    and terminate execution of the current process.  */
 647
 648 static void
 649 hsa_fatal (const char *str, hsa_status_t status)
 650 {
 651   const char *hsa_error_msg = "[unknown]";
 652   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
 653   GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
 654                      hsa_error_msg);
 655 }
 656
 657 /* Like hsa_fatal, except only report error message, and return FALSE
 658    for propagating error processing to outside of plugin.  */
 659
 660 static bool
 661 hsa_error (const char *str, hsa_status_t status)
 662 {
 663   const char *hsa_error_msg = "[unknown]";
 664   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
 665   GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
 666                      hsa_error_msg);
 667   return false;
 668 }
 669
 670 /* Dump information about the available hardware.  */
 671
 672 static void
 673 dump_hsa_system_info (void)
 674 {
 675   hsa_status_t status;
 676
 677   hsa_endianness_t endianness;
 678   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
 679                                            &endianness);
 680   if (status == HSA_STATUS_SUCCESS)
 681     switch (endianness)
 682       {
 683       case HSA_ENDIANNESS_LITTLE:
 684         GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
 685         break;
 686       case HSA_ENDIANNESS_BIG:
 687         GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
 688         break;
 689       default:
 690         GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
 691       }
 692   else
 693     GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
 694
 695   uint8_t extensions[128];
 696   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
 697                                            &extensions);
 698   if (status == HSA_STATUS_SUCCESS)
 699     {
 700       if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
 701         GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
 702     }
 703   else
 704     GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
 705 }
 706
 707 /* Dump information about the available hardware.  */
 708
 709 static void
 710 dump_machine_model (hsa_machine_model_t machine_model, const char *s)
 711 {
 712   switch (machine_model)
 713     {
 714     case HSA_MACHINE_MODEL_SMALL:
 715       GCN_DEBUG ("%s: SMALL\n", s);
 716       break;
 717     case HSA_MACHINE_MODEL_LARGE:
 718       GCN_DEBUG ("%s: LARGE\n", s);
 719       break;
 720     default:
 721       GCN_WARNING ("%s: UNKNOWN\n", s);
 722       break;
 723     }
 724 }
 725
 726 /* Dump information about the available hardware.  */
 727
 728 static void
 729 dump_profile (hsa_profile_t profile, const char *s)
 730 {
 731   switch (profile)
 732     {
 733     case HSA_PROFILE_FULL:
 734       GCN_DEBUG ("%s: FULL\n", s);
 735       break;
 736     case HSA_PROFILE_BASE:
 737       GCN_DEBUG ("%s: BASE\n", s);
 738       break;
 739     default:
 740       GCN_WARNING ("%s: UNKNOWN\n", s);
 741       break;
 742     }
 743 }
 744
 745 /* Dump information about a device memory region.  */
 746
 747 static hsa_status_t
 748 dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
 749 {
 750   hsa_status_t status;
 751
 752   hsa_region_segment_t segment;
 753   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
 754                                            &segment);
 755   if (status == HSA_STATUS_SUCCESS)
 756     {
 757       if (segment == HSA_REGION_SEGMENT_GLOBAL)
 758         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
 759       else if (segment == HSA_REGION_SEGMENT_READONLY)
 760         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
 761       else if (segment == HSA_REGION_SEGMENT_PRIVATE)
 762         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
 763       else if (segment == HSA_REGION_SEGMENT_GROUP)
 764         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
 765       else
 766         GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
 767     }
 768   else
 769     GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
 770
 771   if (segment == HSA_REGION_SEGMENT_GLOBAL)
 772     {
 773       uint32_t flags;
 774       status
 775         = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
 776                                           &flags);
 777       if (status == HSA_STATUS_SUCCESS)
 778         {
 779           if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
 780             GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
 781           if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
 782             GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
 783           if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
 784             GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
 785         }
 786       else
 787         GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
 788     }
 789
 790   size_t size;
 791   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
 792   if (status == HSA_STATUS_SUCCESS)
 793     GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
 794   else
 795     GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
 796
 797   status
 798     = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
 799                                       &size);
 800   if (status == HSA_STATUS_SUCCESS)
 801     GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
 802   else
 803     GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
 804
 805   bool alloc_allowed;
 806   status
 807     = hsa_fns.hsa_region_get_info_fn (region,
 808                                       HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
 809                                       &alloc_allowed);
 810   if (status == HSA_STATUS_SUCCESS)
 811     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
 812   else
 813     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
 814
 815   if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
 816     return HSA_STATUS_SUCCESS;
 817
 818   status
 819     = hsa_fns.hsa_region_get_info_fn (region,
 820                                       HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
 821                                       &size);
 822   if (status == HSA_STATUS_SUCCESS)
 823     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
 824   else
 825     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
 826
 827   size_t align;
 828   status
 829     = hsa_fns.hsa_region_get_info_fn (region,
 830                                       HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
 831                                       &align);
 832   if (status == HSA_STATUS_SUCCESS)
 833     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
 834   else
 835     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
 836
 837   return HSA_STATUS_SUCCESS;
 838 }
 839
 840 /* Dump information about all the device memory regions.  */
 841
 842 static void
 843 dump_hsa_regions (hsa_agent_t agent)
 844 {
 845   hsa_status_t status;
 846   status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
 847                                                  dump_hsa_region,
 848                                                  NULL);
 849   if (status != HSA_STATUS_SUCCESS)
 850     hsa_error ("Dumping hsa regions failed", status);
 851 }
 852
 853 /* Dump information about the available devices.  */
 854
 855 static hsa_status_t
 856 dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
 857 {
 858   hsa_status_t status;
 859
 860   char buf[64];
 861   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
 862                                           &buf);
 863   if (status == HSA_STATUS_SUCCESS)
 864     GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
 865   else
 866     GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
 867
 868   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
 869                                           &buf);
 870   if (status == HSA_STATUS_SUCCESS)
 871     GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
 872   else
 873     GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
 874
 875   hsa_machine_model_t machine_model;
 876   status
 877     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
 878                                      &machine_model);
 879   if (status == HSA_STATUS_SUCCESS)
 880     dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
 881   else
 882     GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
 883
 884   hsa_profile_t profile;
 885   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
 886                                           &profile);
 887   if (status == HSA_STATUS_SUCCESS)
 888     dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
 889   else
 890     GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
 891
 892   hsa_device_type_t device_type;
 893   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
 894                                           &device_type);
 895   if (status == HSA_STATUS_SUCCESS)
 896     {
 897       switch (device_type)
 898         {
 899         case HSA_DEVICE_TYPE_CPU:
 900           GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
 901           break;
 902         case HSA_DEVICE_TYPE_GPU:
 903           GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
 904           break;
 905         case HSA_DEVICE_TYPE_DSP:
 906           GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
 907           break;
 908         default:
 909           GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
 910           break;
 911         }
 912     }
 913   else
 914     GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
 915
 916   uint32_t cu_count;
 917   status = hsa_fns.hsa_agent_get_info_fn
 918     (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
 919   if (status == HSA_STATUS_SUCCESS)
 920     GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
 921   else
 922     GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
 923
 924   uint32_t size;
 925   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
 926                                           &size);
 927   if (status == HSA_STATUS_SUCCESS)
 928     GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
 929   else
 930     GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
 931
 932   uint32_t max_dim;
 933   status = hsa_fns.hsa_agent_get_info_fn (agent,
 934                                           HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
 935                                           &max_dim);
 936   if (status == HSA_STATUS_SUCCESS)
 937     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
 938   else
 939     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
 940
 941   uint32_t max_size;
 942   status = hsa_fns.hsa_agent_get_info_fn (agent,
 943                                           HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
 944                                           &max_size);
 945   if (status == HSA_STATUS_SUCCESS)
 946     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
 947   else
 948     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
 949
 950   uint32_t grid_max_dim;
 951   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
 952                                           &grid_max_dim);
 953   if (status == HSA_STATUS_SUCCESS)
 954     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
 955   else
 956     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
 957
 958   uint32_t grid_max_size;
 959   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
 960                                           &grid_max_size);
 961   if (status == HSA_STATUS_SUCCESS)
 962     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
 963   else
 964     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
 965
 966   dump_hsa_regions (agent);
 967
 968   return HSA_STATUS_SUCCESS;
 969 }
 970
 971 /* Forward reference.  */
 972
 973 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
 974
 975 /* Helper function for dump_executable_symbols.  */
 976
 977 static hsa_status_t
 978 dump_executable_symbol (hsa_executable_t executable,
 979                         hsa_executable_symbol_t symbol,
 980                         void *data __attribute__((unused)))
 981 {
 982   char *name = get_executable_symbol_name (symbol);
 983
 984   if (name)
 985     {
 986       GCN_DEBUG ("executable symbol: %s\n", name);
 987       free (name);
 988     }
 989
 990   return HSA_STATUS_SUCCESS;
 991 }
 992
 993 /* Dump all global symbol in an executable.  */
 994
 995 static void
 996 dump_executable_symbols (hsa_executable_t executable)
 997 {
 998   hsa_status_t status;
 999   status
1000     = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1001                                                  dump_executable_symbol,
1002                                                  NULL);
1003   if (status != HSA_STATUS_SUCCESS)
1004     hsa_fatal ("Could not dump HSA executable symbols", status);
1005 }
1006
1007 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
1008
1009 static void
1010 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1011 {
1012   struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1013
1014   fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1015   fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1016   fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1017   fprintf (stderr, "%*sheap address: %p\n", indent, "",
1018            (void*)kernargs->abi.heap_ptr);
1019   fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
1020            "", (void*)kernargs->abi.arena_ptr,
1021            kernargs->abi.arena_size_per_team);
1022   fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
1023            "", (void*)kernargs->abi.stack_ptr,
1024            kernargs->abi.stack_size_per_thread);
1025   fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1026   fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1027            dispatch->private_segment_size);
1028   fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
1029            "", dispatch->group_segment_size);
1030   fprintf (stderr, "\n");
1031 }
1032
1033 /* }}}  */
1034 /* {{{ Utility functions  */
1035
1036 /* Cast the thread local storage to gcn_thread.  */
1037
1038 static inline struct gcn_thread *
1039 gcn_thread (void)
1040 {
1041   return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
1042 }
1043
1044 /* Initialize debug and suppress_host_fallback according to the environment.  */
1045
1046 static void
1047 init_environment_variables (void)
1048 {
1049   if (secure_getenv ("GCN_DEBUG"))
1050     debug = true;
1051   else
1052     debug = false;
1053
1054   if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1055     suppress_host_fallback = true;
1056   else
1057     suppress_host_fallback = false;
1058
1059   hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1060   if (hsa_runtime_lib == NULL)
1061     hsa_runtime_lib = "libhsa-runtime64.so.1";
1062
1063   support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1064
1065   const char *x = secure_getenv ("GCN_NUM_TEAMS");
1066   if (!x)
1067     x = secure_getenv ("GCN_NUM_GANGS");
1068   if (x)
1069     override_x_dim = atoi (x);
1070
1071   const char *z = secure_getenv ("GCN_NUM_THREADS");
1072   if (!z)
1073     z = secure_getenv ("GCN_NUM_WORKERS");
1074   if (z)
1075     override_z_dim = atoi (z);
1076
1077   const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1078   if (heap)
1079     {
1080       size_t tmp = atol (heap);
1081       if (tmp)
1082         gcn_kernel_heap_size = tmp;
1083     }
1084
1085   const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
1086   if (arena)
1087     {
1088       int tmp = atoi (arena);
1089       if (tmp)
1090         team_arena_size = tmp;;
1091     }
1092
1093   const char *stack = secure_getenv ("GCN_STACK_SIZE");
1094   if (stack)
1095     {
1096       int tmp = atoi (stack);
1097       if (tmp)
1098         stack_size = tmp;;
1099     }
1100
1101   const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
1102   if (lowlat)
1103     lowlat_size = atoi (lowlat);
1104 }
1105
1106 /* Return malloc'd string with name of SYMBOL.  */
1107
1108 static char *
1109 get_executable_symbol_name (hsa_executable_symbol_t symbol)
1110 {
1111   hsa_status_t status;
1112   char *res;
1113   uint32_t len;
1114   const hsa_executable_symbol_info_t info_name_length
1115     = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1116
1117   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1118                                                       &len);
1119   if (status != HSA_STATUS_SUCCESS)
1120     {
1121       hsa_error ("Could not get length of symbol name", status);
1122       return NULL;
1123     }
1124
1125   res = GOMP_PLUGIN_malloc (len + 1);
1126
1127   const hsa_executable_symbol_info_t info_name
1128     = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1129
1130   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1131
1132   if (status != HSA_STATUS_SUCCESS)
1133     {
1134       hsa_error ("Could not get symbol name", status);
1135       free (res);
1136       return NULL;
1137     }
1138
1139   res[len] = '\0';
1140
1141   return res;
1142 }
1143
1144 /* Get the number of GPU Compute Units.  */
1145
1146 static int
1147 get_cu_count (struct agent_info *agent)
1148 {
1149   uint32_t cu_count;
1150   hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1151     (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1152   if (status == HSA_STATUS_SUCCESS)
1153     return cu_count;
1154   else
1155     return 64;  /* The usual number for older devices.  */
1156 }
1157
1158 /* Calculate the maximum grid size for OMP threads / OACC workers.
1159    This depends on the kernel's resource usage levels.  */
1160
1161 static int
1162 limit_worker_threads (int threads)
1163 {
1164   /* FIXME Do something more inteligent here.
1165      GCN can always run 4 threads within a Compute Unit, but
1166      more than that depends on register usage.  */
1167   if (threads > 16)
1168     threads = 16;
1169   return threads;
1170 }
1171
1172 /* This sets the maximum number of teams to twice the number of GPU Compute
1173    Units to avoid memory waste and corresponding memory access faults.  */
1174
1175 static int
1176 limit_teams (int teams, struct agent_info *agent)
1177 {
1178   int max_teams = 2 * get_cu_count (agent);
1179   if (teams > max_teams)
1180     teams = max_teams;
1181   return teams;
1182 }
1183
1184 /* Parse the target attributes INPUT provided by the compiler and return true
1185    if we should run anything all.  If INPUT is NULL, fill DEF with default
1186    values, then store INPUT or DEF into *RESULT.
1187
1188    This is used for OpenMP only.  */
1189
1190 static bool
1191 parse_target_attributes (void **input,
1192                          struct GOMP_kernel_launch_attributes *def,
1193                          struct GOMP_kernel_launch_attributes **result,
1194                          struct agent_info *agent)
1195 {
1196   if (!input)
1197     GOMP_PLUGIN_fatal ("No target arguments provided");
1198
1199   bool grid_attrs_found = false;
1200   bool gcn_dims_found = false;
1201   int gcn_teams = 0;
1202   int gcn_threads = 0;
1203   while (*input)
1204     {
1205       intptr_t id = (intptr_t) *input++, val;
1206
1207       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1208         val = (intptr_t) *input++;
1209       else
1210         val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1211
1212       val = (val > INT_MAX) ? INT_MAX : val;
1213
1214       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
1215           && ((id & GOMP_TARGET_ARG_ID_MASK)
1216               == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1217         {
1218           grid_attrs_found = true;
1219           break;
1220         }
1221       else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
1222                == GOMP_TARGET_ARG_DEVICE_ALL)
1223         {
1224           gcn_dims_found = true;
1225           switch (id & GOMP_TARGET_ARG_ID_MASK)
1226             {
1227             case GOMP_TARGET_ARG_NUM_TEAMS:
1228               gcn_teams = limit_teams (val, agent);
1229               break;
1230             case GOMP_TARGET_ARG_THREAD_LIMIT:
1231               gcn_threads = limit_worker_threads (val);
1232               break;
1233             default:
1234               ;
1235             }
1236         }
1237     }
1238
1239   if (gcn_dims_found)
1240     {
1241       bool gfx900_workaround_p = false;
1242
1243       if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
1244           && gcn_threads == 0 && override_z_dim == 0)
1245         {
1246           gfx900_workaround_p = true;
1247           GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1248                        "threads to at most 4 per team.\n");
1249           GCN_WARNING (" - If this is not a Vega 10 device, please use "
1250                        "GCN_NUM_THREADS=16\n");
1251         }
1252
1253       /* Ideally, when a dimension isn't explicitly specified, we should
1254          tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1255          In practice, we tune for peak performance on BabelStream, which
1256          for OpenACC is currently 32 threads per CU.  */
1257       def->ndim = 3;
1258       if (gcn_teams <= 0 && gcn_threads <= 0)
1259         {
1260           /* Set up a reasonable number of teams and threads.  */
1261           gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
1262           def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1263           def->gdims[2] = gcn_threads;
1264         }
1265       else if (gcn_teams <= 0 && gcn_threads > 0)
1266         {
1267           /* Auto-scale the number of teams with the number of threads.  */
1268           def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1269           def->gdims[2] = gcn_threads;
1270         }
1271       else if (gcn_teams > 0 && gcn_threads <= 0)
1272         {
1273           int max_threads = gfx900_workaround_p ? 4 : 16;
1274
1275           /* Auto-scale the number of threads with the number of teams.  */
1276           def->gdims[0] = gcn_teams;
1277           def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1278           if (def->gdims[2] == 0)
1279             def->gdims[2] = 1;
1280           else if (def->gdims[2] > max_threads)
1281             def->gdims[2] = max_threads;
1282         }
1283       else
1284         {
1285           def->gdims[0] = gcn_teams;
1286           def->gdims[2] = gcn_threads;
1287         }
1288       def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
1289       def->wdims[0] = 1;  /* Single team per work-group.  */
1290       def->wdims[1] = 64;
1291       def->wdims[2] = 16;
1292       *result = def;
1293       return true;
1294     }
1295   else if (!grid_attrs_found)
1296     {
1297       def->ndim = 1;
1298       def->gdims[0] = 1;
1299       def->gdims[1] = 1;
1300       def->gdims[2] = 1;
1301       def->wdims[0] = 1;
1302       def->wdims[1] = 1;
1303       def->wdims[2] = 1;
1304       *result = def;
1305       GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1306       return true;
1307     }
1308
1309   struct GOMP_kernel_launch_attributes *kla;
1310   kla = (struct GOMP_kernel_launch_attributes *) *input;
1311   *result = kla;
1312   if (kla->ndim == 0 || kla->ndim > 3)
1313     GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1314
1315   GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1316   unsigned i;
1317   for (i = 0; i < kla->ndim; i++)
1318     {
1319       GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
1320                  kla->gdims[i], kla->wdims[i]);
1321       if (kla->gdims[i] == 0)
1322         return false;
1323     }
1324   return true;
1325 }
1326
1327 /* Return the group size given the requested GROUP size, GRID size and number
1328    of grid dimensions NDIM.  */
1329
1330 static uint32_t
1331 get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
1332 {
1333   if (group == 0)
1334     {
1335       /* TODO: Provide a default via environment or device characteristics.  */
1336       if (ndim == 1)
1337         group = 64;
1338       else if (ndim == 2)
1339         group = 8;
1340       else
1341         group = 4;
1342     }
1343
1344   if (group > grid)
1345     group = grid;
1346   return group;
1347 }
1348
1349 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */
1350
1351 static void
1352 packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1353 {
1354   __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1355 }
1356
1357 /* A never-called callback for the HSA command queues.  These signal events
1358    that we don't use, so we trigger an error.
1359
1360    This "queue" is not to be confused with the async queues, below.  */
1361
1362 static void
1363 hsa_queue_callback (hsa_status_t status,
1364                 hsa_queue_t *queue __attribute__ ((unused)),
1365                 void *data __attribute__ ((unused)))
1366 {
1367   hsa_fatal ("Asynchronous queue error", status);
1368 }
1369
1370 /* }}}  */
1371 /* {{{ HSA initialization  */
1372
1373 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */
1374
1375 static bool
1376 init_hsa_runtime_functions (void)
1377 {
1378 #define DLSYM_FN(function) \
1379   hsa_fns.function##_fn = dlsym (handle, #function); \
1380   if (hsa_fns.function##_fn == NULL) \
1381     GOMP_PLUGIN_fatal ("'%s' is missing '%s'", hsa_runtime_lib, #function);
1382 #define DLSYM_OPT_FN(function) \
1383   hsa_fns.function##_fn = dlsym (handle, #function);
1384
1385   void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1386   if (handle == NULL)
1387     return false;
1388
1389   DLSYM_FN (hsa_status_string)
1390   DLSYM_FN (hsa_system_get_info)
1391   DLSYM_FN (hsa_agent_get_info)
1392   DLSYM_FN (hsa_init)
1393   DLSYM_FN (hsa_iterate_agents)
1394   DLSYM_FN (hsa_region_get_info)
1395   DLSYM_FN (hsa_queue_create)
1396   DLSYM_FN (hsa_agent_iterate_regions)
1397   DLSYM_FN (hsa_executable_destroy)
1398   DLSYM_FN (hsa_executable_create)
1399   DLSYM_FN (hsa_executable_global_variable_define)
1400   DLSYM_FN (hsa_executable_load_code_object)
1401   DLSYM_FN (hsa_executable_freeze)
1402   DLSYM_FN (hsa_signal_create)
1403   DLSYM_FN (hsa_memory_allocate)
1404   DLSYM_FN (hsa_memory_assign_agent)
1405   DLSYM_FN (hsa_memory_copy)
1406   DLSYM_FN (hsa_memory_free)
1407   DLSYM_FN (hsa_signal_destroy)
1408   DLSYM_FN (hsa_executable_get_symbol)
1409   DLSYM_FN (hsa_executable_symbol_get_info)
1410   DLSYM_FN (hsa_executable_iterate_symbols)
1411   DLSYM_FN (hsa_queue_add_write_index_release)
1412   DLSYM_FN (hsa_queue_load_read_index_acquire)
1413   DLSYM_FN (hsa_signal_wait_acquire)
1414   DLSYM_FN (hsa_signal_store_relaxed)
1415   DLSYM_FN (hsa_signal_store_release)
1416   DLSYM_FN (hsa_signal_load_acquire)
1417   DLSYM_FN (hsa_queue_destroy)
1418   DLSYM_FN (hsa_code_object_deserialize)
1419   DLSYM_OPT_FN (hsa_amd_memory_lock)
1420   DLSYM_OPT_FN (hsa_amd_memory_unlock)
1421   DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
1422   return true;
1423 #undef DLSYM_OPT_FN
1424 #undef DLSYM_FN
1425 }
1426
1427 static gcn_isa isa_code (const char *isa);
1428
1429 /* Return true if the agent is a GPU and can accept of concurrent submissions
1430    from different threads.  */
1431
1432 static bool
1433 suitable_hsa_agent_p (hsa_agent_t agent)
1434 {
1435   hsa_device_type_t device_type;
1436   hsa_status_t status
1437     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1438                                      &device_type);
1439   if (status != HSA_STATUS_SUCCESS)
1440     return false;
1441
1442   switch (device_type)
1443     {
1444     case HSA_DEVICE_TYPE_GPU:
1445       {
1446         char name[64];
1447         hsa_status_t status
1448           = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, name);
1449         if (status != HSA_STATUS_SUCCESS
1450             || isa_code (name) == EF_AMDGPU_MACH_UNSUPPORTED)
1451           {
1452             GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
1453                        status == HSA_STATUS_SUCCESS ? name : "invalid");
1454             return false;
1455           }
1456       }
1457       break;
1458     case HSA_DEVICE_TYPE_CPU:
1459       if (!support_cpu_devices)
1460         return false;
1461       break;
1462     default:
1463       return false;
1464     }
1465
1466   uint32_t features = 0;
1467   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1468                                           &features);
1469   if (status != HSA_STATUS_SUCCESS
1470       || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1471     return false;
1472   hsa_queue_type_t queue_type;
1473   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1474                                           &queue_type);
1475   if (status != HSA_STATUS_SUCCESS
1476       || (queue_type != HSA_QUEUE_TYPE_MULTI))
1477     return false;
1478
1479   return true;
1480 }
1481
1482 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1483    agent_count in hsa_context.  */
1484
1485 static hsa_status_t
1486 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1487 {
1488   if (suitable_hsa_agent_p (agent))
1489     hsa_context.agent_count++;
1490   return HSA_STATUS_SUCCESS;
1491 }
1492
1493 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1494    id to the describing structure in the hsa context.  The index of the
1495    structure is pointed to by DATA, increment it afterwards.  */
1496
1497 static hsa_status_t
1498 assign_agent_ids (hsa_agent_t agent, void *data)
1499 {
1500   if (suitable_hsa_agent_p (agent))
1501     {
1502       int *agent_index = (int *) data;
1503       hsa_context.agents[*agent_index].id = agent;
1504       ++*agent_index;
1505     }
1506   return HSA_STATUS_SUCCESS;
1507 }
1508
1509 /* Initialize hsa_context if it has not already been done.
1510    If !PROBE: returns TRUE on success.
1511    If PROBE: returns TRUE on success or if the plugin/device shall be silently
1512    ignored, and otherwise emits an error and returns FALSE.  */
1513
1514 static bool
1515 init_hsa_context (bool probe)
1516 {
1517   hsa_status_t status;
1518   int agent_index = 0;
1519
1520   if (hsa_context.initialized)
1521     return true;
1522   init_environment_variables ();
1523   if (!init_hsa_runtime_functions ())
1524     {
1525       const char *msg = "Run-time could not be dynamically opened";
1526       if (suppress_host_fallback)
1527         GOMP_PLUGIN_fatal ("%s\n", msg);
1528       else
1529         GCN_WARNING ("%s\n", msg);
1530       return probe ? true : false;
1531     }
1532   status = hsa_fns.hsa_init_fn ();
1533   if (status != HSA_STATUS_SUCCESS)
1534     return hsa_error ("Run-time could not be initialized", status);
1535   GCN_DEBUG ("HSA run-time initialized for GCN\n");
1536
1537   if (debug)
1538     dump_hsa_system_info ();
1539
1540   status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1541   if (status != HSA_STATUS_SUCCESS)
1542     return hsa_error ("GCN GPU devices could not be enumerated", status);
1543   GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1544
1545   hsa_context.agents
1546     = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1547                                   * sizeof (struct agent_info));
1548   status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
1549   if (status != HSA_STATUS_SUCCESS)
1550     return hsa_error ("Scanning compute agents failed", status);
1551   if (agent_index != hsa_context.agent_count)
1552     {
1553       GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1554       return false;
1555     }
1556
1557   if (debug)
1558     {
1559       status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1560       if (status != HSA_STATUS_SUCCESS)
1561         GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1562     }
1563
1564   uint16_t minor, major;
1565   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1566                                            &minor);
1567   if (status != HSA_STATUS_SUCCESS)
1568     GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1569   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1570                                            &major);
1571   if (status != HSA_STATUS_SUCCESS)
1572     GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1573
1574   size_t len = sizeof hsa_context.driver_version_s;
1575   int printed = snprintf (hsa_context.driver_version_s, len,
1576                           "HSA Runtime %hu.%hu", (unsigned short int)major,
1577                           (unsigned short int)minor);
1578   if (printed >= len)
1579     GCN_WARNING ("HSA runtime version string was truncated."
1580                  "Version %hu.%hu is too long.", (unsigned short int)major,
1581                  (unsigned short int)minor);
1582
1583   hsa_context.initialized = true;
1584   return true;
1585 }
1586
1587 /* Verify that hsa_context has already been initialized and return the
1588    agent_info structure describing device number N.  Return NULL on error.  */
1589
1590 static struct agent_info *
1591 get_agent_info (int n)
1592 {
1593   if (!hsa_context.initialized)
1594     {
1595       GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1596       return NULL;
1597     }
1598   if (n >= hsa_context.agent_count)
1599     {
1600       GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1601       return NULL;
1602     }
1603   if (!hsa_context.agents[n].initialized)
1604     {
1605       GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1606       return NULL;
1607     }
1608   return &hsa_context.agents[n];
1609 }
1610
1611 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1612
1613    Selects (breaks at) a suitable region of type KIND.  */
1614
1615 static hsa_status_t
1616 get_memory_region (hsa_region_t region, hsa_region_t *retval,
1617                    hsa_region_global_flag_t kind)
1618 {
1619   hsa_status_t status;
1620   hsa_region_segment_t segment;
1621
1622   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1623                                            &segment);
1624   if (status != HSA_STATUS_SUCCESS)
1625     return status;
1626   if (segment != HSA_REGION_SEGMENT_GLOBAL)
1627     return HSA_STATUS_SUCCESS;
1628
1629   uint32_t flags;
1630   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1631                                            &flags);
1632   if (status != HSA_STATUS_SUCCESS)
1633     return status;
1634   if (flags & kind)
1635     {
1636       *retval = region;
1637       return HSA_STATUS_INFO_BREAK;
1638     }
1639   return HSA_STATUS_SUCCESS;
1640 }
1641
1642 /* Callback of hsa_agent_iterate_regions.
1643
1644    Selects a kernargs memory region.  */
1645
1646 static hsa_status_t
1647 get_kernarg_memory_region (hsa_region_t region, void *data)
1648 {
1649   return get_memory_region (region, (hsa_region_t *)data,
1650                             HSA_REGION_GLOBAL_FLAG_KERNARG);
1651 }
1652
1653 /* Callback of hsa_agent_iterate_regions.
1654
1655    Selects a coarse-grained memory region suitable for the heap and
1656    offload data.  */
1657
1658 static hsa_status_t
1659 get_data_memory_region (hsa_region_t region, void *data)
1660 {
1661   return get_memory_region (region, (hsa_region_t *)data,
1662                             HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1663 }
1664
1665 static int
1666 elf_gcn_isa_field (Elf64_Ehdr *image)
1667 {
1668   return image->e_flags & EF_AMDGPU_MACH_MASK;
1669 }
1670
1671 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1672    support the ISA. */
1673
1674 static const char*
1675 isa_name (int isa) {
1676   switch(isa)
1677     {
1678 #define GCN_DEVICE(name, NAME, ELF, ...) \
1679     case ELF: return #name;
1680 #include "../../gcc/config/gcn/gcn-devices.def"
1681     }
1682   return NULL;
1683 }
1684
1685 /* Returns the code which is used in the GCN object code to identify the ISA with
1686    the given name (as used by the HSA runtime).  */
1687
1688 static gcn_isa
1689 isa_code(const char *isa) {
1690 #define GCN_DEVICE(name, NAME, ELF, ...) \
1691   if (!strcmp (isa, #name)) return ELF;
1692 #include "../../gcc/config/gcn/gcn-devices.def"
1693
1694   return EF_AMDGPU_MACH_UNSUPPORTED;
1695 }
1696
1697 /* CDNA2 devices have twice as many VGPRs compared to older devices.  */
1698
1699 static int
1700 max_isa_vgprs (int isa)
1701 {
1702   switch (isa)
1703     {
1704 #define GCN_DEVICE(name, NAME, ELF, ISA, XNACK, SRAM, WAVE64, CU, \
1705                    MAX_ISA_VGPRS, ...) \
1706     case ELF: return MAX_ISA_VGPRS;
1707 #include "../../gcc/config/gcn/gcn-devices.def"
1708     default:
1709       GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
1710     }
1711 }
1712
1713 /* }}}  */
1714 /* {{{ Run  */
1715
1716 /* Create or reuse a team arena and stack space.
1717
1718    Team arenas are used by OpenMP to avoid calling malloc multiple times
1719    while setting up each team.  This is purely a performance optimization.
1720
1721    The stack space is used by all kernels.  We must allocate it in such a
1722    way that the reverse offload implmentation can access the data.
1723
1724    Allocating this memory costs performance, so this function will reuse an
1725    existing allocation if a large enough one is idle.
1726    The memory lock is released, but not deallocated, when the kernel exits.  */
1727
1728 static void
1729 configure_ephemeral_memories (struct kernel_info *kernel,
1730                               struct kernargs_abi *kernargs, int num_teams,
1731                               int num_threads)
1732 {
1733   struct agent_info *agent = kernel->agent;
1734   struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
1735   struct ephemeral_memories_list *item;
1736
1737   int actual_arena_size = (kernel->kind == KIND_OPENMP
1738                            ? team_arena_size : 0);
1739   int actual_arena_total_size = actual_arena_size * num_teams;
1740   size_t size = (actual_arena_total_size
1741                  + num_teams * num_threads * stack_size);
1742
1743   for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1744     {
1745       if (item->size < size)
1746         continue;
1747
1748       if (pthread_mutex_trylock (&item->in_use) == 0)
1749         break;
1750     }
1751
1752   if (!item)
1753     {
1754       GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1755                  " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
1756                  num_teams, num_threads, size);
1757
1758       if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
1759         {
1760           GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1761           return;
1762         }
1763       item = malloc (sizeof (*item));
1764       item->size = size;
1765       item->next = NULL;
1766       *next_ptr = item;
1767
1768       if (pthread_mutex_init (&item->in_use, NULL))
1769         {
1770           GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1771           return;
1772         }
1773       if (pthread_mutex_lock (&item->in_use))
1774         {
1775           GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1776           return;
1777         }
1778       if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
1779         {
1780           GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1781           return;
1782         }
1783
1784       hsa_status_t status;
1785       status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
1786                                                &item->address);
1787       if (status != HSA_STATUS_SUCCESS)
1788         hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1789       status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
1790                                                    HSA_ACCESS_PERMISSION_RW);
1791       if (status != HSA_STATUS_SUCCESS)
1792         hsa_fatal ("Could not assign arena & stack memory to device", status);
1793     }
1794
1795   kernargs->arena_ptr = (actual_arena_total_size
1796                          ? (uint64_t)item->address
1797                          : 0);
1798   kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
1799   kernargs->arena_size_per_team = actual_arena_size;
1800   kernargs->stack_size_per_thread = stack_size;
1801 }
1802
1803 /* Mark an ephemeral memory space available for reuse.  */
1804
1805 static void
1806 release_ephemeral_memories (struct agent_info* agent, void *address)
1807 {
1808   struct ephemeral_memories_list *item;
1809
1810   for (item = agent->ephemeral_memories_list; item; item = item->next)
1811     {
1812       if (item->address == address)
1813         {
1814           if (pthread_mutex_unlock (&item->in_use))
1815             GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1816           return;
1817         }
1818     }
1819   GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1820 }
1821
1822 /* Clean up all the allocated team arenas.  */
1823
1824 static bool
1825 destroy_ephemeral_memories (struct agent_info *agent)
1826 {
1827   struct ephemeral_memories_list *item, *next;
1828
1829   for (item = agent->ephemeral_memories_list; item; item = next)
1830     {
1831       next = item->next;
1832       hsa_fns.hsa_memory_free_fn (item->address);
1833       if (pthread_mutex_destroy (&item->in_use))
1834         {
1835           GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
1836           return false;
1837         }
1838       free (item);
1839     }
1840   agent->ephemeral_memories_list = NULL;
1841
1842   return true;
1843 }
1844
1845 /* Allocate memory on a specified device.  */
1846
1847 static void *
1848 alloc_by_agent (struct agent_info *agent, size_t size)
1849 {
1850   GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1851
1852   void *ptr;
1853   hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1854                                                         size, &ptr);
1855   if (status != HSA_STATUS_SUCCESS)
1856     {
1857       hsa_error ("Could not allocate device memory", status);
1858       return NULL;
1859     }
1860
1861   status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1862                                                HSA_ACCESS_PERMISSION_RW);
1863   if (status != HSA_STATUS_SUCCESS)
1864     {
1865       hsa_error ("Could not assign data memory to device", status);
1866       return NULL;
1867     }
1868
1869   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1870   bool profiling_dispatch_p
1871     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1872   if (profiling_dispatch_p)
1873     {
1874       acc_prof_info *prof_info = thr->prof_info;
1875       acc_event_info data_event_info;
1876       acc_api_info *api_info = thr->api_info;
1877
1878       prof_info->event_type = acc_ev_alloc;
1879
1880       data_event_info.data_event.event_type = prof_info->event_type;
1881       data_event_info.data_event.valid_bytes
1882         = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1883       data_event_info.data_event.parent_construct
1884         = acc_construct_parallel;
1885       data_event_info.data_event.implicit = 1;
1886       data_event_info.data_event.tool_info = NULL;
1887       data_event_info.data_event.var_name = NULL;
1888       data_event_info.data_event.bytes = size;
1889       data_event_info.data_event.host_ptr = NULL;
1890       data_event_info.data_event.device_ptr = (void *) ptr;
1891
1892       api_info->device_api = acc_device_api_other;
1893
1894       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1895                                             api_info);
1896     }
1897
1898   return ptr;
1899 }
1900
1901 /* Create kernel dispatch data structure for given KERNEL, along with
1902    the necessary device signals and memory allocations.  */
1903
1904 static struct kernel_dispatch *
1905 create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
1906                         int num_threads)
1907 {
1908   struct agent_info *agent = kernel->agent;
1909   struct kernel_dispatch *shadow
1910     = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1911
1912   shadow->agent = kernel->agent;
1913   shadow->object = kernel->object;
1914
1915   hsa_signal_t sync_signal;
1916   hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1917   if (status != HSA_STATUS_SUCCESS)
1918     hsa_fatal ("Error creating the GCN sync signal", status);
1919
1920   shadow->signal = sync_signal.handle;
1921   shadow->private_segment_size = kernel->private_segment_size;
1922
1923   if (lowlat_size < 0)
1924     {
1925       /* Divide the LDS between the number of running teams.
1926          Allocate not less than is defined in the kernel metadata.  */
1927       int teams_per_cu = num_teams / get_cu_count (agent);
1928       int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
1929       shadow->group_segment_size
1930         = (kernel->group_segment_size > LDS_per_team
1931            ? kernel->group_segment_size
1932            : LDS_per_team);;
1933     }
1934   else if (lowlat_size < GCN_LOWLAT_HEAP+8)
1935     /* Ensure that there's space for the OpenMP libgomp data.  */
1936     shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
1937   else
1938     shadow->group_segment_size = (lowlat_size > 65536
1939                                   ? 65536
1940                                   : lowlat_size);
1941
1942   /* We expect kernels to request a single pointer, explicitly, and the
1943      rest of struct kernargs, implicitly.  If they request anything else
1944      then something is wrong.  */
1945   if (kernel->kernarg_segment_size > 8)
1946     {
1947       GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1948       return NULL;
1949     }
1950
1951   status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1952                                            sizeof (struct kernargs),
1953                                            &shadow->kernarg_address);
1954   if (status != HSA_STATUS_SUCCESS)
1955     hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1956   struct kernargs *kernargs = shadow->kernarg_address;
1957
1958   /* Zero-initialize the output_data (minimum needed).  */
1959   kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
1960   kernargs->output_data.next_output = 0;
1961   for (unsigned i = 0;
1962        i < (sizeof (kernargs->output_data.queue)
1963             / sizeof (kernargs->output_data.queue[0]));
1964        i++)
1965     kernargs->output_data.queue[i].written = 0;
1966   kernargs->output_data.consumed = 0;
1967
1968   /* Pass in the heap location.  */
1969   kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
1970
1971   /* Create the ephemeral memory spaces.  */
1972   configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
1973
1974   /* Ensure we can recognize unset return values.  */
1975   kernargs->output_data.return_value = 0xcafe0000;
1976
1977   return shadow;
1978 }
1979
1980 static void
1981 process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
1982                          uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
1983 {
1984   int dev_num = dev_num64;
1985   GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
1986                           NULL);
1987 }
1988
1989 /* Output any data written to console output from the kernel.  It is expected
1990    that this function is polled during kernel execution.
1991
1992    We print all entries from the last item printed to the next entry without
1993    a "written" flag.  If the "final" flag is set then it'll continue right to
1994    the end.
1995
1996    The print buffer is circular, but the from and to locations don't wrap when
1997    the buffer does, so the output limit is UINT_MAX.  The target blocks on
1998    output when the buffer is full.  */
1999
2000 static void
2001 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
2002                 bool final)
2003 {
2004   unsigned int limit = (sizeof (kernargs->output_data.queue)
2005                         / sizeof (kernargs->output_data.queue[0]));
2006
2007   unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
2008                                        __ATOMIC_ACQUIRE);
2009   unsigned int to = kernargs->output_data.next_output;
2010
2011   if (from > to)
2012     {
2013       /* Overflow.  */
2014       if (final)
2015         printf ("GCN print buffer overflowed.\n");
2016       return;
2017     }
2018
2019   unsigned int i;
2020   for (i = from; i < to; i++)
2021     {
2022       struct printf_data *data = &kernargs->output_data.queue[i%limit];
2023
2024       if (!data->written && !final)
2025         break;
2026
2027       switch (data->type)
2028         {
2029         case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
2030         case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
2031         case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
2032         case 3: printf ("%.128s%.128s", data->msg, data->text); break;
2033         case 4:
2034           process_reverse_offload (data->value_u64[0], data->value_u64[1],
2035                                    data->value_u64[2], data->value_u64[3],
2036                                    data->value_u64[4], data->value_u64[5]);
2037           break;
2038         default: printf ("GCN print buffer error!\n"); break;
2039         }
2040       data->written = 0;
2041       __atomic_store_n (&kernargs->output_data.consumed, i+1,
2042                         __ATOMIC_RELEASE);
2043     }
2044   fflush (stdout);
2045 }
2046
2047 /* Release data structure created for a kernel dispatch in SHADOW argument,
2048    and clean up the signal and memory allocations.  */
2049
2050 static void
2051 release_kernel_dispatch (struct kernel_dispatch *shadow)
2052 {
2053   GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2054
2055   struct kernargs *kernargs = shadow->kernarg_address;
2056   void *addr = (void *)kernargs->abi.arena_ptr;
2057   if (!addr)
2058     addr = (void *)kernargs->abi.stack_ptr;
2059   release_ephemeral_memories (shadow->agent, addr);
2060
2061   hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2062
2063   hsa_signal_t s;
2064   s.handle = shadow->signal;
2065   hsa_fns.hsa_signal_destroy_fn (s);
2066
2067   free (shadow);
2068 }
2069
2070 /* Extract the properties from a kernel binary.  */
2071
2072 static void
2073 init_kernel_properties (struct kernel_info *kernel)
2074 {
2075   hsa_status_t status;
2076   struct agent_info *agent = kernel->agent;
2077   hsa_executable_symbol_t kernel_symbol;
2078   char *buf = alloca (strlen (kernel->name) + 4);
2079   sprintf (buf, "%s.kd", kernel->name);
2080   status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2081                                                  buf, agent->id,
2082                                                  0, &kernel_symbol);
2083   if (status != HSA_STATUS_SUCCESS)
2084     {
2085       hsa_warn ("Could not find symbol for kernel in the code object", status);
2086       fprintf (stderr, "not found name: '%s'\n", buf);
2087       dump_executable_symbols (agent->executable);
2088       goto failure;
2089     }
2090   GCN_DEBUG ("Located kernel %s\n", kernel->name);
2091   status = hsa_fns.hsa_executable_symbol_get_info_fn
2092     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2093   if (status != HSA_STATUS_SUCCESS)
2094     hsa_fatal ("Could not extract a kernel object from its symbol", status);
2095   status = hsa_fns.hsa_executable_symbol_get_info_fn
2096     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2097      &kernel->kernarg_segment_size);
2098   if (status != HSA_STATUS_SUCCESS)
2099     hsa_fatal ("Could not get info about kernel argument size", status);
2100   status = hsa_fns.hsa_executable_symbol_get_info_fn
2101     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2102      &kernel->group_segment_size);
2103   if (status != HSA_STATUS_SUCCESS)
2104     hsa_fatal ("Could not get info about kernel group segment size", status);
2105   status = hsa_fns.hsa_executable_symbol_get_info_fn
2106     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2107      &kernel->private_segment_size);
2108   if (status != HSA_STATUS_SUCCESS)
2109     hsa_fatal ("Could not get info about kernel private segment size",
2110                status);
2111
2112   /* The kernel type is not known until something tries to launch it.  */
2113   kernel->kind = KIND_UNKNOWN;
2114
2115   GCN_DEBUG ("Kernel structure for %s fully initialized with "
2116              "following segment sizes: \n", kernel->name);
2117   GCN_DEBUG ("  group_segment_size: %u\n",
2118              (unsigned) kernel->group_segment_size);
2119   GCN_DEBUG ("  private_segment_size: %u\n",
2120              (unsigned) kernel->private_segment_size);
2121   GCN_DEBUG ("  kernarg_segment_size: %u\n",
2122              (unsigned) kernel->kernarg_segment_size);
2123   return;
2124
2125 failure:
2126   kernel->initialization_failed = true;
2127 }
2128
2129 /* Do all the work that is necessary before running KERNEL for the first time.
2130    The function assumes the program has been created, finalized and frozen by
2131    create_and_finalize_hsa_program.  */
2132
2133 static void
2134 init_kernel (struct kernel_info *kernel)
2135 {
2136   if (pthread_mutex_lock (&kernel->init_mutex))
2137     GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2138   if (kernel->initialized)
2139     {
2140       if (pthread_mutex_unlock (&kernel->init_mutex))
2141         GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2142                            "mutex");
2143
2144       return;
2145     }
2146
2147   init_kernel_properties (kernel);
2148
2149   if (!kernel->initialization_failed)
2150     {
2151       GCN_DEBUG ("\n");
2152
2153       kernel->initialized = true;
2154     }
2155   if (pthread_mutex_unlock (&kernel->init_mutex))
2156     GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2157                        "mutex");
2158 }
2159
2160 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2161    launch attributes from KLA.
2162
2163    MODULE_LOCKED indicates that the caller already holds the lock and
2164    run_kernel need not lock it again.
2165    If AQ is NULL then agent->sync_queue will be used.  */
2166
2167 static void
2168 run_kernel (struct kernel_info *kernel, void *vars,
2169             struct GOMP_kernel_launch_attributes *kla,
2170             struct goacc_asyncqueue *aq, bool module_locked)
2171 {
2172   struct agent_info *agent = kernel->agent;
2173   GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2174              kernel->description->vpgr_count);
2175
2176   /* Reduce the number of threads/workers if there are insufficient
2177      VGPRs available to run the kernels together.  */
2178   if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2179     {
2180       int max_vgprs = max_isa_vgprs (agent->device_isa);
2181       int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2182       int max_threads = (max_vgprs / granulated_vgprs) * 4;
2183       if (kla->gdims[2] > max_threads)
2184         {
2185           GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2186                        " per team/gang - reducing to %d threads/workers.\n",
2187                        kla->gdims[2], max_threads);
2188           kla->gdims[2] = max_threads;
2189         }
2190     }
2191
2192   GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2193              (aq ? aq->id : 0));
2194   GCN_DEBUG ("GCN launch attribs: gdims:[");
2195   int i;
2196   for (i = 0; i < kla->ndim; ++i)
2197     {
2198       if (i)
2199         DEBUG_PRINT (", ");
2200       DEBUG_PRINT ("%u", kla->gdims[i]);
2201     }
2202   DEBUG_PRINT ("], normalized gdims:[");
2203   for (i = 0; i < kla->ndim; ++i)
2204     {
2205       if (i)
2206         DEBUG_PRINT (", ");
2207       DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2208     }
2209   DEBUG_PRINT ("], wdims:[");
2210   for (i = 0; i < kla->ndim; ++i)
2211     {
2212       if (i)
2213         DEBUG_PRINT (", ");
2214       DEBUG_PRINT ("%u", kla->wdims[i]);
2215     }
2216   DEBUG_PRINT ("]\n");
2217   DEBUG_FLUSH ();
2218
2219   if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2220     GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2221
2222   if (!agent->initialized)
2223     GOMP_PLUGIN_fatal ("Agent must be initialized");
2224
2225   if (!kernel->initialized)
2226     GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2227
2228   hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2229
2230   uint64_t index
2231     = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2232   GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2233
2234   /* Wait until the queue is not full before writing the packet.   */
2235   while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2236          >= command_q->size)
2237     ;
2238
2239   /* Do not allow the dimensions to be overridden when running
2240      constructors or destructors.  */
2241   int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2242   int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2243
2244   hsa_kernel_dispatch_packet_t *packet;
2245   packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2246            + index % command_q->size;
2247
2248   memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2249   packet->grid_size_x = override_x ? : kla->gdims[0];
2250   packet->workgroup_size_x = get_group_size (kla->ndim,
2251                                              packet->grid_size_x,
2252                                              kla->wdims[0]);
2253
2254   if (kla->ndim >= 2)
2255     {
2256       packet->grid_size_y = kla->gdims[1];
2257       packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2258                                                  kla->wdims[1]);
2259     }
2260   else
2261     {
2262       packet->grid_size_y = 1;
2263       packet->workgroup_size_y = 1;
2264     }
2265
2266   if (kla->ndim == 3)
2267     {
2268       packet->grid_size_z = limit_worker_threads (override_z
2269                                                   ? : kla->gdims[2]);
2270       packet->workgroup_size_z = get_group_size (kla->ndim,
2271                                                  packet->grid_size_z,
2272                                                  kla->wdims[2]);
2273     }
2274   else
2275     {
2276       packet->grid_size_z = 1;
2277       packet->workgroup_size_z = 1;
2278     }
2279
2280   GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2281              " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2282              packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2283              packet->grid_size_x / packet->workgroup_size_x,
2284              packet->grid_size_y / packet->workgroup_size_y,
2285              packet->grid_size_z / packet->workgroup_size_z,
2286              packet->workgroup_size_x, packet->workgroup_size_y,
2287              packet->workgroup_size_z);
2288
2289   struct kernel_dispatch *shadow
2290     = create_kernel_dispatch (kernel, packet->grid_size_x,
2291                               packet->grid_size_z);
2292   shadow->queue = command_q;
2293
2294   if (debug)
2295     {
2296       fprintf (stderr, "\nKernel has following dependencies:\n");
2297       print_kernel_dispatch (shadow, 2);
2298     }
2299
2300   packet->private_segment_size = shadow->private_segment_size;
2301   packet->group_segment_size = shadow->group_segment_size;
2302   packet->kernel_object = shadow->object;
2303   packet->kernarg_address = shadow->kernarg_address;
2304   hsa_signal_t s;
2305   s.handle = shadow->signal;
2306   packet->completion_signal = s;
2307   hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2308   memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2309
2310   GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2311
2312   uint16_t header;
2313   header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2314   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2315   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2316
2317   GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2318              agent->device_id);
2319
2320   packet_store_release ((uint32_t *) packet, header,
2321                         (uint16_t) kla->ndim
2322                         << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2323
2324   hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2325                                        index);
2326
2327   GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2328
2329   /* Root signal waits with 1ms timeout.  */
2330   while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2331                                              1000 * 1000,
2332                                              HSA_WAIT_STATE_BLOCKED) != 0)
2333     {
2334       console_output (kernel, shadow->kernarg_address, false);
2335     }
2336   console_output (kernel, shadow->kernarg_address, true);
2337
2338   struct kernargs *kernargs = shadow->kernarg_address;
2339   unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2340
2341   release_kernel_dispatch (shadow);
2342
2343   if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2344     GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2345
2346   unsigned int upper = (return_value & ~0xffff) >> 16;
2347   if (upper == 0xcafe)
2348     ; // exit not called, normal termination.
2349   else if (upper == 0xffff)
2350     ; // exit called.
2351   else
2352     {
2353       GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2354                          " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2355                          return_value);
2356       abort ();
2357     }
2358
2359   if (upper == 0xffff)
2360     {
2361       unsigned int signal = (return_value >> 8) & 0xff;
2362
2363       if (signal == SIGABRT)
2364         {
2365           GCN_WARNING ("GCN Kernel aborted\n");
2366           abort ();
2367         }
2368       else if (signal != 0)
2369         {
2370           GCN_WARNING ("GCN Kernel received unknown signal\n");
2371           abort ();
2372         }
2373
2374       GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2375       exit (return_value & 0xff);
2376     }
2377 }
2378
2379 /* }}}  */
2380 /* {{{ Load/Unload  */
2381
2382 /* Initialize KERNEL from D and other parameters.  Return true on success. */
2383
2384 static bool
2385 init_basic_kernel_info (struct kernel_info *kernel,
2386                         struct hsa_kernel_description *d,
2387                         struct agent_info *agent,
2388                         struct module_info *module)
2389 {
2390   kernel->agent = agent;
2391   kernel->module = module;
2392   kernel->name = d->name;
2393   kernel->description = d;
2394   if (pthread_mutex_init (&kernel->init_mutex, NULL))
2395     {
2396       GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2397       return false;
2398     }
2399   return true;
2400 }
2401
2402 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2403
2404 static bool
2405 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2406 {
2407   int isa_field = elf_gcn_isa_field (image);
2408   const char* isa_s = isa_name (isa_field);
2409   if (!isa_s)
2410     {
2411       hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2412       return false;
2413     }
2414
2415   if (isa_field != agent->device_isa)
2416     {
2417       char msg[204];
2418       const char *agent_isa_s = isa_name (agent->device_isa);
2419       assert (agent_isa_s);
2420
2421       snprintf (msg, sizeof msg,
2422                 "GCN code object ISA '%s' does not match GPU ISA '%s' "
2423                 "(device %d).\n"
2424                 "Try to recompile with '-foffload-options=-march=%s',\n"
2425                 "or use ROCR_VISIBLE_DEVICES to disable incompatible "
2426                 "devices.\n",
2427                 isa_s, agent_isa_s, agent->device_id, agent_isa_s);
2428
2429       hsa_error (msg, HSA_STATUS_ERROR);
2430       return false;
2431     }
2432
2433   return true;
2434 }
2435
2436 /* Create and finalize the program consisting of all loaded modules.  */
2437
2438 static bool
2439 create_and_finalize_hsa_program (struct agent_info *agent)
2440 {
2441   hsa_status_t status;
2442   bool res = true;
2443   if (pthread_mutex_lock (&agent->prog_mutex))
2444     {
2445       GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2446       return false;
2447     }
2448   if (agent->prog_finalized)
2449     goto final;
2450
2451   status
2452     = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2453                                         HSA_EXECUTABLE_STATE_UNFROZEN,
2454                                         "", &agent->executable);
2455   if (status != HSA_STATUS_SUCCESS)
2456     {
2457       hsa_error ("Could not create GCN executable", status);
2458       goto fail;
2459     }
2460
2461   /* Load any GCN modules.  */
2462   struct module_info *module = agent->module;
2463   if (module)
2464     {
2465       Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2466
2467       if (!isa_matches_agent (agent, image))
2468         goto fail;
2469
2470       hsa_code_object_t co = { 0 };
2471       status = hsa_fns.hsa_code_object_deserialize_fn
2472         (module->image_desc->gcn_image->image,
2473          module->image_desc->gcn_image->size,
2474          NULL, &co);
2475       if (status != HSA_STATUS_SUCCESS)
2476         {
2477           hsa_error ("Could not deserialize GCN code object", status);
2478           goto fail;
2479         }
2480
2481       status = hsa_fns.hsa_executable_load_code_object_fn
2482         (agent->executable, agent->id, co, "");
2483       if (status != HSA_STATUS_SUCCESS)
2484         {
2485           hsa_error ("Could not load GCN code object", status);
2486           goto fail;
2487         }
2488
2489       if (!module->heap)
2490         {
2491           status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2492                                                    gcn_kernel_heap_size,
2493                                                    (void**)&module->heap);
2494           if (status != HSA_STATUS_SUCCESS)
2495             {
2496               hsa_error ("Could not allocate memory for GCN heap", status);
2497               goto fail;
2498             }
2499
2500           status = hsa_fns.hsa_memory_assign_agent_fn
2501                         (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2502           if (status != HSA_STATUS_SUCCESS)
2503             {
2504               hsa_error ("Could not assign GCN heap memory to device", status);
2505               goto fail;
2506             }
2507
2508           hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2509                                       &gcn_kernel_heap_size,
2510                                       sizeof (gcn_kernel_heap_size));
2511         }
2512
2513     }
2514
2515   if (debug)
2516     dump_executable_symbols (agent->executable);
2517
2518   status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2519   if (status != HSA_STATUS_SUCCESS)
2520     {
2521       hsa_error ("Could not freeze the GCN executable", status);
2522       goto fail;
2523     }
2524
2525 final:
2526   agent->prog_finalized = true;
2527
2528   if (pthread_mutex_unlock (&agent->prog_mutex))
2529     {
2530       GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2531       res = false;
2532     }
2533
2534   return res;
2535
2536 fail:
2537   res = false;
2538   goto final;
2539 }
2540
2541 /* Free the HSA program in agent and everything associated with it and set
2542    agent->prog_finalized and the initialized flags of all kernels to false.
2543    Return TRUE on success.  */
2544
2545 static bool
2546 destroy_hsa_program (struct agent_info *agent)
2547 {
2548   if (!agent->prog_finalized)
2549     return true;
2550
2551   hsa_status_t status;
2552
2553   GCN_DEBUG ("Destroying the current GCN program.\n");
2554
2555   status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2556   if (status != HSA_STATUS_SUCCESS)
2557     return hsa_error ("Could not destroy GCN executable", status);
2558
2559   if (agent->module)
2560     {
2561       int i;
2562       for (i = 0; i < agent->module->kernel_count; i++)
2563         agent->module->kernels[i].initialized = false;
2564
2565       if (agent->module->heap)
2566         {
2567           hsa_fns.hsa_memory_free_fn (agent->module->heap);
2568           agent->module->heap = NULL;
2569         }
2570     }
2571   agent->prog_finalized = false;
2572   return true;
2573 }
2574
2575 /* Deinitialize all information associated with MODULE and kernels within
2576    it.  Return TRUE on success.  */
2577
2578 static bool
2579 destroy_module (struct module_info *module, bool locked)
2580 {
2581   /* Run destructors before destroying module.  */
2582   struct GOMP_kernel_launch_attributes kla =
2583     { 3,
2584       /* Grid size.  */
2585       { 1, 64, 1 },
2586       /* Work-group size.  */
2587       { 1, 64, 1 }
2588     };
2589
2590   if (module->fini_array_func)
2591     {
2592       init_kernel (module->fini_array_func);
2593       run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2594     }
2595   module->constructors_run_p = false;
2596
2597   int i;
2598   for (i = 0; i < module->kernel_count; i++)
2599     if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2600       {
2601         GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2602                            "mutex");
2603         return false;
2604       }
2605
2606   return true;
2607 }
2608
2609 /* }}}  */
2610 /* {{{ Async  */
2611
2612 /* Callback of dispatch queues to report errors.  */
2613
2614 static void
2615 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2616 {
2617   struct queue_entry *entry = &aq->queue[index];
2618
2619   switch (entry->type)
2620     {
2621     case KERNEL_LAUNCH:
2622       if (DEBUG_QUEUES)
2623         GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2624                    aq->agent->device_id, aq->id, index);
2625       run_kernel (entry->u.launch.kernel,
2626                   entry->u.launch.vars,
2627                   &entry->u.launch.kla, aq, false);
2628       if (DEBUG_QUEUES)
2629         GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2630                    aq->agent->device_id, aq->id, index);
2631       break;
2632
2633     case CALLBACK:
2634       if (DEBUG_QUEUES)
2635         GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2636                    aq->agent->device_id, aq->id, index);
2637       entry->u.callback.fn (entry->u.callback.data);
2638       if (DEBUG_QUEUES)
2639         GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2640                    aq->agent->device_id, aq->id, index);
2641       break;
2642
2643     case ASYNC_WAIT:
2644       {
2645         /* FIXME: is it safe to access a placeholder that may already have
2646            been executed?  */
2647         struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2648
2649         if (DEBUG_QUEUES)
2650           GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2651                      aq->agent->device_id, aq->id, index);
2652
2653         pthread_mutex_lock (&placeholderp->mutex);
2654
2655         while (!placeholderp->executed)
2656           pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2657
2658         pthread_mutex_unlock (&placeholderp->mutex);
2659
2660         if (pthread_cond_destroy (&placeholderp->cond))
2661           GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2662
2663         if (pthread_mutex_destroy (&placeholderp->mutex))
2664           GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2665
2666         if (DEBUG_QUEUES)
2667           GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2668                      "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2669       }
2670       break;
2671
2672     case ASYNC_PLACEHOLDER:
2673       pthread_mutex_lock (&entry->u.placeholder.mutex);
2674       entry->u.placeholder.executed = 1;
2675       pthread_cond_signal (&entry->u.placeholder.cond);
2676       pthread_mutex_unlock (&entry->u.placeholder.mutex);
2677       break;
2678
2679     default:
2680       GOMP_PLUGIN_fatal ("Unknown queue element");
2681     }
2682 }
2683
2684 /* This function is run as a thread to service an async queue in the
2685    background.  It runs continuously until the stop flag is set.  */
2686
2687 static void *
2688 drain_queue (void *thread_arg)
2689 {
2690   struct goacc_asyncqueue *aq = thread_arg;
2691
2692   if (DRAIN_QUEUE_SYNCHRONOUS_P)
2693     {
2694       aq->drain_queue_stop = 2;
2695       return NULL;
2696     }
2697
2698   pthread_mutex_lock (&aq->mutex);
2699
2700   while (true)
2701     {
2702       if (aq->drain_queue_stop)
2703         break;
2704
2705       if (aq->queue_n > 0)
2706         {
2707           pthread_mutex_unlock (&aq->mutex);
2708           execute_queue_entry (aq, aq->queue_first);
2709
2710           pthread_mutex_lock (&aq->mutex);
2711           aq->queue_first = ((aq->queue_first + 1)
2712                              % ASYNC_QUEUE_SIZE);
2713           aq->queue_n--;
2714
2715           if (DEBUG_THREAD_SIGNAL)
2716             GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2717                        aq->agent->device_id, aq->id);
2718           pthread_cond_broadcast (&aq->queue_cond_out);
2719           pthread_mutex_unlock (&aq->mutex);
2720
2721           if (DEBUG_QUEUES)
2722             GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2723                        aq->id);
2724           pthread_mutex_lock (&aq->mutex);
2725         }
2726       else
2727         {
2728           if (DEBUG_THREAD_SLEEP)
2729             GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2730                        aq->agent->device_id, aq->id);
2731           pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2732           if (DEBUG_THREAD_SLEEP)
2733             GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2734                        aq->agent->device_id, aq->id);
2735         }
2736     }
2737
2738   aq->drain_queue_stop = 2;
2739   if (DEBUG_THREAD_SIGNAL)
2740     GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2741                aq->agent->device_id, aq->id);
2742   pthread_cond_broadcast (&aq->queue_cond_out);
2743   pthread_mutex_unlock (&aq->mutex);
2744
2745   GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2746   return NULL;
2747 }
2748
2749 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2750    is not usually the case.  This is just a debug tool.  */
2751
2752 static void
2753 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2754 {
2755   pthread_mutex_lock (&aq->mutex);
2756
2757   while (aq->queue_n > 0)
2758     {
2759       execute_queue_entry (aq, aq->queue_first);
2760
2761       aq->queue_first = ((aq->queue_first + 1)
2762                          % ASYNC_QUEUE_SIZE);
2763       aq->queue_n--;
2764     }
2765
2766   pthread_mutex_unlock (&aq->mutex);
2767 }
2768
2769 /* Block the current thread until an async queue is writable.  The aq->mutex
2770    lock should be held on entry, and remains locked on exit.  */
2771
2772 static void
2773 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2774 {
2775   if (aq->queue_n == ASYNC_QUEUE_SIZE)
2776     {
2777       /* Queue is full.  Wait for it to not be full.  */
2778       while (aq->queue_n == ASYNC_QUEUE_SIZE)
2779         pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2780     }
2781 }
2782
2783 /* Request an asynchronous kernel launch on the specified queue.  This
2784    may block if the queue is full, but returns without waiting for the
2785    kernel to run.  */
2786
2787 static void
2788 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2789                    void *vars, struct GOMP_kernel_launch_attributes *kla)
2790 {
2791   assert (aq->agent == kernel->agent);
2792
2793   pthread_mutex_lock (&aq->mutex);
2794
2795   wait_for_queue_nonfull (aq);
2796
2797   int queue_last = ((aq->queue_first + aq->queue_n)
2798                     % ASYNC_QUEUE_SIZE);
2799   if (DEBUG_QUEUES)
2800     GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2801                aq->id, queue_last);
2802
2803   aq->queue[queue_last].type = KERNEL_LAUNCH;
2804   aq->queue[queue_last].u.launch.kernel = kernel;
2805   aq->queue[queue_last].u.launch.vars = vars;
2806   aq->queue[queue_last].u.launch.kla = *kla;
2807
2808   aq->queue_n++;
2809
2810   if (DEBUG_THREAD_SIGNAL)
2811     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2812                aq->agent->device_id, aq->id);
2813   pthread_cond_signal (&aq->queue_cond_in);
2814
2815   pthread_mutex_unlock (&aq->mutex);
2816 }
2817
2818 /* Request an asynchronous callback on the specified queue.  The callback
2819    function will be called, with the given opaque data, from the appropriate
2820    async thread, when all previous items on that queue are complete.  */
2821
2822 static void
2823 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2824                      void *data)
2825 {
2826   pthread_mutex_lock (&aq->mutex);
2827
2828   wait_for_queue_nonfull (aq);
2829
2830   int queue_last = ((aq->queue_first + aq->queue_n)
2831                     % ASYNC_QUEUE_SIZE);
2832   if (DEBUG_QUEUES)
2833     GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2834                aq->id, queue_last);
2835
2836   aq->queue[queue_last].type = CALLBACK;
2837   aq->queue[queue_last].u.callback.fn = fn;
2838   aq->queue[queue_last].u.callback.data = data;
2839
2840   aq->queue_n++;
2841
2842   if (DEBUG_THREAD_SIGNAL)
2843     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2844                aq->agent->device_id, aq->id);
2845   pthread_cond_signal (&aq->queue_cond_in);
2846
2847   pthread_mutex_unlock (&aq->mutex);
2848 }
2849
2850 /* Request that a given async thread wait for another thread (unspecified) to
2851    reach the given placeholder.  The wait will occur when all previous entries
2852    on the queue are complete.  A placeholder is effectively a kind of signal
2853    which simply sets a flag when encountered in a queue.  */
2854
2855 static void
2856 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2857                       struct placeholder *placeholderp)
2858 {
2859   pthread_mutex_lock (&aq->mutex);
2860
2861   wait_for_queue_nonfull (aq);
2862
2863   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2864   if (DEBUG_QUEUES)
2865     GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2866                aq->id, queue_last);
2867
2868   aq->queue[queue_last].type = ASYNC_WAIT;
2869   aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2870
2871   aq->queue_n++;
2872
2873   if (DEBUG_THREAD_SIGNAL)
2874     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2875                aq->agent->device_id, aq->id);
2876   pthread_cond_signal (&aq->queue_cond_in);
2877
2878   pthread_mutex_unlock (&aq->mutex);
2879 }
2880
2881 /* Add a placeholder into an async queue.  When the async thread reaches the
2882    placeholder it will set the "executed" flag to true and continue.
2883    Another thread may be waiting on this thread reaching the placeholder.  */
2884
2885 static struct placeholder *
2886 queue_push_placeholder (struct goacc_asyncqueue *aq)
2887 {
2888   struct placeholder *placeholderp;
2889
2890   pthread_mutex_lock (&aq->mutex);
2891
2892   wait_for_queue_nonfull (aq);
2893
2894   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2895   if (DEBUG_QUEUES)
2896     GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2897                aq->id, queue_last);
2898
2899   aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2900   placeholderp = &aq->queue[queue_last].u.placeholder;
2901
2902   if (pthread_mutex_init (&placeholderp->mutex, NULL))
2903     {
2904       pthread_mutex_unlock (&aq->mutex);
2905       GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2906     }
2907
2908   if (pthread_cond_init (&placeholderp->cond, NULL))
2909     {
2910       pthread_mutex_unlock (&aq->mutex);
2911       GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2912     }
2913
2914   placeholderp->executed = 0;
2915
2916   aq->queue_n++;
2917
2918   if (DEBUG_THREAD_SIGNAL)
2919     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2920                aq->agent->device_id, aq->id);
2921   pthread_cond_signal (&aq->queue_cond_in);
2922
2923   pthread_mutex_unlock (&aq->mutex);
2924
2925   return placeholderp;
2926 }
2927
2928 /* Signal an asynchronous thread to terminate, and wait for it to do so.  */
2929
2930 static void
2931 finalize_async_thread (struct goacc_asyncqueue *aq)
2932 {
2933   pthread_mutex_lock (&aq->mutex);
2934   if (aq->drain_queue_stop == 2)
2935     {
2936       pthread_mutex_unlock (&aq->mutex);
2937       return;
2938     }
2939
2940   aq->drain_queue_stop = 1;
2941
2942   if (DEBUG_THREAD_SIGNAL)
2943     GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2944                aq->agent->device_id, aq->id);
2945   pthread_cond_signal (&aq->queue_cond_in);
2946
2947   while (aq->drain_queue_stop != 2)
2948     {
2949       if (DEBUG_THREAD_SLEEP)
2950         GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2951                    " to sleep\n", aq->agent->device_id, aq->id);
2952       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2953       if (DEBUG_THREAD_SLEEP)
2954         GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
2955                    aq->agent->device_id, aq->id);
2956     }
2957
2958   GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2959              aq->id);
2960   pthread_mutex_unlock (&aq->mutex);
2961
2962   int err = pthread_join (aq->thread_drain_queue, NULL);
2963   if (err != 0)
2964     GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2965                        aq->agent->device_id, aq->id, strerror (err));
2966   GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2967 }
2968
2969 /* Set up an async queue for OpenMP.  There will be only one.  The
2970    implementation simply uses an OpenACC async queue.
2971    FIXME: is this thread-safe if two threads call this function?  */
2972
2973 static void
2974 maybe_init_omp_async (struct agent_info *agent)
2975 {
2976   if (!agent->omp_async_queue)
2977     agent->omp_async_queue
2978       = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2979 }
2980
2981 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2982    copies from read-only pages.  */
2983
2984 static void
2985 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2986 {
2987   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2988
2989   if (status == HSA_STATUS_SUCCESS)
2990     return;
2991
2992   /* It appears that the copy fails if the source data is in a read-only page.
2993      We can't detect that easily, so try copying the data to a temporary buffer
2994      and doing the copy again if we got an error above.  */
2995
2996   GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2997                "[%p:+%d]\n", (void *) src, (int) len);
2998
2999   void *src_copy = malloc (len);
3000   memcpy (src_copy, src, len);
3001   status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3002   free (src_copy);
3003   if (status != HSA_STATUS_SUCCESS)
3004     GOMP_PLUGIN_error ("memory copy failed");
3005 }
3006
3007 /* Copy data to or from a device.  This is intended for use as an async
3008    callback event.  */
3009
3010 static void
3011 copy_data (void *data_)
3012 {
3013   struct copy_data *data = (struct copy_data *)data_;
3014   GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3015              data->aq->agent->device_id, data->aq->id, data->len, data->src,
3016              data->dst);
3017   hsa_memory_copy_wrapper (data->dst, data->src, data->len);
3018   free (data);
3019 }
3020
3021 /* Request an asynchronous data copy, to or from a device, on a given queue.
3022    The event will be registered as a callback.  */
3023
3024 static void
3025 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
3026                  size_t len)
3027 {
3028   if (DEBUG_QUEUES)
3029     GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3030                aq->agent->device_id, aq->id, len, src, dst);
3031   struct copy_data *data
3032     = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3033   data->dst = dst;
3034   data->src = src;
3035   data->len = len;
3036   data->aq = aq;
3037   queue_push_callback (aq, copy_data, data);
3038 }
3039
3040 /* Return true if the given queue is currently empty.  */
3041
3042 static int
3043 queue_empty (struct goacc_asyncqueue *aq)
3044 {
3045   pthread_mutex_lock (&aq->mutex);
3046   int res = aq->queue_n == 0 ? 1 : 0;
3047   pthread_mutex_unlock (&aq->mutex);
3048
3049   return res;
3050 }
3051
3052 /* Wait for a given queue to become empty.  This implements an OpenACC wait
3053    directive.  */
3054
3055 static void
3056 wait_queue (struct goacc_asyncqueue *aq)
3057 {
3058   if (DRAIN_QUEUE_SYNCHRONOUS_P)
3059     {
3060       drain_queue_synchronous (aq);
3061       return;
3062     }
3063
3064   pthread_mutex_lock (&aq->mutex);
3065
3066   while (aq->queue_n > 0)
3067     {
3068       if (DEBUG_THREAD_SLEEP)
3069         GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3070                    aq->agent->device_id, aq->id);
3071       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3072       if (DEBUG_THREAD_SLEEP)
3073         GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
3074                    aq->id);
3075     }
3076
3077   pthread_mutex_unlock (&aq->mutex);
3078   GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3079 }
3080
3081 /* }}}  */
3082 /* {{{ OpenACC support  */
3083
3084 /* Execute an OpenACC kernel, synchronously or asynchronously.  */
3085
3086 static void
3087 gcn_exec (struct kernel_info *kernel,
3088           void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3089           struct goacc_asyncqueue *aq)
3090 {
3091   if (!GOMP_OFFLOAD_can_run (kernel))
3092     GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3093
3094   /* If we get here then this must be an OpenACC kernel.  */
3095   kernel->kind = KIND_OPENACC;
3096
3097   struct hsa_kernel_description *hsa_kernel_desc = NULL;
3098   for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3099     {
3100       struct hsa_kernel_description *d
3101         = &kernel->module->image_desc->kernel_infos[i];
3102       if (d->name == kernel->name)
3103         {
3104           hsa_kernel_desc = d;
3105           break;
3106         }
3107     }
3108
3109   /* We may have statically-determined dimensions in
3110      hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3111      invocation at runtime in dims[].  We allow static dimensions to take
3112      priority over dynamic dimensions when present (non-zero).  */
3113   if (hsa_kernel_desc->oacc_dims[0] > 0)
3114     dims[0] = hsa_kernel_desc->oacc_dims[0];
3115   if (hsa_kernel_desc->oacc_dims[1] > 0)
3116     dims[1] = hsa_kernel_desc->oacc_dims[1];
3117   if (hsa_kernel_desc->oacc_dims[2] > 0)
3118     dims[2] = hsa_kernel_desc->oacc_dims[2];
3119
3120   /* Ideally, when a dimension isn't explicitly specified, we should
3121      tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3122      In practice, we tune for peak performance on BabelStream, which
3123      for OpenACC is currently 32 threads per CU.  */
3124   if (dims[0] == 0 && dims[1] == 0)
3125     {
3126       /* If any of the OpenACC dimensions remain 0 then we get to pick a
3127          number.  There isn't really a correct answer for this without a clue
3128          about the problem size, so let's do a reasonable number of workers
3129          and gangs.  */
3130
3131       dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs.  */
3132       dims[1] = 8; /* Workers.  */
3133     }
3134   else if (dims[0] == 0 && dims[1] > 0)
3135     {
3136       /* Auto-scale the number of gangs with the requested number of workers.  */
3137       dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3138     }
3139   else if (dims[0] > 0 && dims[1] == 0)
3140     {
3141       /* Auto-scale the number of workers with the requested number of gangs.  */
3142       dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3143       if (dims[1] == 0)
3144         dims[1] = 1;
3145       if (dims[1] > 16)
3146         dims[1] = 16;
3147     }
3148
3149   /* The incoming dimensions are expressed in terms of gangs, workers, and
3150      vectors.  The HSA dimensions are expressed in terms of "work-items",
3151      which means multiples of vector lanes.
3152
3153      The "grid size" specifies the size of the problem space, and the
3154      "work-group size" specifies how much of that we want a single compute
3155      unit to chew on at once.
3156
3157      The three dimensions do not really correspond to hardware, but the
3158      important thing is that the HSA runtime will launch as many
3159      work-groups as it takes to process the entire grid, and each
3160      work-group will contain as many wave-fronts as it takes to process
3161      the work-items in that group.
3162
3163      Essentially, as long as we set the Y dimension to 64 (the number of
3164      vector lanes in hardware), and the Z group size to the maximum (16),
3165      then we will get the gangs (X) and workers (Z) launched as we expect.
3166
3167      The reason for the apparent reversal of vector and worker dimension
3168      order is to do with the way the run-time distributes work-items across
3169      v1 and v2.  */
3170   struct GOMP_kernel_launch_attributes kla =
3171     {3,
3172      /* Grid size.  */
3173      {dims[0], 64, dims[1]},
3174      /* Work-group size.  */
3175      {1,       64, 16}
3176     };
3177
3178   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3179   acc_prof_info *prof_info = thr->prof_info;
3180   acc_event_info enqueue_launch_event_info;
3181   acc_api_info *api_info = thr->api_info;
3182   bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3183   if (profiling_dispatch_p)
3184     {
3185       prof_info->event_type = acc_ev_enqueue_launch_start;
3186
3187       enqueue_launch_event_info.launch_event.event_type
3188         = prof_info->event_type;
3189       enqueue_launch_event_info.launch_event.valid_bytes
3190         = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3191       enqueue_launch_event_info.launch_event.parent_construct
3192         = acc_construct_parallel;
3193       enqueue_launch_event_info.launch_event.implicit = 1;
3194       enqueue_launch_event_info.launch_event.tool_info = NULL;
3195       enqueue_launch_event_info.launch_event.kernel_name
3196         = (char *) kernel->name;
3197       enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3198       enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3199       enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3200
3201       api_info->device_api = acc_device_api_other;
3202
3203       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3204         &enqueue_launch_event_info, api_info);
3205     }
3206
3207   if (!async)
3208     run_kernel (kernel, devaddrs, &kla, NULL, false);
3209   else
3210     queue_push_launch (aq, kernel, devaddrs, &kla);
3211
3212   if (profiling_dispatch_p)
3213     {
3214       prof_info->event_type = acc_ev_enqueue_launch_end;
3215       enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3216       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3217                                             &enqueue_launch_event_info,
3218                                             api_info);
3219     }
3220 }
3221
3222 /* }}}  */
3223 /* {{{ Generic Plugin API  */
3224
3225 /* Return the name of the accelerator, which is "gcn".  */
3226
3227 const char *
3228 GOMP_OFFLOAD_get_name (void)
3229 {
3230   return "gcn";
3231 }
3232
3233 /* Return the UID; if not available return NULL.
3234    Returns freshly allocated memoy.  */
3235
3236 const char *
3237 GOMP_OFFLOAD_get_uid (int ord)
3238 {
3239   char *str;
3240   hsa_status_t status;
3241   struct agent_info *agent = get_agent_info (ord);
3242
3243   /* HSA documentation states: maximally 21 characters including NUL.  */
3244   str = GOMP_PLUGIN_malloc (21 * sizeof (char));
3245   status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AMD_AGENT_INFO_UUID,
3246                                           str);
3247   if (status != HSA_STATUS_SUCCESS)
3248     {
3249       free (str);
3250       return NULL;
3251     }
3252   return str;
3253 }
3254
3255 /* Return the specific capabilities the HSA accelerator have.  */
3256
3257 unsigned int
3258 GOMP_OFFLOAD_get_caps (void)
3259 {
3260   /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
3261   return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3262             | GOMP_OFFLOAD_CAP_OPENACC_200;
3263 }
3264
3265 /* Identify as GCN accelerator.  */
3266
3267 int
3268 GOMP_OFFLOAD_get_type (void)
3269 {
3270   return OFFLOAD_TARGET_TYPE_GCN;
3271 }
3272
3273 /* Return the libgomp version number we're compatible with.  There is
3274    no requirement for cross-version compatibility.  */
3275
3276 unsigned
3277 GOMP_OFFLOAD_version (void)
3278 {
3279   return GOMP_VERSION;
3280 }
3281
3282 /* Return the number of GCN devices on the system.  */
3283
3284 int
3285 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
3286 {
3287   if (!init_hsa_context (true))
3288     exit (EXIT_FAILURE);
3289   /* Return -1 if no omp_requires_mask cannot be fulfilled but
3290      devices were present.  */
3291   if (hsa_context.agent_count > 0
3292       && ((omp_requires_mask
3293            & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3294                | GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
3295                | GOMP_REQUIRES_SELF_MAPS
3296                | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
3297     return -1;
3298   /* Check whether host page access is supported; this is per system level
3299      (all GPUs supported by HSA).  While intrinsically true for APUs, it
3300      requires XNACK support for discrete GPUs.  */
3301   if (hsa_context.agent_count > 0
3302       && (omp_requires_mask
3303           & (GOMP_REQUIRES_UNIFIED_SHARED_MEMORY | GOMP_REQUIRES_SELF_MAPS)))
3304     {
3305       bool b;
3306       hsa_system_info_t type = HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT;
3307       hsa_status_t status = hsa_fns.hsa_system_get_info_fn (type, &b);
3308       if (status != HSA_STATUS_SUCCESS)
3309         GOMP_PLUGIN_error ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT "
3310                            "failed");
3311       if (!b)
3312         return -1;
3313     }
3314
3315   return hsa_context.agent_count;
3316 }
3317
3318 /* Initialize device (agent) number N so that it can be used for computation.
3319    Return TRUE on success.  */
3320
3321 bool
3322 GOMP_OFFLOAD_init_device (int n)
3323 {
3324   if (!init_hsa_context (false))
3325     return false;
3326   if (n >= hsa_context.agent_count)
3327     {
3328       GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3329       return false;
3330     }
3331   struct agent_info *agent = &hsa_context.agents[n];
3332
3333   if (agent->initialized)
3334     return true;
3335
3336   agent->device_id = n;
3337
3338   if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3339     {
3340       GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3341       return false;
3342     }
3343   if (pthread_mutex_init (&agent->prog_mutex, NULL))
3344     {
3345       GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3346       return false;
3347     }
3348   if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3349     {
3350       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3351       return false;
3352     }
3353   if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
3354     {
3355       GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3356       return false;
3357     }
3358   agent->async_queues = NULL;
3359   agent->omp_async_queue = NULL;
3360   agent->ephemeral_memories_list = NULL;
3361
3362   uint32_t queue_size;
3363   hsa_status_t status;
3364   status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3365                                           HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3366                                           &queue_size);
3367   if (status != HSA_STATUS_SUCCESS)
3368     return hsa_error ("Error requesting maximum queue size of the GCN agent",
3369                       status);
3370
3371   status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3372                                           &agent->name);
3373   if (status != HSA_STATUS_SUCCESS)
3374     return hsa_error ("Error querying the name of the agent", status);
3375
3376   agent->device_isa = isa_code (agent->name);
3377   if (agent->device_isa == EF_AMDGPU_MACH_UNSUPPORTED)
3378     {
3379       char msg[33 + 64 + 1];
3380       snprintf (msg, sizeof msg,
3381                 "Unknown GCN agent architecture '%s'", agent->name);
3382       return hsa_error (msg, HSA_STATUS_ERROR);
3383     }
3384
3385   status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3386                                           &agent->vendor_name);
3387   if (status != HSA_STATUS_SUCCESS)
3388     return hsa_error ("Error querying the vendor name of the agent", status);
3389
3390   status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3391                                         HSA_QUEUE_TYPE_MULTI,
3392                                         hsa_queue_callback, NULL, UINT32_MAX,
3393                                         UINT32_MAX, &agent->sync_queue);
3394   if (status != HSA_STATUS_SUCCESS)
3395     return hsa_error ("Error creating command queue", status);
3396
3397   agent->kernarg_region.handle = (uint64_t) -1;
3398   status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3399                                                  get_kernarg_memory_region,
3400                                                  &agent->kernarg_region);
3401   if (status != HSA_STATUS_SUCCESS
3402       && status != HSA_STATUS_INFO_BREAK)
3403     hsa_error ("Scanning memory regions failed", status);
3404   if (agent->kernarg_region.handle == (uint64_t) -1)
3405     {
3406       GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3407                          "arguments");
3408       return false;
3409     }
3410   GCN_DEBUG ("Selected kernel arguments memory region:\n");
3411   dump_hsa_region (agent->kernarg_region, NULL);
3412
3413   agent->data_region.handle = (uint64_t) -1;
3414   status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3415                                                  get_data_memory_region,
3416                                                  &agent->data_region);
3417   if (status != HSA_STATUS_SUCCESS
3418       && status != HSA_STATUS_INFO_BREAK)
3419     hsa_error ("Scanning memory regions failed", status);
3420   if (agent->data_region.handle == (uint64_t) -1)
3421     {
3422       GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3423                          "data");
3424       return false;
3425     }
3426   GCN_DEBUG ("Selected device data memory region:\n");
3427   dump_hsa_region (agent->data_region, NULL);
3428
3429   GCN_DEBUG ("GCN agent %d initialized\n", n);
3430
3431   agent->initialized = true;
3432   return true;
3433 }
3434
3435 /* Load GCN object-code module described by struct gcn_image_desc in
3436    TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3437    If there are any constructors then run them.  If not NULL, REV_FN_TABLE will
3438    contain the on-device addresses of the functions for reverse offload.  To be
3439    freed by the caller.  */
3440
3441 int
3442 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3443                          struct addr_pair **target_table,
3444                          uint64_t **rev_fn_table,
3445                          uint64_t *host_ind_fn_table)
3446 {
3447   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3448     {
3449       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3450                          " (expected %u, received %u)",
3451                          GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3452       return -1;
3453     }
3454
3455   struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3456   struct agent_info *agent;
3457   struct addr_pair *pair;
3458   struct module_info *module;
3459   struct kernel_info *kernel;
3460   int kernel_count = image_desc->kernel_count;
3461   unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
3462                               ? image_desc->ind_func_count : 0;
3463   unsigned var_count = image_desc->global_variable_count;
3464   /* Currently, "others" is a struct of ICVS.  */
3465   int other_count = 1;
3466
3467   agent = get_agent_info (ord);
3468   if (!agent)
3469     return -1;
3470
3471   if (pthread_rwlock_wrlock (&agent->module_rwlock))
3472     {
3473       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3474       return -1;
3475     }
3476   if (agent->prog_finalized
3477       && !destroy_hsa_program (agent))
3478     return -1;
3479
3480   GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3481   GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
3482   GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3483   GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
3484   pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
3485                              * sizeof (struct addr_pair));
3486   *target_table = pair;
3487   module = (struct module_info *)
3488     GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3489                                 + kernel_count * sizeof (struct kernel_info));
3490   module->image_desc = image_desc;
3491   module->kernel_count = kernel_count;
3492   module->heap = NULL;
3493   module->constructors_run_p = false;
3494
3495   kernel = &module->kernels[0];
3496
3497   /* Allocate memory for kernel dependencies.  */
3498   for (unsigned i = 0; i < kernel_count; i++)
3499     {
3500       struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3501       if (!init_basic_kernel_info (kernel, d, agent, module))
3502         return -1;
3503       if (strcmp (d->name, "_init_array") == 0)
3504         module->init_array_func = kernel;
3505       else if (strcmp (d->name, "_fini_array") == 0)
3506         module->fini_array_func = kernel;
3507       else
3508         {
3509           pair->start = (uintptr_t) kernel;
3510           pair->end = (uintptr_t) (kernel + 1);
3511           pair++;
3512         }
3513       kernel++;
3514     }
3515
3516   agent->module = module;
3517   if (pthread_rwlock_unlock (&agent->module_rwlock))
3518     {
3519       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3520       return -1;
3521     }
3522
3523   if (!create_and_finalize_hsa_program (agent))
3524     return -1;
3525
3526   if (var_count > 0)
3527     {
3528       hsa_status_t status;
3529       hsa_executable_symbol_t var_symbol;
3530       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3531                                                      ".offload_var_table",
3532                                                      agent->id,
3533                                                      0, &var_symbol);
3534
3535       if (status != HSA_STATUS_SUCCESS)
3536         hsa_fatal ("Could not find symbol for variable in the code object",
3537                    status);
3538
3539       uint64_t var_table_addr;
3540       status = hsa_fns.hsa_executable_symbol_get_info_fn
3541         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3542          &var_table_addr);
3543       if (status != HSA_STATUS_SUCCESS)
3544         hsa_fatal ("Could not extract a variable from its symbol", status);
3545
3546       struct {
3547         uint64_t addr;
3548         uint64_t size;
3549       } var_table[var_count];
3550       GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3551                              (void*)var_table_addr, sizeof (var_table));
3552
3553       for (unsigned i = 0; i < var_count; i++)
3554         {
3555           pair->start = var_table[i].addr;
3556           pair->end = var_table[i].addr + var_table[i].size;
3557           GCN_DEBUG ("Found variable at %p with size %lu\n",
3558                      (void *)var_table[i].addr, var_table[i].size);
3559           pair++;
3560         }
3561     }
3562
3563   if (ind_func_count > 0)
3564     {
3565       hsa_status_t status;
3566
3567       /* Read indirect function table from image.  */
3568       hsa_executable_symbol_t ind_funcs_symbol;
3569       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3570                                                      ".offload_ind_func_table",
3571                                                      agent->id,
3572                                                      0, &ind_funcs_symbol);
3573
3574       if (status != HSA_STATUS_SUCCESS)
3575         hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
3576                    "code object", status);
3577
3578       uint64_t ind_funcs_table_addr;
3579       status = hsa_fns.hsa_executable_symbol_get_info_fn
3580         (ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3581          &ind_funcs_table_addr);
3582       if (status != HSA_STATUS_SUCCESS)
3583         hsa_fatal ("Could not extract a variable from its symbol", status);
3584
3585       uint64_t ind_funcs_table[ind_func_count];
3586       GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
3587                              (void*) ind_funcs_table_addr,
3588                              sizeof (ind_funcs_table));
3589
3590       /* Build host->target address map for indirect functions.  */
3591       uint64_t ind_fn_map[ind_func_count * 2 + 1];
3592       for (unsigned i = 0; i < ind_func_count; i++)
3593         {
3594           ind_fn_map[i * 2] = host_ind_fn_table[i];
3595           ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
3596           GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
3597                      i, host_ind_fn_table[i], ind_funcs_table[i]);
3598         }
3599       ind_fn_map[ind_func_count * 2] = 0;
3600
3601       /* Write the map onto the target.  */
3602       void *map_target_addr
3603         = GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
3604       GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);
3605
3606       GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
3607                              (void*) ind_fn_map,
3608                              sizeof (ind_fn_map));
3609
3610       /* Write address of the map onto the target.  */
3611       hsa_executable_symbol_t symbol;
3612
3613       status
3614         = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3615                                                 XSTRING (GOMP_INDIRECT_ADDR_MAP),
3616                                                 agent->id, 0, &symbol);
3617       if (status != HSA_STATUS_SUCCESS)
3618         hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
3619                    status);
3620
3621       uint64_t varptr;
3622       uint32_t varsize;
3623
3624       status = hsa_fns.hsa_executable_symbol_get_info_fn
3625         (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3626          &varptr);
3627       if (status != HSA_STATUS_SUCCESS)
3628         hsa_fatal ("Could not extract a variable from its symbol", status);
3629       status = hsa_fns.hsa_executable_symbol_get_info_fn
3630         (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3631         &varsize);
3632       if (status != HSA_STATUS_SUCCESS)
3633         hsa_fatal ("Could not extract a variable size from its symbol",
3634                    status);
3635
3636       GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
3637                  varptr, varsize);
3638
3639       GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
3640                              &map_target_addr,
3641                              sizeof (map_target_addr));
3642     }
3643
3644   GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));
3645
3646   hsa_status_t status;
3647   hsa_executable_symbol_t var_symbol;
3648   status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3649                                                  XSTRING (GOMP_ADDITIONAL_ICVS),
3650                                                  agent->id, 0, &var_symbol);
3651   if (status == HSA_STATUS_SUCCESS)
3652     {
3653       uint64_t varptr;
3654       uint32_t varsize;
3655
3656       status = hsa_fns.hsa_executable_symbol_get_info_fn
3657         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3658          &varptr);
3659       if (status != HSA_STATUS_SUCCESS)
3660         hsa_fatal ("Could not extract a variable from its symbol", status);
3661       status = hsa_fns.hsa_executable_symbol_get_info_fn
3662         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3663          &varsize);
3664       if (status != HSA_STATUS_SUCCESS)
3665         hsa_fatal ("Could not extract a variable size from its symbol",
3666                    status);
3667
3668       pair->start = varptr;
3669       pair->end = varptr + varsize;
3670     }
3671   else
3672     {
3673       /* The variable was not in this image.  */
3674       GCN_DEBUG ("Variable not found in image: %s\n",
3675                  XSTRING (GOMP_ADDITIONAL_ICVS));
3676       pair->start = pair->end = 0;
3677     }
3678
3679   /* Ensure that constructors are run first.  */
3680   struct GOMP_kernel_launch_attributes kla =
3681     { 3,
3682       /* Grid size.  */
3683       { 1, 64, 1 },
3684       /* Work-group size.  */
3685       { 1, 64, 1 }
3686     };
3687
3688   if (module->init_array_func)
3689     {
3690       init_kernel (module->init_array_func);
3691       run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3692     }
3693   module->constructors_run_p = true;
3694
3695   /* Don't report kernels that libgomp need not know about.  */
3696   if (module->init_array_func)
3697     kernel_count--;
3698   if (module->fini_array_func)
3699     kernel_count--;
3700
3701   if (rev_fn_table != NULL && kernel_count == 0)
3702     *rev_fn_table = NULL;
3703   else if (rev_fn_table != NULL)
3704     {
3705       hsa_status_t status;
3706       hsa_executable_symbol_t var_symbol;
3707       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3708                                                      ".offload_func_table",
3709                                                      agent->id, 0, &var_symbol);
3710       if (status != HSA_STATUS_SUCCESS)
3711         hsa_fatal ("Could not find symbol for variable in the code object",
3712                    status);
3713       uint64_t fn_table_addr;
3714       status = hsa_fns.hsa_executable_symbol_get_info_fn
3715         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3716          &fn_table_addr);
3717       if (status != HSA_STATUS_SUCCESS)
3718         hsa_fatal ("Could not extract a variable from its symbol", status);
3719       *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
3720       GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
3721                              (void*) fn_table_addr,
3722                              kernel_count * sizeof (uint64_t));
3723     }
3724
3725   return kernel_count + var_count + other_count;
3726 }
3727
3728 /* Unload GCN object-code module described by struct gcn_image_desc in
3729    TARGET_DATA from agent number N.  Return TRUE on success.  */
3730
3731 bool
3732 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3733 {
3734   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3735     {
3736       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3737                          " (expected %u, received %u)",
3738                          GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3739       return false;
3740     }
3741
3742   struct agent_info *agent;
3743   agent = get_agent_info (n);
3744   if (!agent)
3745     return false;
3746
3747   if (pthread_rwlock_wrlock (&agent->module_rwlock))
3748     {
3749       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3750       return false;
3751     }
3752
3753   if (!agent->module || agent->module->image_desc != target_data)
3754     {
3755       GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3756                          "loaded before");
3757       return false;
3758     }
3759
3760   if (!destroy_module (agent->module, true))
3761     return false;
3762   free (agent->module);
3763   agent->module = NULL;
3764   if (!destroy_hsa_program (agent))
3765     return false;
3766   if (pthread_rwlock_unlock (&agent->module_rwlock))
3767     {
3768       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3769       return false;
3770     }
3771   return true;
3772 }
3773
3774 /* Deinitialize all information and status associated with agent number N.  We
3775    do not attempt any synchronization, assuming the user and libgomp will not
3776    attempt deinitialization of a device that is in any way being used at the
3777    same time.  Return TRUE on success.  */
3778
3779 bool
3780 GOMP_OFFLOAD_fini_device (int n)
3781 {
3782   struct agent_info *agent = get_agent_info (n);
3783   if (!agent)
3784     return false;
3785
3786   if (!agent->initialized)
3787     return true;
3788
3789   if (agent->omp_async_queue)
3790     {
3791       GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3792       agent->omp_async_queue = NULL;
3793     }
3794
3795   if (agent->module)
3796     {
3797       if (!destroy_module (agent->module, false))
3798         return false;
3799       free (agent->module);
3800       agent->module = NULL;
3801     }
3802
3803   if (!destroy_ephemeral_memories (agent))
3804     return false;
3805
3806   if (!destroy_hsa_program (agent))
3807     return false;
3808
3809   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3810   if (status != HSA_STATUS_SUCCESS)
3811     return hsa_error ("Error destroying command queue", status);
3812
3813   if (pthread_mutex_destroy (&agent->prog_mutex))
3814     {
3815       GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3816       return false;
3817     }
3818   if (pthread_rwlock_destroy (&agent->module_rwlock))
3819     {
3820       GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3821       return false;
3822     }
3823
3824   if (pthread_mutex_destroy (&agent->async_queues_mutex))
3825     {
3826       GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3827       return false;
3828     }
3829   if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
3830     {
3831       GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
3832       return false;
3833     }
3834   agent->initialized = false;
3835   return true;
3836 }
3837
3838 /* Return true if the HSA runtime can run function FN_PTR.  */
3839
3840 bool
3841 GOMP_OFFLOAD_can_run (void *fn_ptr)
3842 {
3843   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3844
3845   init_kernel (kernel);
3846   if (kernel->initialization_failed)
3847     GOMP_PLUGIN_fatal ("kernel initialization failed");
3848
3849   return true;
3850 }
3851
3852 /* Allocate memory on device N.  */
3853
3854 void *
3855 GOMP_OFFLOAD_alloc (int n, size_t size)
3856 {
3857   struct agent_info *agent = get_agent_info (n);
3858   return alloc_by_agent (agent, size);
3859 }
3860
3861 /* Free memory from device N.  */
3862
3863 bool
3864 GOMP_OFFLOAD_free (int device, void *ptr)
3865 {
3866   GCN_DEBUG ("Freeing memory on device %d\n", device);
3867
3868   hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3869   if (status != HSA_STATUS_SUCCESS)
3870     {
3871       hsa_error ("Could not free device memory", status);
3872       return false;
3873     }
3874
3875   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3876   bool profiling_dispatch_p
3877     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3878   if (profiling_dispatch_p)
3879     {
3880       acc_prof_info *prof_info = thr->prof_info;
3881       acc_event_info data_event_info;
3882       acc_api_info *api_info = thr->api_info;
3883
3884       prof_info->event_type = acc_ev_free;
3885
3886       data_event_info.data_event.event_type = prof_info->event_type;
3887       data_event_info.data_event.valid_bytes
3888         = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3889       data_event_info.data_event.parent_construct
3890         = acc_construct_parallel;
3891       data_event_info.data_event.implicit = 1;
3892       data_event_info.data_event.tool_info = NULL;
3893       data_event_info.data_event.var_name = NULL;
3894       data_event_info.data_event.bytes = 0;
3895       data_event_info.data_event.host_ptr = NULL;
3896       data_event_info.data_event.device_ptr = (void *) ptr;
3897
3898       api_info->device_api = acc_device_api_other;
3899
3900       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3901                                             api_info);
3902     }
3903
3904   return true;
3905 }
3906
3907 /* Copy data from DEVICE to host.  */
3908
3909 bool
3910 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3911 {
3912   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3913              src, dst);
3914   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3915   if (status != HSA_STATUS_SUCCESS)
3916     GOMP_PLUGIN_error ("memory copy failed");
3917   return true;
3918 }
3919
3920 /* Copy data from host to DEVICE.  */
3921
3922 bool
3923 GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3924 {
3925   GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3926              device, dst);
3927   hsa_memory_copy_wrapper (dst, src, n);
3928   return true;
3929 }
3930
3931 /* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */
3932
3933 bool
3934 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3935 {
3936   struct gcn_thread *thread_data = gcn_thread ();
3937
3938   if (thread_data && !async_synchronous_p (thread_data->async))
3939     {
3940       struct agent_info *agent = get_agent_info (device);
3941       maybe_init_omp_async (agent);
3942       queue_push_copy (agent->omp_async_queue, dst, src, n);
3943       return true;
3944     }
3945
3946   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3947              device, src, device, dst);
3948   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3949   if (status != HSA_STATUS_SUCCESS)
3950     GOMP_PLUGIN_error ("memory copy failed");
3951   return true;
3952 }
3953
3954 /* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
3955    measured in bytes.  So we have:
3956
3957    dim1_size: number of bytes to copy on innermost dimension ("row")
3958    dim0_len: number of rows to copy
3959    dst: base pointer for destination of copy
3960    dst_offset1_size: innermost row offset (for dest), in bytes
3961    dst_offset0_len: offset, number of rows (for dest)
3962    dst_dim1_size: whole-array dest row length, in bytes (pitch)
3963    src: base pointer for source of copy
3964    src_offset1_size: innermost row offset (for source), in bytes
3965    src_offset0_len: offset, number of rows (for source)
3966    src_dim1_size: whole-array source row length, in bytes (pitch)
3967 */
3968
3969 int
3970 GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
3971                        size_t dim0_len, void *dst, size_t dst_offset1_size,
3972                        size_t dst_offset0_len, size_t dst_dim1_size,
3973                        const void *src, size_t src_offset1_size,
3974                        size_t src_offset0_len, size_t src_dim1_size)
3975 {
3976   if (!hsa_fns.hsa_amd_memory_lock_fn
3977       || !hsa_fns.hsa_amd_memory_unlock_fn
3978       || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
3979     return -1;
3980
3981   /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
3982      out quietly if we have anything oddly-aligned rather than letting the
3983      driver raise an error.  */
3984   if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
3985     return -1;
3986
3987   if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
3988     return -1;
3989
3990   /* Only handle host to device or device to host transfers here.  */
3991   if ((dst_ord == -1 && src_ord == -1)
3992       || (dst_ord != -1 && src_ord != -1))
3993     return -1;
3994
3995   hsa_amd_copy_direction_t dir
3996     = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
3997   hsa_agent_t copy_agent;
3998
3999   /* We need to pin (lock) host memory before we start the transfer.  Try to
4000      lock the minimum size necessary, i.e. using partial first/last rows of the
4001      whole array.  Something like this:
4002
4003          rows -->
4004          ..............
4005      c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
4006      o | ++#######+++++ <- whole row
4007      l | ++#######+++++ <-     "
4008      s v ++#######..... <- last row apart from trailing remainder
4009          ..............
4010
4011      We could split very large transfers into several rectangular copies, but
4012      that is unimplemented for now.  */
4013
4014   size_t bounded_size_host, first_elem_offset_host;
4015   void *host_ptr;
4016   if (dir == hsaHostToDevice)
4017     {
4018       bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
4019       first_elem_offset_host = src_offset0_len * src_dim1_size
4020                                + src_offset1_size;
4021       host_ptr = (void *) src;
4022       struct agent_info *agent = get_agent_info (dst_ord);
4023       copy_agent = agent->id;
4024     }
4025   else
4026     {
4027       bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
4028       first_elem_offset_host = dst_offset0_len * dst_dim1_size
4029                                + dst_offset1_size;
4030       host_ptr = dst;
4031       struct agent_info *agent = get_agent_info (src_ord);
4032       copy_agent = agent->id;
4033     }
4034
4035   void *agent_ptr;
4036
4037   hsa_status_t status
4038     = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
4039                                       bounded_size_host, NULL, 0, &agent_ptr);
4040   /* We can't lock the host memory: don't give up though, we might still be
4041      able to use the slow path in our caller.  So, don't make this an
4042      error.  */
4043   if (status != HSA_STATUS_SUCCESS)
4044     return -1;
4045
4046   hsa_pitched_ptr_t dstpp, srcpp;
4047   hsa_dim3_t dst_offsets, src_offsets, ranges;
4048
4049   int retval = 1;
4050
4051   hsa_signal_t completion_signal;
4052   status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
4053   if (status != HSA_STATUS_SUCCESS)
4054     {
4055       retval = -1;
4056       goto unlock;
4057     }
4058
4059   if (dir == hsaHostToDevice)
4060     {
4061       srcpp.base = agent_ptr - first_elem_offset_host;
4062       dstpp.base = dst;
4063     }
4064   else
4065     {
4066       srcpp.base = (void *) src;
4067       dstpp.base = agent_ptr - first_elem_offset_host;
4068     }
4069
4070   srcpp.pitch = src_dim1_size;
4071   srcpp.slice = 0;
4072
4073   src_offsets.x = src_offset1_size;
4074   src_offsets.y = src_offset0_len;
4075   src_offsets.z = 0;
4076
4077   dstpp.pitch = dst_dim1_size;
4078   dstpp.slice = 0;
4079
4080   dst_offsets.x = dst_offset1_size;
4081   dst_offsets.y = dst_offset0_len;
4082   dst_offsets.z = 0;
4083
4084   ranges.x = dim1_size;
4085   ranges.y = dim0_len;
4086   ranges.z = 1;
4087
4088   status
4089     = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
4090                                                  &src_offsets, &ranges,
4091                                                  copy_agent, dir, 0, NULL,
4092                                                  completion_signal);
4093   /* If the rectangular copy fails, we might still be able to use the slow
4094      path.  We need to unlock the host memory though, so don't return
4095      immediately.  */
4096   if (status != HSA_STATUS_SUCCESS)
4097     retval = -1;
4098   else
4099     hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
4100                                         HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
4101                                         HSA_WAIT_STATE_ACTIVE);
4102
4103   hsa_fns.hsa_signal_destroy_fn (completion_signal);
4104
4105 unlock:
4106   status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
4107   if (status != HSA_STATUS_SUCCESS)
4108     hsa_fatal ("Could not unlock host memory", status);
4109
4110   return retval;
4111 }
4112
4113 /* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
4114    measured in bytes.  So we have:
4115
4116    dim2_size: number of bytes to copy on innermost dimension ("row")
4117    dim1_len: number of rows per slice to copy
4118    dim0_len: number of slices to copy
4119    dst: base pointer for destination of copy
4120    dst_offset2_size: innermost row offset (for dest), in bytes
4121    dst_offset1_len: offset, number of rows (for dest)
4122    dst_offset0_len: offset, number of slices (for dest)
4123    dst_dim2_size: whole-array dest row length, in bytes (pitch)
4124    dst_dim1_len: whole-array number of rows in slice (for dest)
4125    src: base pointer for source of copy
4126    src_offset2_size: innermost row offset (for source), in bytes
4127    src_offset1_len: offset, number of rows (for source)
4128    src_offset0_len: offset, number of slices (for source)
4129    src_dim2_size: whole-array source row length, in bytes (pitch)
4130    src_dim1_len: whole-array number of rows in slice (for source)
4131 */
4132
4133 int
4134 GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
4135                        size_t dim1_len, size_t dim0_len, void *dst,
4136                        size_t dst_offset2_size, size_t dst_offset1_len,
4137                        size_t dst_offset0_len, size_t dst_dim2_size,
4138                        size_t dst_dim1_len, const void *src,
4139                        size_t src_offset2_size, size_t src_offset1_len,
4140                        size_t src_offset0_len, size_t src_dim2_size,
4141                        size_t src_dim1_len)
4142 {
4143   if (!hsa_fns.hsa_amd_memory_lock_fn
4144       || !hsa_fns.hsa_amd_memory_unlock_fn
4145       || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
4146     return -1;
4147
4148   /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
4149      out quietly if we have anything oddly-aligned rather than letting the
4150      driver raise an error.  */
4151   if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
4152     return -1;
4153
4154   if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
4155     return -1;
4156
4157   /* Only handle host to device or device to host transfers here.  */
4158   if ((dst_ord == -1 && src_ord == -1)
4159       || (dst_ord != -1 && src_ord != -1))
4160     return -1;
4161
4162   hsa_amd_copy_direction_t dir
4163     = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
4164   hsa_agent_t copy_agent;
4165
4166   /* We need to pin (lock) host memory before we start the transfer.  Try to
4167      lock the minimum size necessary, i.e. using partial first/last slices of
4168      the whole 3D array.  Something like this:
4169
4170          slice 0:          slice 1:          slice 2:
4171              __________        __________        __________
4172        ^    /+++++++++/    :  /+++++++++/    :  /         /
4173     column /+++##++++/|    | /+++##++++/|    | /+++##    /  # = subarray
4174      /    /   ##++++/ |    |/+++##++++/ |    |/+++##++++/   + = area to pin
4175          /_________/  :    /_________/  :    /_________/
4176         row --->
4177
4178      We could split very large transfers into several rectangular copies, but
4179      that is unimplemented for now.  */
4180
4181   size_t bounded_size_host, first_elem_offset_host;
4182   void *host_ptr;
4183   if (dir == hsaHostToDevice)
4184     {
4185       size_t slice_bytes = src_dim2_size * src_dim1_len;
4186       bounded_size_host = slice_bytes * (dim0_len - 1)
4187                           + src_dim2_size * (dim1_len - 1)
4188                           + dim2_size;
4189       first_elem_offset_host = src_offset0_len * slice_bytes
4190                                + src_offset1_len * src_dim2_size
4191                                + src_offset2_size;
4192       host_ptr = (void *) src;
4193       struct agent_info *agent = get_agent_info (dst_ord);
4194       copy_agent = agent->id;
4195     }
4196   else
4197     {
4198       size_t slice_bytes = dst_dim2_size * dst_dim1_len;
4199       bounded_size_host = slice_bytes * (dim0_len - 1)
4200                           + dst_dim2_size * (dim1_len - 1)
4201                           + dim2_size;
4202       first_elem_offset_host = dst_offset0_len * slice_bytes
4203                                + dst_offset1_len * dst_dim2_size
4204                                + dst_offset2_size;
4205       host_ptr = dst;
4206       struct agent_info *agent = get_agent_info (src_ord);
4207       copy_agent = agent->id;
4208     }
4209
4210   void *agent_ptr;
4211
4212   hsa_status_t status
4213     = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
4214                                       bounded_size_host, NULL, 0, &agent_ptr);
4215   /* We can't lock the host memory: don't give up though, we might still be
4216      able to use the slow path in our caller (maybe even with iterated memcpy2d
4217      calls).  So, don't make this an error.  */
4218   if (status != HSA_STATUS_SUCCESS)
4219     return -1;
4220
4221   hsa_pitched_ptr_t dstpp, srcpp;
4222   hsa_dim3_t dst_offsets, src_offsets, ranges;
4223
4224   int retval = 1;
4225
4226   hsa_signal_t completion_signal;
4227   status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
4228   if (status != HSA_STATUS_SUCCESS)
4229     {
4230       retval = -1;
4231       goto unlock;
4232     }
4233
4234   if (dir == hsaHostToDevice)
4235     {
4236       srcpp.base = agent_ptr - first_elem_offset_host;
4237       dstpp.base = dst;
4238     }
4239   else
4240     {
4241       srcpp.base = (void *) src;
4242       dstpp.base = agent_ptr - first_elem_offset_host;
4243     }
4244
4245   /* Pitch is measured in bytes.  */
4246   srcpp.pitch = src_dim2_size;
4247   /* Slice is also measured in bytes (i.e. total per-slice).  */
4248   srcpp.slice = src_dim2_size * src_dim1_len;
4249
4250   src_offsets.x = src_offset2_size;
4251   src_offsets.y = src_offset1_len;
4252   src_offsets.z = src_offset0_len;
4253
4254   /* As above.  */
4255   dstpp.pitch = dst_dim2_size;
4256   dstpp.slice = dst_dim2_size * dst_dim1_len;
4257
4258   dst_offsets.x = dst_offset2_size;
4259   dst_offsets.y = dst_offset1_len;
4260   dst_offsets.z = dst_offset0_len;
4261
4262   ranges.x = dim2_size;
4263   ranges.y = dim1_len;
4264   ranges.z = dim0_len;
4265
4266   status
4267     = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
4268                                                  &src_offsets, &ranges,
4269                                                  copy_agent, dir, 0, NULL,
4270                                                  completion_signal);
4271   /* If the rectangular copy fails, we might still be able to use the slow
4272      path.  We need to unlock the host memory though, so don't return
4273      immediately.  */
4274   if (status != HSA_STATUS_SUCCESS)
4275     retval = -1;
4276   else
4277     {
4278       hsa_signal_value_t sv
4279         = hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
4280                                               HSA_SIGNAL_CONDITION_LT, 1,
4281                                               UINT64_MAX,
4282                                               HSA_WAIT_STATE_ACTIVE);
4283       if (sv < 0)
4284         {
4285           GCN_WARNING ("async copy rect failure");
4286           retval = -1;
4287         }
4288     }
4289
4290   hsa_fns.hsa_signal_destroy_fn (completion_signal);
4291
4292 unlock:
4293   status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
4294   if (status != HSA_STATUS_SUCCESS)
4295     hsa_fatal ("Could not unlock host memory", status);
4296
4297   return retval;
4298 }
4299
4300 /* }}}  */
4301 /* {{{ OpenMP Plugin API  */
4302
4303 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
4304    in VARS as a parameter.  The kernel is identified by FN_PTR which must point
4305    to a kernel_info structure, and must have previously been loaded to the
4306    specified device.  */
4307
4308 void
4309 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
4310 {
4311   struct agent_info *agent = get_agent_info (device);
4312   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4313   struct GOMP_kernel_launch_attributes def;
4314   struct GOMP_kernel_launch_attributes *kla;
4315   assert (agent == kernel->agent);
4316
4317   /* If we get here then the kernel must be OpenMP.  */
4318   kernel->kind = KIND_OPENMP;
4319
4320   if (!parse_target_attributes (args, &def, &kla, agent))
4321     {
4322       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4323       return;
4324     }
4325   run_kernel (kernel, vars, kla, NULL, false);
4326 }
4327
4328 /* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
4329    GOMP_OFFLOAD_run except that the launch is queued and there is a call to
4330    GOMP_PLUGIN_target_task_completion when it has finished.  */
4331
4332 void
4333 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
4334                         void **args, void *async_data)
4335 {
4336   GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
4337   struct agent_info *agent = get_agent_info (device);
4338   struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
4339   struct GOMP_kernel_launch_attributes def;
4340   struct GOMP_kernel_launch_attributes *kla;
4341   assert (agent == kernel->agent);
4342
4343   /* If we get here then the kernel must be OpenMP.  */
4344   kernel->kind = KIND_OPENMP;
4345
4346   if (!parse_target_attributes (args, &def, &kla, agent))
4347     {
4348       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4349       return;
4350     }
4351
4352   maybe_init_omp_async (agent);
4353   queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
4354   queue_push_callback (agent->omp_async_queue,
4355                        GOMP_PLUGIN_target_task_completion, async_data);
4356 }
4357
4358 /* }}} */
4359 /* {{{ OpenACC Plugin API  */
4360
4361 /* Run a synchronous OpenACC kernel.  The device number is inferred from the
4362    already-loaded KERNEL.  */
4363
4364 void
4365 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
4366                            size_t mapnum __attribute__((unused)),
4367                            void **hostaddrs __attribute__((unused)),
4368                            void **devaddrs, unsigned *dims,
4369                            void *targ_mem_desc)
4370 {
4371   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4372
4373   gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
4374 }
4375
4376 /* Run an asynchronous OpenACC kernel on the specified queue.  */
4377
4378 void
4379 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
4380                                  size_t mapnum __attribute__((unused)),
4381                                  void **hostaddrs __attribute__((unused)),
4382                                  void **devaddrs,
4383                                  unsigned *dims, void *targ_mem_desc,
4384                                  struct goacc_asyncqueue *aq)
4385 {
4386   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4387
4388   gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
4389 }
4390
4391 /* Create a new asynchronous thread and queue for running future kernels;
4392    issues a fatal error if the queue cannot be created as all callers expect
4393    that the queue exists.  */
4394
4395 struct goacc_asyncqueue *
4396 GOMP_OFFLOAD_openacc_async_construct (int device)
4397 {
4398   struct agent_info *agent = get_agent_info (device);
4399
4400   pthread_mutex_lock (&agent->async_queues_mutex);
4401
4402   struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
4403   aq->agent = get_agent_info (device);
4404   aq->prev = NULL;
4405   aq->next = agent->async_queues;
4406   if (aq->next)
4407     {
4408       aq->next->prev = aq;
4409       aq->id = aq->next->id + 1;
4410     }
4411   else
4412     aq->id = 1;
4413   agent->async_queues = aq;
4414
4415   aq->queue_first = 0;
4416   aq->queue_n = 0;
4417   aq->drain_queue_stop = 0;
4418
4419   if (pthread_mutex_init (&aq->mutex, NULL))
4420     {
4421       GOMP_PLUGIN_fatal ("Failed to initialize a GCN agent queue mutex");
4422       return NULL;
4423     }
4424   if (pthread_cond_init (&aq->queue_cond_in, NULL))
4425     {
4426       GOMP_PLUGIN_fatal ("Failed to initialize a GCN agent queue cond");
4427       return NULL;
4428     }
4429   if (pthread_cond_init (&aq->queue_cond_out, NULL))
4430     {
4431       GOMP_PLUGIN_fatal ("Failed to initialize a GCN agent queue cond");
4432       return NULL;
4433     }
4434
4435   hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
4436                                                      ASYNC_QUEUE_SIZE,
4437                                                      HSA_QUEUE_TYPE_MULTI,
4438                                                      hsa_queue_callback, NULL,
4439                                                      UINT32_MAX, UINT32_MAX,
4440                                                      &aq->hsa_queue);
4441   if (status != HSA_STATUS_SUCCESS)
4442     hsa_fatal ("Error creating command queue", status);
4443
4444   int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
4445   if (err != 0)
4446     GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4447                        strerror (err));
4448   GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
4449              aq->id);
4450
4451   pthread_mutex_unlock (&agent->async_queues_mutex);
4452
4453   return aq;
4454 }
4455
4456 /* Destroy an existing asynchronous thread and queue.  Waits for any
4457    currently-running task to complete, but cancels any queued tasks.  */
4458
4459 bool
4460 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
4461 {
4462   struct agent_info *agent = aq->agent;
4463
4464   finalize_async_thread (aq);
4465
4466   pthread_mutex_lock (&agent->async_queues_mutex);
4467
4468   int err;
4469   if ((err = pthread_mutex_destroy (&aq->mutex)))
4470     {
4471       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
4472       goto fail;
4473     }
4474   if (pthread_cond_destroy (&aq->queue_cond_in))
4475     {
4476       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4477       goto fail;
4478     }
4479   if (pthread_cond_destroy (&aq->queue_cond_out))
4480     {
4481       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4482       goto fail;
4483     }
4484   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
4485   if (status != HSA_STATUS_SUCCESS)
4486     {
4487       hsa_error ("Error destroying command queue", status);
4488       goto fail;
4489     }
4490
4491   if (aq->prev)
4492     aq->prev->next = aq->next;
4493   if (aq->next)
4494     aq->next->prev = aq->prev;
4495   if (agent->async_queues == aq)
4496     agent->async_queues = aq->next;
4497
4498   GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
4499
4500   free (aq);
4501   pthread_mutex_unlock (&agent->async_queues_mutex);
4502   return true;
4503
4504 fail:
4505   pthread_mutex_unlock (&agent->async_queues_mutex);
4506   return false;
4507 }
4508
4509 /* Return true if the specified async queue is currently empty.  */
4510
4511 int
4512 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
4513 {
4514   return queue_empty (aq);
4515 }
4516
4517 /* Block until the specified queue has executed all its tasks and the
4518    queue is empty.  */
4519
4520 bool
4521 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
4522 {
4523   wait_queue (aq);
4524   return true;
4525 }
4526
4527 /* Add a serialization point across two async queues. Any new tasks added to
4528    AQ2, after this call, will not run until all tasks on AQ1, at the time
4529    of this call, have completed.  */
4530
4531 bool
4532 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
4533                                       struct goacc_asyncqueue *aq2)
4534 {
4535   /* For serialize, stream aq2 waits for aq1 to complete work that has been
4536      scheduled to run on it up to this point.  */
4537   if (aq1 != aq2)
4538     {
4539       struct placeholder *placeholderp = queue_push_placeholder (aq1);
4540       queue_push_asyncwait (aq2, placeholderp);
4541     }
4542   return true;
4543 }
4544
4545 /* Add an opaque callback to the given async queue.  */
4546
4547 void
4548 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
4549                                            void (*fn) (void *), void *data)
4550 {
4551   queue_push_callback (aq, fn, data);
4552 }
4553
4554 /* Queue up an asynchronous data copy from host to DEVICE.  */
4555
4556 bool
4557 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4558                                      size_t n, struct goacc_asyncqueue *aq)
4559 {
4560   struct agent_info *agent = get_agent_info (device);
4561   assert (agent == aq->agent);
4562   queue_push_copy (aq, dst, src, n);
4563   return true;
4564 }
4565
4566 /* Queue up an asynchronous data copy from DEVICE to host.  */
4567
4568 bool
4569 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4570                                      size_t n, struct goacc_asyncqueue *aq)
4571 {
4572   struct agent_info *agent = get_agent_info (device);
4573   assert (agent == aq->agent);
4574   queue_push_copy (aq, dst, src, n);
4575   return true;
4576 }
4577
4578 union goacc_property_value
4579 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4580 {
4581   struct agent_info *agent = get_agent_info (device);
4582
4583   union goacc_property_value propval = { .val = 0 };
4584
4585   switch (prop)
4586     {
4587     case GOACC_PROPERTY_FREE_MEMORY:
4588       /* Not supported. */
4589       break;
4590     case GOACC_PROPERTY_MEMORY:
4591       {
4592         size_t size;
4593         hsa_region_t region = agent->data_region;
4594         hsa_status_t status =
4595           hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4596         if (status == HSA_STATUS_SUCCESS)
4597           propval.val = size;
4598         break;
4599       }
4600     case GOACC_PROPERTY_NAME:
4601       propval.ptr = agent->name;
4602       break;
4603     case GOACC_PROPERTY_VENDOR:
4604       propval.ptr = agent->vendor_name;
4605       break;
4606     case GOACC_PROPERTY_DRIVER:
4607       propval.ptr = hsa_context.driver_version_s;
4608       break;
4609     }
4610
4611   return propval;
4612 }
4613
4614 /* Set up plugin-specific thread-local-data (host-side).  */
4615
4616 void *
4617 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4618 {
4619   struct gcn_thread *thread_data
4620     = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4621
4622   thread_data->async = GOMP_ASYNC_SYNC;
4623
4624   return (void *) thread_data;
4625 }
4626
4627 /* Clean up plugin-specific thread-local-data.  */
4628
4629 void
4630 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
4631 {
4632   free (data);
4633 }
4634
4635 /* }}} */